def loadOrgData():
    # Lazily load the full-resolution data into module-level globals.
    global x_org, x_t_org, d_org
    if x_org == []:
        x_org = loadData(scale=1, train=True)
        x_t_org = loadData(scale=1, train=False)
        d_org = len(x_org[0, :])
        reset = False  # note: assigned locally; not declared global above
    else:
        print "Data already loaded."
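# A minimal sketch of the module-level state loadOrgData expects; these
# initial values are an assumption (they are not shown in the source), chosen
# so the `x_org == []` check above triggers on the first call.
x_org = []    # full-resolution training data, filled on first call
x_t_org = []  # full-resolution test data
d_org = 0     # number of features per training sample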
def doStuff(name, scale=1, P=100):
    y = y_org

    # Step 0: load data (zoomed or not)
    print("Reading training data.")
    x = loadData(scale=scale, train=True)
    print("Reading test data.")
    x_t = loadData(scale=scale, train=False)
    nTest = len(x_t)
    d = len(x[0, :])
    d2 = d / P

    # Step 1: divide data set into P subsets (TODO: boosting?)
    # we use P estimators
    sum_preds = np.zeros(nTest)
    sum_r = 0
    for i in range(0, P):
        if i % 5 == 0:
            print("Make prediction for subset/estimator %s..." % (i + 1))
        # minI/maxI are computed but unused; the strided slice below selects
        # every P-th feature column for this estimator
        minI = d2 * i
        maxI = min(d2 * (i + 1), d) - 1
        x_fi = x[:, i::P]
        x_t_fi = x_t[:, i::P]
        # Step 2: apply each estimator on data matrix x_fi and age vector y
        y_pred_i, r = fi_prediction(x_fi, x_t_fi, y)
        # insert prediction for test data i here
        sum_preds = sum_preds + y_pred_i
        sum_r = sum_r + r

    # Step 3: combine all estimates
    y_t_pred = [i / float(P) for i in sum_preds]
    r = sum_r / float(P)

    # Step 4: post-process and save prediction
    # TODO: calculate and save sum of covariances
    prefix = "%s_CV_P%s_zoom%sFULL" % (name, P, 1 / float(scale))
    prep = lambda i: int(i)
    y_t_pp = [prep(i) for i in y_t_pred]
    savedFilename = saveCSV(y_t_pp, prefix)
    print("Saved predictions into %s" % savedFilename)

    # Step 5: make histogram plot of age
    # (because there is no visualization for a flat data matrix...)
    plt.hist(y, color="black", rwidth=0.7)
    #plt.hist(y_pred, color="darkgreen", rwidth=0.5)
    plt.hist(y_t_pp, color="darkblue", rwidth=0.5)
    plt.legend(
        ["ages given for X", "ages predicted for X", "ages predicted for X_t"])
    savedPlotFname = prefix + ".png"
    plt.savefig(savedPlotFname)
    print("Saved age diagram in %s" % savedPlotFname)
    plt.clf()
    print("Average of coefficients: %s" % r)

    # returns a collection of results
    return (x, y, x_t, y_t_pred, y_t_pp, r)
def doStuff(name, alpha=77, scale=1, P=100):
    y = y_org

    # Step 0: load data (zoomed or not)
    print("Reading training data.")
    #x = sharedmem.empty(n_max)
    x = loadData(scale=scale, train=True)
    print("Reading test data.")
    #x = sharedmem.empty(n_test_max)
    x_t = loadData(scale=scale, train=False)
    nTest = len(x_t)
    print x
    d = len(x[0, :])
    d2 = d / P

    # Step 1: divide data set into P subsets (TODO: boosting?)
    # we use P estimators
    # results = pool.map(each_elem, range(0, P))
    # A plain list is not shared across processes, so the workers could never
    # write their results back; a Manager list is shared
    # (requires Manager from multiprocessing alongside Process).
    results = Manager().list(range(0, P))
    processes = [
        Process(target=each_elem, args=(i, x, x_t, results))
        for i in range(0, P)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # Step 3: combine all estimates
    y_t_pred = reduce(lambda a, x: a + x[0], results, 0) / float(P)
    r = reduce(lambda a, x: a + x[1], results, 0) / float(P)

    # Step 4: post-process and save prediction
    # TODO: calculate and save sum of covariances
    prefix = "%s_alpha%s_P%s_zoom%sFULL" % (name, alpha, P, 1 / float(scale))
    prep = lambda i: int(i)
    y_t_pp = [prep(i) for i in y_t_pred]
    savedFilename = saveCSV(y_t_pp, prefix)
    print("Saved predictions into %s" % savedFilename)

    # Step 5: make histogram plot of age
    # (because there is no visualization for a flat data matrix...)
    plt.hist(y, color="black", rwidth=0.7)
    #plt.hist(y_pred, color="darkgreen", rwidth=0.5)
    plt.hist(y_t_pp, color="darkblue", rwidth=0.5)
    plt.legend(
        ["ages given for X", "ages predicted for X", "ages predicted for X_t"])
    savedPlotFname = prefix + ".png"
    plt.savefig(savedPlotFname)
    print("Saved age diagram in %s" % savedPlotFname)
    plt.clf()

    # returns a collection of results
    return (x, y, x_t, y_t_pred, y_t_pp)
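# each_elem is not shown in this excerpt. Below is a hypothetical sketch of a
# worker compatible with the Process call above: one estimator per process,
# applied to every P-th feature column, writing (prediction, coefficient)
# back into the shared results list. It assumes P, y_org and fi_prediction
# are module globals, as in the serial doStuff variant.
def each_elem(i, x, x_t, results):
    y_pred_i, r = fi_prediction(x[:, i::P], x_t[:, i::P], y_org)
    results[i] = (y_pred_i, r)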
def train_model():
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train = loadData("./data/split_data/train.csv")
        for _ in range(epochs):
            for idx, row in train.iterrows():
                if not row["content"]:
                    continue
                feed_dict, labels = get_feed_dict(row)
                predicted, currentLoss = sess.run([logits, train_loss],
                                                  feed_dict=feed_dict)
                # Print the first 50 characters of the content with the loss
                print(row["content"][:50], " - Loss:", currentLoss)
                sess.run(update_step, feed_dict=feed_dict)
        print("\nModel trained!")
        save_path = saver.save(sess, "./model_dir/model1/model.ckpt")
        print("Model saved in path: %s" % save_path)
def tf_idf_advanced(comments):
    commentList = list2str(comments)
    print('Loading Vectorizer')
    if os.path.exists('models/vectorizer_imdb_tfidf_advanced.pkl'):
        with open('models/vectorizer_imdb_tfidf_advanced.pkl', 'rb') as fw:
            Vectorizer = pickle.load(fw)
    else:
        print('reading data')
        filename = 'dataset/aclImdb/train/pos'
        filename1 = 'dataset/aclImdb/train/neg'
        trainComments, labels = pre.loadData(filename, filename1)
        trainCommentList = list2str(trainComments)
        Vectorizer = TfidfVectorizer(max_features=10000,
                                     input='content',
                                     analyzer=stemmed_words,
                                     stop_words='english',
                                     encoding='utf-8',
                                     decode_error='ignore',
                                     lowercase=True,
                                     ngram_range=(1, 3))
        # fit only; the transformed training matrix is not needed here
        Vectorizer.fit_transform(trainCommentList)
        with open('models/vectorizer_imdb_tfidf_advanced.pkl', 'wb') as fw:
            pickle.dump(Vectorizer, fw)
    print('Vectorizing comments')
    return Vectorizer.transform(commentList)
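# A minimal usage sketch (assumption: comments come from pre.loadData, as in
# the training branch above, and list2str turns them back into raw strings):
testComments, testLabels = pre.loadData('dataset/aclImdb/test/pos',
                                        'dataset/aclImdb/test/neg')
X_test = tf_idf_advanced(testComments)  # sparse matrix, shape (n_docs, 10000)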
def tsne_impl():
    from sklearn.manifold import TSNE
    X, y = preprocess.loadData()
    X_embedded = TSNE(n_components=2).fit_transform(X)
    print(X_embedded.shape)
    colors = np.random.rand(X_embedded.shape[0])
    scored_indices = y == 1
    not_scored_indices = y == 0
    fig, ax = plt.subplots()
    ax.scatter(X_embedded[scored_indices, 0],
               X_embedded[scored_indices, 1],
               c='red',
               label='Scored',
               marker='*')
    ax.scatter(X_embedded[not_scored_indices, 0],
               X_embedded[not_scored_indices, 1],
               c='blue',
               label='Not scored',
               marker='+')
    ax.legend()
    plt.show()
def run():
    for gap in xrange(7):
        expansion = False
        data, comm = preprocess.loadData(gap, expansion)
        train_data, test_data = preprocess.split(data, comm)
        train = np.array(train_data[0])
        test = np.array(test_data[0])
        test_com = test_data[1]
        train_x = train[:, 1:]
        train_y = train[:, 0]
        print train_x.shape
        reg = gcv(train_x, train_y)
        if len(sys.argv) > 1 and sys.argv[1] == 'output':
            test_x = test[:, 1:]
            pred = reg.predict(test_x)
            output_test(pred, test_com, gap)
def estimate_savings(model=None):
    """Calculate estimated savings."""
    # Iterate over (airline, flight_path, days_to_depart) groups and label
    # each row with its saving relative to the running maximum price.
    pivot_frame = preprocess.loadData(file_name="pivot_data_new.csv")
    temp_group = pivot_frame.groupby(
        ['airline', 'flight_path', 'days_to_depart'])
    max_price = 0
    i = 0
    for indexes, res in tqdm(temp_group):
        if i % 36 == 0:
            # reset the running maximum every 36 groups (presumably one block
            # of 36 days_to_depart values per airline/flight_path pair)
            max_price = 0
        i += 1
        m = res["mean"].iloc[0]
        max_price = max(max_price, m)
        pivot_frame.loc[(pivot_frame['airline'] == indexes[0])
                        & (pivot_frame['flight_path'] == indexes[1])
                        & (pivot_frame['days_to_depart'] == indexes[2]),
                        'delta'] = max_price - m
    print("Total Savings =", pivot_frame['delta'].sum())
    print("Average Savings =", pivot_frame['delta'].mean())
def PCA():
    # the function name shadows sklearn's PCA class, so the import is kept
    # local to this function's scope
    from sklearn.decomposition import PCA
    X, y = preprocess.loadData()
    pca = PCA(n_components=2, svd_solver='full')
    X_embedded = pca.fit_transform(X, y)
    print(X.shape)
    print(X_embedded.shape)
    colors = np.random.rand(X_embedded.shape[0])
    scored_indices = y == 1
    not_scored_indices = y == 0
    fig, ax = plt.subplots()
    ax.scatter(X_embedded[scored_indices, 0],
               X_embedded[scored_indices, 1],
               c='red',
               label='Scored',
               marker='*')
    ax.scatter(X_embedded[not_scored_indices, 0],
               X_embedded[not_scored_indices, 1],
               c='blue',
               label='Not scored',
               marker='+')
    ax.legend()
    plt.show()
def bagging(file_path, output_path, model_type, begin_day, end_day,
            expansion=False, save_model=True, output=True, online=False,
            steps=1000, train_set="base", valid_set="base", test_set="base",
            cv=4, shuffle=False, weight=[50, 15, 15, 5], cla=0):
    final_lis = []
    for gap in range(begin_day, end_day):
        print "*" * 20, gap, "*" * 20
        print "Gap %d Start Time:" % gap, time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        data, comm = preprocess.loadData(gap, expansion=expansion)
        #train, test = preprocess.split(data, comm)
        train, valid, test_online = preprocess.split_lastweek(data, comm, cla)
        train_data = np.array(train[0])
        valid_data = np.array(valid[0])
        test_data = np.array(test_online[0])
        train_x = train_data[:, 1:]
        train_y = train_data[:, 0]
        valid_x = valid_data[:, 1:]
        valid_y = valid_data[:, 0]
        test_x = test_data[:, 1:]
        # NaN-filled copies for models that cannot handle missing values
        valid_x_nonan = valid_x.copy()
        valid_x_nonan[np.isnan(valid_x_nonan)] = -1
        test_x_nonan = test_x.copy()
        test_x_nonan[np.isnan(test_x_nonan)] = -1
        if shuffle:
            idx = np.random.permutation(train_y.size)
            train_x = train_x[idx]
            train_y = train_y[idx]
        #train_len = train_data.shape[0]
        #win_size = train_len / cv + 1
        #print >> sys.stderr, train_len
        #print >> sys.stderr, win_size
        clfs = []

        # LightGBM: pools of hyper-parameter values for random search
        application = ["regression_l2"] * 2 + ["huber"] * 2 + ["fair"] * 6
        boosting = ["dart"] * 1 + ["gbrt"] * 11
        learning_rate = [0.015, 0.02, 0.03, 0.04]
        #metric = ["l1"] + ["l2"] + ["huber"] * 4 + ["fair"] * 3
        num_leaves = [32] * 1 + [64] * 2 + [128] * 2 + [256]
        feature_fraction = [0.5, 0.6, 0.7, 0.8, 0.9]
        bagging_fraction = [0.5, 0.6, 0.7, 0.8, 0.9]
        lambda_l1 = [0.5, 0.6, 0.7, 0.8, 0.9]
        lambda_l2 = [0.5, 0.6, 0.7, 0.8, 0.9]
        drop_rate = [0.3, 0.5, 0.7, 0.9]
        skip_drop = [0.3, 0.5, 0.7, 0.9]
        huber_delta = [0.6, 0.8, 0.9]
        fair_c = [0.6, 0.8, 0.9]
        max_bin = range(200, 400)
        feature_fraction_seed = range(1, 20)
        bagging_seed = range(1, 20)
        drop_seed = range(1, 20)
        for i in range(0, weight[1]):
            dic = {
                "application": random.choice(application),
                "boosting": random.choice(boosting),
                "learning_rate": random.choice(learning_rate),
                "num_leaves": random.choice(num_leaves),
                "feature_fraction": random.choice(feature_fraction),
                "bagging_fraction": random.choice(bagging_fraction),
                "lambda_l1": random.choice(lambda_l1),
                "lambda_l2": random.choice(lambda_l2),
                "drop_rate": random.choice(drop_rate),
                "skip_drop": random.choice(skip_drop),
                "max_bin": random.choice(max_bin),
                "huber_delta": random.choice(huber_delta),
                "fair_c": random.choice(fair_c),
                "feature_fraction_seed": random.choice(feature_fraction_seed),
                "bagging_seed": random.choice(bagging_seed),
                "drop_seed": random.choice(drop_seed)
            }
            clfs.append((fitLGBModel, dic))

        # XGBoost hyper-parameter pools
        obj = [myObjective6] * 10 + ["reg:linear"]
        learning_rate = [0.015, 0.02, 0.03, 0.04]
        seed = range(1, 20)
        max_depth = range(5, 11)
        subsample = [0.5, 0.6, 0.7, 0.8, 0.9]
        colsample_bytree = [0.5, 0.6, 0.7, 0.8, 0.9]
        colsample_bylevel = [0.5, 0.6, 0.7, 0.8, 0.9]
        gamma = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
        for i in range(0, weight[0]):
            dic = {
                "objective": random.choice(obj),
                "learning_rate": random.choice(learning_rate),
                "seed": random.choice(seed),
                "max_depth": random.choice(max_depth),
                "subsample": random.choice(subsample),
                "colsample_bytree": random.choice(colsample_bytree),
                "colsample_bylevel": random.choice(colsample_bylevel),
                "gamma": random.choice(gamma)
            }
            clfs.append((fitXGBModel, dic))

        # Random forest hyper-parameter pools
        max_features = [0.5, 0.6, 0.65, 0.7, 0.8]
        max_depth = range(5, 11)
        min_samples_leaf = [2, 10, 30, 50]
        random_state = range(0, 8)
        for i in range(0, weight[2]):
            dic = {
                "max_features": random.choice(max_features),
                "max_depth": random.choice(max_depth),
                "min_samples_leaf": random.choice(min_samples_leaf),
                "random_state": random.choice(random_state)
            }
            clfs.append((fitRFModel, dic))

        # GBRT hyper-parameter pools
        loss = ["lad", "huber"]
        learning_rate = [0.01, 0.02, 0.03, 0.04]
        max_features = [0.5, 0.6, 0.65, 0.7, 0.8]
        max_depth = range(5, 11)
        subsample = [0.5, 0.6, 0.7, 0.8]
        random_state = range(0, 8)
        for i in range(0, weight[3]):
            dic = {
                "loss": random.choice(loss),
                "learning_rate": random.choice(learning_rate),
                "max_features": random.choice(max_features),
                "max_depth": random.choice(max_depth),
                "subsample": random.choice(subsample),
                "random_state": random.choice(random_state)
            }
            clfs.append((fitGBRTModel, dic))

        #stage2_train = np.zeros((train_x.shape[0], len(clfs)))
        stage2_valid = np.zeros((valid_x.shape[0], len(clfs)))
        stage2_test = np.zeros((test_x.shape[0], len(clfs)))
        # one bootstrap sample of training indices per model
        # (note: this array shadows the enclosing function name)
        bagging = np.random.randint(0, train_x.shape[0],
                                    size=(len(clfs), train_x.shape[0]))
        for idx, clf in enumerate(clfs):
            print "Gap", gap, "Model", idx
            print clf[1]
            print "Start Time:", time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            #skf = list(StratifiedKFold(train_y, cv, shuffle=True, random_state=idx))
            #stage2_valid_temp = np.zeros((valid_x.shape[0], len(skf)))
            #stage2_test_temp = np.zeros((test_x.shape[0], len(skf)))
            #train = train_x[bagging[idx]]
            #test = test_x[bagging[idx]]
            #for i, (train, test) in enumerate(skf):
            #print "Fold:", i, "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            X_train = train_x[bagging[idx]]
            y_train = train_y[bagging[idx]]
            print X_train.shape
            print y_train.shape
            #X_test = train_x[test]
            #y_test = train_y[test]
            if clf[0] in [
                    fitLRModel, fitRidgeModel, fitLinearSVRModel,
                    fitGBRTModel, fitRFModel
            ]:
                X_train[np.isnan(X_train)] = -1
                valid_x_tmp = valid_x_nonan
                test_x_tmp = test_x_nonan
            else:
                valid_x_tmp = valid_x
                test_x_tmp = test_x
            # pass the NaN-filled validation copy so models that cannot
            # handle NaNs are evaluated on the same data used by the
            # predict calls below
            reg, eva = clf[0](X_train, y_train, valid_x_tmp, valid_y, clf[1])
            if save_model:
                joblib.dump(reg, "%s/%d_%d.m" % (file_path, gap, idx))
            #stage2_train[test, idx] = reg.predict(X_test)
            #stage2_valid_temp[:, idx] = reg.predict(valid_x)
            #stage2_test_temp[:, i] = reg.predict(test_x_tmp)
            stage2_valid[:, idx] = reg.predict(valid_x_tmp)
            stage2_test[:, idx] = reg.predict(test_x_tmp)
            print "Gap ", gap, "Model %d: %.6f" % (
                idx, evaluate(reg.predict(valid_x_tmp), valid_y))
        if gap == -1:
            # reload previously saved models instead of retraining
            for idx in range(len(clfs)):
                if clfs[idx][0] in [
                        fitLRModel, fitRidgeModel, fitLinearSVRModel,
                        fitGBRTModel, fitRFModel
                ]:
                    valid_x_tmp = valid_x_nonan
                    test_x_tmp = test_x_nonan
                else:
                    valid_x_tmp = valid_x
                    test_x_tmp = test_x
                reg = joblib.load("%s/%d_%d.m" % (file_path, gap, idx))
                stage2_valid[:, idx] = reg.predict(valid_x_tmp)
                stage2_test[:, idx] = reg.predict(test_x_tmp)
        #print "Final LR:", "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        #reg1, eva1 = fitLRModel(stage2_train, train_y, stage2_valid, valid_y, {})
        #print "Final LR Eval", eva1
        #print "Final Ridge:", "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        #reg2, eva2 = fitRidgeModel(stage2_train, train_y, stage2_valid, valid_y, {})
        #print "Final Ridge Eval", eva2
        #print "Final XGB:", "Start Time:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        #reg3, eva3 = fitXGBModel(stage2_train, train_y, stage2_valid, valid_y,
        #    {'max_depth': 2, 'learning_rate': 0.03, 'n_estimators': 1000, 'seed': 5,
        #     'gamma': 0.9, 'subsample': 1.0, 'colsample_bytree': 1.0, 'colsample_bylevel': 1.0})
        #print "Final XGB Eval", eva3
        eva_avg = evaluate(stage2_valid.mean(1), valid_y)
        eva_med = evaluate(np.median(stage2_valid, axis=1), valid_y)
        print "Final AVG Eval", eva_avg
        print "Final MED Eval", eva_med
        final_lis.append(np.min([eva_avg, eva_med]))
        #print "Best Eval", np.min([eva1, eva2, eva3])
        #final_lis.append(np.min([eva1, eva2, eva3]))
        #if save_model:
        #    joblib.dump(reg1, "%s/stage2_avg_%d.m" % (file_path, gap))
        #    joblib.dump(reg3, "%s/stage2_med_%d.m" % (file_path, gap))
        if output:
            #output_test(stage2_test.mean(1), test_online[1], gap, "avg", eva_avg, output_path)
            output_test(np.median(stage2_test, axis=1), test_online[1], gap,
                        "median", eva_med, output_path)
        print "Gap %d End Time:" % gap, time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print "Week 1 Eval:", np.average(final_lis)
def run():
    err = []
    #for gap in xrange(14):
    for i in xrange(7):
        gap = i
        expansion = False
        data, comm = preprocess.loadData(gap, expansion)
        eps = []
        for t in [1, 2]:
            train_data, valid_data, test_data = preprocess.split_lastweek(
                data, comm, t)
            train = np.array(train_data[0])
            valid = np.array(valid_data[0])
            test = np.array(test_data[0])
            test_com = test_data[1]
            train_x = train[:, 1:]
            train_y = train[:, 0]
            train_x[np.isnan(train_x)] = -1
            valid_x = valid[:, 1:]
            valid_x[np.isnan(valid_x)] = -1
            valid_y = valid[:, 0]
            print train_x.shape
            print valid_x.shape
            regs = []
            preds = []
            for seed in [1, 2, 3, 5, 7, 9, 11, 15, 20, 25, 30, 40, 80]:
                #for seed in [1]:
                reg, e, ep, pred = fitRFModel(train_x, train_y, valid_x,
                                              valid_y, seed, gap)
                #eps += list(ep)
                regs.append(reg)
                preds.append(pred)
                #pred = map(lambda x: int(max(0, x)), pred)
                #er = np.abs(pred) - valid_y
                #err_p = np.abs(er) / (np.abs(pred) + valid_y)
                #eps += list(err_p)
            # ensemble across seeds via the element-wise median
            pred = np.median(preds, axis=0)
            er = np.abs(pred) - valid_y
            err_p = np.abs(er) / (np.abs(pred) + valid_y)
            eps += list(err_p)
            '''
            with open('%d_seed%d_gap%d_valid_with_svd' % (t, seed, gap), 'w') as fout:
                for com, e, t, p in zip(valid_data[1], ep, valid_y, pred):
                    fout.write("%f\t%f\t%d\t%s\t%s\n" % (e, p, t, com[0], com[1].strftime("%Y-%m-%d")))
            '''
            #print train_x.shape
            #reg = gcv(train_x, train_y)
            #pred = reg.predict(valid_x)
            #e = evaluate(pred, valid_y)
            #print e
            if len(sys.argv) > 1 and sys.argv[1] == 'output':
                test_x = test[:, 1:]
                test_x[np.isnan(test_x)] = -1
                preds = []
                for reg in regs:
                    pred = reg.predict(test_x)
                    preds.append(pred)
                pred = np.median(preds, axis=0)
                output_test(pred, test_com, gap, t)
        e = np.mean(eps)
        print e
        err.append(e)
    print np.mean(err)
if __name__ == '__main__':
    data_params = {
        'reload': False,  # when True, parse the time-domain raw data again; use when the data changes
        'max_items_per_scan': 2,  # maximum number of items in a scan
        'train_test_split': 0.7,  # size of training data
        'only_max': False,
        'saved_path': "../new_res/*.json",
        'use_backproj': True  # set to False to use the clean signal instead of backprojection
    }
    # reload_data()
    trainX_, testX_, trainY_, testY_ = loadData(**data_params)
    trainX, trainY = processData(
        [trainX_, trainY_], commands=["crop", "transpose", "flip_x", "flip_y"])
    testX, testY = processData([testX_, testY_], commands=["crop"])
    # trainX, trainY = processData([trainX_, trainY_], commands=["crop"])
    # testX, testY = processData([testX_, testY_], commands=["crop"])
    N = len(trainX)
    idx = np.arange(N)
    np.random.seed(5)
    np.random.shuffle(idx)
    trainX, trainY = trainX[idx], trainY[idx]
    # combinedX = np.concatenate((trainX, testX), axis=0)
    # combinedY = np.concatenate((trainY, testY), axis=0)  # (34, 40, 20, 21, 5)
            color = color_map[j][0]
            with tag('span',
                     style=f'background: rgba({color[2]}, {color[1]}, {color[0]}, {heatmap[j]});',
                     title=int(X_test[i, j])):
                text(word + ' ')
        with tag('p'):
            text(f'Pred: {Y[i]}, Label: {T[i]}')
        doc.stag('hr')
    with open(out_dir / 'out.html', 'w') as f:
        f.write(doc.getvalue())


if __name__ == "__main__":
    init_logger()
    data_train, _ = loadData('train.csv', ('title', 'text'),
                             vocab_size=_vocab_size)
    X_train, T_train, X_test, T_test = get_train_test_set(data_train)
    logging.info(f'X_train {X_train.shape}, T_train {T_train.shape}, '
                 f'X_test {X_test.shape}, T_test {T_test.shape}')
    T_train = np.hstack((T_train.reshape(-1, 1), (1 - T_train).reshape(-1, 1)))
    T_test = np.hstack((T_test.reshape(-1, 1), (1 - T_test).reshape(-1, 1)))
    # check from the cache
    # test with title first
    if os.path.exists(cache_dir / CNN._cache_path):
        cnn = CNN.from_cache()
    else:
        cnn = CNN(X_train.shape[1], vocab_size=_vocab_size)
def tf_idf_2doc(comments, labels, feat=10000):
    commentList = list2str(comments)
    # shuffle comments and labels together with a fixed seed
    rng = np.random.RandomState(seed=3)
    indices = np.arange(len(commentList))
    rng.shuffle(indices)
    commentarray = np.array(commentList)
    labelarray = np.array(labels)
    commentList = commentarray[indices]
    labels = labelarray[indices]
    commentList = commentList.tolist()
    labels = labels.tolist()
    print('Loading Vectorizer')
    if feat == 10000:
        if os.path.exists('models/vectorizer_imdb_tfidf_2doc.pkl'):
            with open('models/vectorizer_imdb_tfidf_2doc.pkl', 'rb') as fw:
                Vectorizer = pickle.load(fw)
        else:
            print('reading data')
            filename = 'dataset/aclImdb/train/pos'
            filename1 = 'dataset/aclImdb/train/neg'
            # use a separate name for the training labels so the shuffled
            # labels returned below are not overwritten
            trainComments, trainLabels = pre.loadData(filename, filename1)
            trainComments = list2str(trainComments)
            # two "documents": all positive reviews vs. all negative reviews
            # (note: these slices drop items 12499 and 24999; 0:12500 and
            # 12500:25000 may be intended)
            trainCommentList = [
                list_to_str(trainComments[0:12499]),
                list_to_str(trainComments[12500:24999])
            ]
            Vectorizer = TfidfVectorizer(max_features=10000,
                                         input='content',
                                         analyzer=stemmed_words,
                                         stop_words='english',
                                         encoding='utf-8',
                                         decode_error='ignore',
                                         lowercase=True,
                                         ngram_range=(1, 3))
            Vectorizer.fit_transform(trainCommentList)
            with open('models/vectorizer_imdb_tfidf_2doc.pkl', 'wb') as fw:
                pickle.dump(Vectorizer, fw)
    if feat == 3000:
        if os.path.exists('models/vectorizer_imdb_tfidf_2doc_feat3000.pkl'):
            with open('models/vectorizer_imdb_tfidf_2doc_feat3000.pkl',
                      'rb') as fw:
                Vectorizer = pickle.load(fw)
        else:
            print('reading data')
            filename = 'dataset/aclImdb/train/pos'
            filename1 = 'dataset/aclImdb/train/neg'
            trainComments, trainLabels = pre.loadData(filename, filename1)
            trainComments = list2str(trainComments)
            trainCommentList = [
                list_to_str(trainComments[0:12499]),
                list_to_str(trainComments[12500:24999])
            ]
            Vectorizer = TfidfVectorizer(max_features=3000,
                                         input='content',
                                         analyzer=stemmed_words,
                                         stop_words='english',
                                         encoding='utf-8',
                                         decode_error='ignore',
                                         lowercase=True,
                                         ngram_range=(1, 3))
            Vectorizer.fit_transform(trainCommentList)
            with open('models/vectorizer_imdb_tfidf_2doc_feat3000.pkl',
                      'wb') as fw:
                pickle.dump(Vectorizer, fw)
    print('Vectorizing comments')
    return Vectorizer.transform(commentList), labels
import preprocess as pre
import numpy as np
import matplotlib.pyplot as plt
import gc
import InitializeModel as im
import tf_idf as tfidf
import scipy.sparse as sp
import pickle

'''
print('Loading model')
model = keras.models.load_model('models/simple_model.h5')
'''

print('reading IMDB_data')
(train_data, train_labels) = pre.loadData('dataset/aclImdb/train/pos',
                                          'dataset/aclImdb/train/neg')
(test_data, test_labels) = pre.loadData('dataset/aclImdb/test/pos',
                                        'dataset/aclImdb/test/neg')

'''
optional: Analyzing the Dataset
'''
print("Categories:", np.unique(train_labels))
print("Number of unique words:", len(np.unique(np.hstack(train_data))))

# invert word_index to map integer indices back to words
'''
# Simple Vectoring data
print('Vectoring data')
X_train = pre.vectorize_sequences(train_data)
X_test = pre.vectorize_sequences(test_data)
'''
from preprocess import cross_10folds, loadData
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from numpy import *
import time

# Choose the dataset here:
# "small" is the small dataset
# "farm-ads" is the full dataset
fileName = r"data\small"  # test the small dataset
# fileName = r"data\farm-ads"  # test the full dataset

totdata_x, totdata_y = loadData(fileName)  # load the data
startTime = time.time()

# Any of the following algorithms can be selected
handw_KNN(totdata_x, totdata_y)  # test the handwritten KNN
# handw_LDA1(totdata_x, totdata_y)  # test handwritten LDA, variant 1
# handw_LDA2(totdata_x, totdata_y)  # test handwritten LDA, variant 2
# sklearn_LDA(totdata_x, totdata_y)  # test sklearn's LDA
# handw_SVM(totdata_x, totdata_y)  # test the handwritten SVM
# sklearn_SVM(totdata_x, totdata_y)  # test sklearn's SVM
endTime = time.time()
"""
Created on Mon May 25 10:37:03 2020

@author: ASUS
"""
import keras
import imdb
import numpy as np
import preprocess as pre
import tf_idf as tfidf
import pickle
import matplotlib.pyplot as plt

print('Loading test data')
filename1 = 'dataset/aclimdb/test/neg'
filename = 'dataset/aclimdb/test/pos'
Comments, labels = pre.loadData(filename, filename1)


def analyzeModel(modelType='tfidf', feat=10000, Word=Comments, labels=labels):
    if modelType == 'BOW':
        print('Loading model')
        model = keras.models.load_model('models/BOW_default_40000.h5')
        model.summary()
        fhis = open('report/BOW_default_40000.txt', 'rb')
        training_detail(fhis)
        fhis.close()
        word_index = imdb.get_word_index()
        print('\nPreprocessing data')
        Words = pre.setOfWordsListToVecTor(word_index, Word)
        Words = pre.conf_data(x_train=Words, num_words=10000)
        Words = pre.vectorize_sequences(Words)
    labels = []
    for idx, row in test.iterrows():
        if not row["content"]:
            continue
        feed_dict, label = get_feed_dict(row)
        predicted_logits = sess.run(logits, feed_dict=feed_dict)
        predicted = util.normalize_predictions(predicted_logits[0][0][1:-1])
        print(row["content"][:50], "\n",
              "Actual:", label,
              "\nPredicted:", predicted, "\n")
        predictions.append(predicted)
        labels.append(label)
    util.print_summary(labels, predictions)


if is_train:
    train_model()
    cv_test = loadData("./data/split_data/validate.csv")
    test_model(cv_test)
else:
    test = loadData("./data/split_data/test.csv")
    test_model(test)
def main():
    p = Path("./result")
    if not p.exists():
        os.makedirs(p)

    parser = argparse.ArgumentParser(
        description='Bioinf project. The arguments can be passed in any order.')

    classes = parser.add_mutually_exclusive_group()
    classes.add_argument('-cl2',
                         help='in order to classify two cancer types.',
                         action='store_true')
    classes.add_argument('-cl3',
                         help='in order to classify two cancer types AND sane.',
                         action='store_true')

    classifier = parser.add_mutually_exclusive_group()
    classifier.add_argument('-svm',
                            help='train a Support Vector Machine classifier',
                            action='store_true')
    classifier.add_argument('-knn',
                            help='train a K Nearest Neighbors classifier',
                            action='store_true')
    classifier.add_argument('-rforest',
                            help='train a Random Forest classifier',
                            action='store_true')
    classifier.add_argument('-kmeans',
                            help='train a Kmeans clustering',
                            action='store_true')
    classifier.add_argument(
        '-hierarc',
        help='train an Agglomerative Hierarchical clustering',
        action='store_true')

    inbalance = parser.add_mutually_exclusive_group()
    inbalance.add_argument('-over',
                           help='imbalance: Random Oversampling',
                           action='store_true')
    inbalance.add_argument('-smote',
                           help='imbalance: SMOTE',
                           action='store_true')

    preprocess = parser.add_mutually_exclusive_group()
    preprocess.add_argument(
        '-ttest',
        help='feature selection: t-test per chromosome and per CpG site - 2 classes',
        action='store_true')
    preprocess.add_argument(
        '-fisher',
        help='feature selection: fisher criterion - 3 classes',
        action='store_true')
    preprocess.add_argument('-anova',
                            help='feature selection: anova - 3 classes',
                            action='store_true')
    preprocess.add_argument(
        '-pca',
        help='dimensionality reduction: Principal Component Analysis',
        action='store_true')
    preprocess.add_argument(
        '-lda',
        help='dimensionality reduction: Linear Discriminant Analysis',
        action='store_true')
    preprocess.add_argument(
        '-sfs',
        help='feature selection - wrapper: Step Forward Selection (nearly unfeasible)',
        action='store_true')
    preprocess.add_argument(
        '-ga',
        help='feature selection - wrapper: Genetic Algorithm',
        action='store_true')

    parser.add_argument(
        '-d', '--download', nargs=2,
        help='download Adenoma and Adenocarcinoma and Squamous Cell Neoplasm '
        'data from Genomic Data Common. It needs 2 parameters: '
        'the first parameter is the destination folder; '
        'the second parameter is the number of files to be downloaded for each class',
        action='store')
    parser.add_argument(
        '-ds', '--downloadsane', nargs=2,
        help='download Sane data from Genomic Data Common. '
        'It needs 2 parameters: '
        'the first parameter is the destination folder; '
        'the second parameter is the number of files to be downloaded',
        action='store')
    parser.add_argument(
        '-s', '--store',
        help='concatenate files belonging to the same cancer type and store them in a binary file',
        action='store')
    parser.add_argument(
        '--alpha', type=float, default=0.001,
        help='to set a different ALPHA: t-test parameter - default is 0.001',
        action='store')
    parser.add_argument(
        '--perc', type=float, default=0.95,
        help='to set the PERC of variance explained by the features kept by PCA',
        action='store')
    parser.add_argument(
        '-rs', '--r_state', type=int, default=8,
        help='to set a user defined Random State - default is 8',
        action='store')
    parser.add_argument('--only_chrms_t', default=False,
                        help='select only chrms for ttest',
                        action='store_true')
    parser.add_argument(
        '--crossval',
        help='to do crossvalidation OR in case of unsupervised to plot the Inertia curve',
        action='store_true')
    parser.add_argument('--plot_lc',
                        help='plot the learning curve',
                        action='store_true')
    parser.add_argument(
        '--remove_nan_cpgs', type=str2bool, default=True,
        help='IF True: removes features containing at least one NaN value. '
        'IF False: NaN are substituted by the mean over the feature. '
        'The old file resulting from feature reduction must be eliminated when changing option. '
        'By default it is True.',
        action='store')

    args = parser.parse_args()

    if args.download:
        print("download")
        dgdc.getDataEx(path=args.download[0], file_n=args.download[1])
    if args.downloadsane:
        print("download sane")
        dgdc.getSaneDataEx(path=args.downloadsane[0],
                           file_n=args.downloadsane[1])
    if args.store:
        print("store")
        dgdc.storeDataIntoBinary(path=args.store)
        print("Data stored.")

    # validity checks
    if not args.cl2 and not args.cl3:
        print("insert arg -cl2 for classifying 2 classes OR -cl3 for 3 classes")
        return

    # parameters and variables
    alpha = args.alpha  # alpha parameter for the t-test
    perc = args.perc  # percentage of variance explained
    classes = 2 if args.cl2 else 3
    random_state = args.r_state
    no_nan = args.remove_nan_cpgs
    n_components = 100
    cl.setPlot_lc(args.plot_lc)
    cl.addToName("cl{}".format(classes))
    cl.addToName("rs{}".format(random_state))

    # load data
    print("Loading....")
    x, y, chrms_pos = pr.loadData(classes=classes)
    if no_nan:
        cl.addToName("no_nan")
        length = x.shape[1]
        x = pr.removeNanFeature(x)
        print("{} NaN features removed!".format(length - x.shape[1]))
    print("Loaded!")
    x_train, x_test, y_train, y_test = sk.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=random_state)
    del x, y

    # preprocess
    if args.ttest:
        if classes != 2:
            print("wrong number of classes")
            return
        #print("Start ttest axis={}....".format(args.ttest))
        r, cpg_r = pr.compute_t_test(x_train, y_train, chrms_pos, alpha,
                                     random_state, axis=0, remove_nan=no_nan)
        print(r)
        cl.addToName("ttest{}".format(args.ttest))
        length = x_train.shape[1]
        x_train, x_test = pr.removeFeatures(x_train, x_test, cpg_r, chrms_pos,
                                            args.only_chrms_t,
                                            remove_nan=no_nan,
                                            y_train=y_train)
        print("Features removed: {}".format(length - x_train.shape[1]))
        print("End ttest!")
    if args.ga:
        print("genetic algorithm")
        cl.addToName("ga")
        # to work with fewer components:
        # x_train = x_train[:, 1:100]
        result = g.GA_function(x_train, y_train, random_state, classes, 0.1)
        path = Path('./data/GA_{}_{}.npy'.format(random_state, classes))
        np.save(path, result)
        x_train = x_train[:, result]
        x_test = x_test[:, result]
    if args.pca:
        print("pca")
        cl.addToName("pca")
        x_train, x_test = pr.pca_function(x_train, x_test, y_train, y_test,
                                          classes, perc, random_state,
                                          name=cl.name, remove_nan=no_nan)
    if args.lda:
        #print("lda - {} components".format(args.lda))
        cl.addToName("lda")
        x_train, x_test = pr.lda_function(x_train, x_test, y_train, y_test,
                                          classes, args.lda, random_state,
                                          cl.name)
    if args.fisher:
        if classes != 2:
            print("wrong number of classes")
            return
        #cl.addToName("fisher{}".format(args.fisher))
        cl.addToName("fisher")
        print("fisher")
        # if best=True selects the n best features, if False the worst n
        # features (for debugging)
        x_train, x_test = pr.fisher_function(x_train, x_test, y_train, y_test,
                                             random_state, best=True,
                                             n=n_components,
                                             remove_nan=no_nan)
    if args.sfs:
        if classes != 2:
            print("wrong number of classes")
            return
        print("Start sfs....")
        feat_col = pr.sfs(x_train, x_test, y_train, y_test, chrms_pos, alpha,
                          random_state)
        x_train = x_train[:, feat_col]
        x_test = x_test[:, feat_col]
    if args.anova:
        if classes != 3:
            print("wrong number of classes")
            return
        print("anova")
        cl.addToName("anova")
        x_train, x_test = pr.anova_function(x_train, x_test, y_train, y_test,
                                            alpha, random_state,
                                            remove_nan=no_nan)

    # imbalance
    if args.over:
        print("over")
        x_train, y_train = pr.imbalance(x_train, y_train, "over", random_state)
        cl.addToName("over")
    if args.smote:
        print("smote")
        x_train, y_train = pr.imbalance(x_train, y_train, "smote",
                                        random_state)
        cl.addToName("smote")
    cl.random_state(random_state)

    # classify
    if args.svm:
        print("svm")
        cl.svm(x_train, x_test, y_train, y_test, classes=classes,
               crossval=args.crossval)
    if args.knn:
        print("knn")
        cl.knn(x_train, x_test, y_train, y_test, classes=classes,
               crossval=args.crossval)
    if args.rforest:
        print("rforest")
        cl.random_forest(x_train, x_test, y_train, y_test, classes=classes,
                         crossval=args.crossval)
    if args.kmeans:
        print("kmeans")
        uc.kmeans(x_train, x_test, y_train, y_test, classes=classes,
                  random_state=random_state, crossval=args.crossval)
    if args.hierarc:
        print("hierarchical clustering")
        uc.hierarchical(x_train, x_test, y_train, y_test, classes=classes,
                        random_state=random_state, crossval=args.crossval)

    print("Log name: {}.log".format(cl.name))
    handlers = log.getLogger().handlers[:]
    for handler in handlers:
        handler.close()
        log.getLogger().removeHandler(handler)
    nf = p / cl.name
    if not nf.exists():
        os.makedirs(nf)
    npath = Path(nf / '{}.log'.format(cl.name))
    i = 1
    while npath.exists():
        npath = Path(nf / '{}_{}.log'.format(cl.name, i))
        i += 1
    os.rename('log.log', npath)