def main(argv):
    n = None
    settings = None  # avoids a NameError if -s/--settings is not given
    try:
        opts, args = getopt.getopt(argv, "ht:s:", ["train=", "settings="])
    except getopt.GetoptError:
        print 'test.py -t <train number> -s <settings file>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -t <train number> -s <settings file>'
            sys.exit()
        elif opt in ("-t", "--train"):
            n = int(arg)  # parsed but unused in this variant
        elif opt in ("-s", "--settings"):
            settings = arg
    print("Reading in the training data")
    train = data_io.read_train_pairs(settings)
    target = data_io.read_train_target(settings)
    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)
    print("Saving the classifier")
    data_io.save_model(classifier, settings)
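# Entry-point sketch, assuming this module is run directly as a script
# (the settings filename below is illustrative, not from the source):
#   python test.py -t 3 -s settings.json
if __name__ == "__main__":
    main(sys.argv[1:])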
def extract_train_features():
    start = time.time()
    features = feature_extractor()
    header = [h[0] for h in features.features]
    print("Reading in the training data")
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    #X = X.iloc[1:7]
    #y = y.iloc[1:7]
    print("Extracting features: " + str(X.shape))
    extracted = features.fit_transform(X, y, type_map=data_io.read_train_info())
    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " minutes")
    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)
    return X
def merge_data():
    # Merge every *train_pairs* chunk with its matching publicinfo and
    # target files into a single training set.
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print list(enumerate(train_pairs))
    X_merged = y_merged = info_merged = None
    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        if X_merged is None:
            X_merged, y_merged, info_merged = X, y, inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
        print "Shape thus far", X_merged.shape, y_merged.shape
    return X_merged, y_merged, info_merged
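# `process_indices` is used by merge_data above but not defined in this
# excerpt. A minimal sketch, assuming its job is to keep row labels unique
# across chunks before the DataFrames are appended (hypothetical
# reconstruction, not the author's confirmed code):
def process_indices(X, y, info, chunk_id):
    new_index = ["%d_%s" % (chunk_id, idx) for idx in X.index]
    for df in (X, y, info):
        df.index = new_index
    return X, y, info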
def load_train_set():
    X = data_io.load_matlab_train_features()
    if X is None:
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()
    X = X.fillna(0)
    return X, y
def getdata():
    # The flat feature file stores 24 features per pair.
    total = np.fromfile('extracted/total.np')
    total = total.reshape(total.shape[0] // 24, 24)
    train_target = data_io.read_train_target()
    sup1_target = data_io.read_sup1_train_target()
    sup2_target = data_io.read_sup2_train_target()
    sup3_target = data_io.read_sup3_train_target()
    total_target = np.hstack((train_target.Target, sup1_target.Target,
                              sup2_target.Target, sup3_target.Target))
    return total, total_target
def main():
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()
    # for row in train.iloc[0:4].iterrows():
    for ind in train.index[0:20]:
        pl.scatter(train['A'][ind], train['B'][ind], marker=".")
    pl.show()
def main():
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()
    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)
    print("Saving the classifier")
    data_io.save_model(classifier)
def main():
    X = data_io.load_train_features()
    if X is None:
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()
    #min_max_scaler = preprocessing.MinMaxScaler()
    #X = min_max_scaler.fit_transform(X)
    get_top_features(X, y.Target, 20)
def grid_search():
    X = data_io.load_train_features()
    if X is None:
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()
    tree_depth = [5, 7, 9, 10, 12, 14]
    learning_rate = [0.01, 0.05, 0.1, 0.2]
    scorer = Scorer(X, y)
    # Exhaustive search over the depth / learning-rate grid.
    for d in tree_depth:
        for l in learning_rate:
            r = scorer.score([d, l])
            print "Score", d, l, r
def useEMMeans():
    data = data_io.read_train_pairs()
    output = data_io.read_train_target()
    y = np.array(output)

    n_components = 5
    covariance_type = 'full'  # alternatives: 'diag', 'spherical'; only used by the GMM fit below
    num_datas = len(data)
    # The per-pair GMM means were precomputed once with this loop and cached:
    #means = np.zeros((num_datas, n_components, 2))
    #for i in range(num_datas):
    #    X = np.array([data.A[i], data.B[i]]).T
    #    g = GMM(n_components=n_components)
    #    g.fit(X)
    #    means[i, :, :] = g.means_
    means = np.load('gmm_means.npy')
    X = means[:, :, 1]
    y = y[:, 0]

    from sklearn.linear_model import LinearRegression, Perceptron
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier

    # Simple holdout split: first half plus k rows for training, rest for testing.
    k = 20
    n = range(0, len(X) / 2 + k)
    npp = range(len(X) / 2 + k, len(X))

    def dostuff(csfr, X, y, n, npp):
        csfr.fit(X[n, :], y[n])
        yhat = csfr.predict(X[npp, :])
        print 1.0 * sum(yhat == y[npp]) / len(yhat)

    linreg = LinearRegression(normalize=True)
    dostuff(linreg, X, y, n, npp)
    p = Perceptron()
    dostuff(p, X, y, n, npp)
    dt = DecisionTreeClassifier()
    dostuff(dt, X, y, n, npp)
    knn = KNeighborsClassifier(n_neighbors=2)
    dostuff(knn, X, y, n, npp)
    # Random baseline over {-1, 0, 1} for comparison.
    r = np.random.randint(-1, 3, len(y[npp]))
    r[r == 2] = 0
    print 1.0 * sum(r == y[npp]) / len(r)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()
    print train

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    # Rank features by importance and dump them to CSV.
    features = [x[0] for x in classifier.steps[0][1].features]
    csv_fea = csv.writer(open('features.csv', 'wb'))
    imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
                 key=lambda tup: tup[1], reverse=True)
    for fea in imp:
        print fea[0], fea[1]
        csv_fea.writerow([fea[0], fea[1]])

    # oob_score_ is a scalar for random forests, so it is logged directly.
    oob_score = classifier.steps[1][1].oob_score_
    print "oob score:", oob_score
    logger = open("run_log.txt", "a")
    logger.write("\n" + str(oob_score) + "\n")

    print("Saving the classifier")
    data_io.save_model(classifier)

    print("Predicting the train set")
    train_predict = classifier.predict(train).flatten()
    data_io.write_submission(train_predict, 'train_set', run='train')

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
def main():
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()
    X, y, info = exploit_symmetries(X, y, info)
    print X.shape, y.shape
    # Class balance after symmetrization.
    print "-1", len(y[y['Target'] == -1])
    print "0", len(y[y['Target'] == 0])
    print "1", len(y[y['Target'] == 1])
    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]
    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
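# `exploit_symmetries` is called above but not defined in this excerpt. A
# minimal sketch of the idea, assuming it doubles the training set by swapping
# each A/B pair and negating the target (a hypothetical reconstruction; the
# real helper presumably relabels pair IDs rather than discarding them):
def exploit_symmetries(X, y, info):
    X_swap = X.rename(columns={'A': 'B', 'B': 'A'})[X.columns]
    info_swap = info.rename(columns={'A type': 'B type', 'B type': 'A type'})[info.columns]
    y_swap = -y  # 1 (A causes B) becomes -1 (B causes A) and vice versa; 0 stays 0
    X = X.append(X_swap, ignore_index=True)
    y = y.append(y_swap, ignore_index=True)
    info = info.append(info_swap, ignore_index=True)
    return X, y, info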
def main():
    y = data_io.read_train_target()
    X = data_io.load_train_features()
    if X is None:
        print("No feature file found!")
        exit(1)
    # Join the previously computed feature set onto the new one.
    X_old = data_io.load_features("./Models/old_csv/features_train_en_python.csv")
    print X.shape
    X = X_old.join(X)
    print X.shape
    data_io.save_train_features(X, y)

    X = data_io.load_valid_features()
    X_old = data_io.load_features("./Models/old_csv/features_valid_en_python.csv")
    print X.shape
    X = X_old.join(X)
    print X.shape
    data_io.save_valid_features(X)
def main():
    numRows = 10
    targetVal = 0
    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:t:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)
    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-t":
            targetVal = int(a)
        else:
            print "try help: python train.py -h"
            sys.exit(2)
    print "Reading " + str(numRows) + " rows in the training data"
    train = data_io.read_train_pairs(numRows)
    target = data_io.read_train_target(numRows)
    # Normalize each series by its range.
    train.A = train.A.div(train.A.apply(max) - train.A.apply(min))
    train.B = train.B.div(train.B.apply(max) - train.B.apply(min))
    train = train.convert_objects(convert_numeric=True)
    # train = train.to_numeric()
    # Plot every pair whose target matches targetVal.
    for i in range(1, numRows):
        if target.Target[i] == targetVal:
            A = train.iloc[i, :].A
            B = train.iloc[i, :].B
            plt.figure(i)
            plt.plot(range(len(A)), A)
            plt.plot(range(len(B)), B)
            plt.savefig('plots/' + str(targetVal) + '_' + str(i) + '.png')
def run(self):
    features = f.features
    train = self.getTrainingDataset()
    print "Reading preprocessed features"
    if f.preprocessedFeatures:
        intermediate = data_io.read_intermediate_train()
        for i in f.preprocessedFeatures:
            train[i] = intermediate[i]
        for i in features:
            if i[0] in f.preprocessedFeatures:
                i[1] = i[0]
                i[2] = f.SimpleTransform(transformer=f.ff.identity)
    print "Reading targets"
    target = data_io.read_train_target()
    print "Extracting features and training model"
    classifier = self.getPipeline(features)
    # Map the {-1, 0, 1} target to a one-sided indicator:
    # forward:  x*(x+1)/2  maps {-1, 0, 1} -> {0, 0, 1}
    # backward: -x*(x-1)/2 maps {-1, 0, 1} -> {-1, 0, 0}
    if self.directionForward:
        finalTarget = [x * (x + 1) / 2 for x in target.Target]
    else:
        finalTarget = [-x * (x - 1) / 2 for x in target.Target]
    classifier.fit(train, finalTarget)
    print classifier.steps[-1][1].feature_importances_
    print "Saving the classifier"
    data_io.save_model(classifier)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train = train.append(sup)
        target = target.append(sup_target)
    print "Train size = ", str(train.shape)

    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train)

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target,
                                                    random_state=1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)
    print("Saving the classifier")
    data_io.save_model(classifier)

    testX = numpy.nan_to_num(testX)
    print "Score on held-out test data ->", classifier.score(testX, testY)

    #features = [x[0] for x in classifier.steps[0][1].features]
    #csv_fea = csv.writer(open('features.csv', 'wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
    #             key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0], fea[1]])

    feature_importance = classifier.feature_importances_
    logger = open("feature_importance.csv", "a")
    for fi in feature_importance:
        logger.write(str(fi))
        logger.write("\n")

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
def get_pipeline():
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify", RandomForestRegressor(n_estimators=50,
                                                verbose=2,
                                                n_jobs=2,
                                                min_samples_split=10,
                                                random_state=1,
                                                compute_importances=True))]
    return Pipeline(steps)

if __name__ == "__main__":
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    info['iindex'] = range(4050)
    train = train_raw.join(info)
    classifier = get_pipeline()
    ### FOLDS CODE
    # folds = cval.KFold(len(train), n_folds=2, indices=False)
    #
    # results = []
    # for i, fold in enumerate(folds):
    #     print("Extracting features and training model for fold " + str(i))
    #     traincv, testcv = fold
    #     classifier.fit(train[traincv], target[traincv])
def main():
    global cf
    start = time.clock()
    numRows = None
    cv = False
    nfold = 10
    clf_keys = ["rfg"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:c:m:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)
    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-c":
            cv = True
            nfold = int(a)
        elif o == "-m":
            if a == "all":
                clf_keys = list(cf.get_all_keys())
            elif cf.is_valid_key(a):
                clf_keys = [a]
            else:
                print "ERROR: wrong classifier name: " + a
        elif o == "-h":
            print 'options:'
            print "\t -n [number of rows]"
            print "\t -c [number of folds]"
            print "\t -m [classifier key | all]"
            sys.exit(0)
        else:
            print "try help: python train.py -h"
            sys.exit(2)
    print("Reading in the training data")
    train = data_io.read_train_pairs(numRows)
    trainInfo = data_io.read_train_info(numRows)
    train['A type'] = trainInfo['A type']
    train['B type'] = trainInfo['B type']
    target = data_io.read_train_target(numRows)
    if cv:
        data = {'train': train, 'target': target}
        for clf_key in clf_keys:
            print "Initiating " + str(nfold) + " fold cross validation with classifier " + cf.get_classifier_name(clf_key)
            crossvalidate(data, nfold, clf_key)
    else:
        for clf_key in clf_keys:
            start_train = time.clock()
            print("Extracting features and training model")
            classifier = get_pipeline(clf_key)
            classifier.fit(train, target.Target)
            print("Saving the classifier")
            data_io.save_model(classifier, clf_key)
            end_train = time.clock()
            print 'time taken:', end_train - start_train, 'seconds'
    end = time.clock()
    print 'Execution time:', round(end - start, 2)
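# Example invocations (assuming the script is saved as train.py, as its help
# text suggests; "rfg" is the default classifier key set above):
#   python train.py -n 1000 -m rfg   # train one classifier on 1000 rows
#   python train.py -c 10 -m all     # 10-fold CV over every registered classifier
if __name__ == "__main__":
    main()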
def reverse_auc(labels, predictions):
    # Score the anti-causal direction: treat -1 labels as the positive
    # class and flip the sign of the predictions.
    target_neg_one = [1 if x == -1 else 0 for x in labels]
    neg_predictions = [-x for x in predictions]
    score = metrics.auc(target_neg_one, neg_predictions)
    return score

def bidirectional_auc(labels, predictions):
    score_forward = forward_auc(labels, predictions)
    score_reverse = reverse_auc(labels, predictions)
    score = (score_forward + score_reverse) / 2.0
    return score

if __name__ == "__main__":
    import data_io

    solution = data_io.read_train_target()
    submission = data_io.read_train_predictions()

    score_forward = forward_auc(solution.Target, submission.Target)
    print("Forward AUC: %0.6f" % score_forward)

    score_reverse = reverse_auc(solution.Target, submission.Target)
    print("Reverse AUC: %0.6f" % score_reverse)

    score = bidirectional_auc(solution.Target, submission.Target)
    print("Bidirectional AUC: %0.6f" % score)
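# `forward_auc` is called above but not shown in this excerpt. A minimal
# sketch, assuming it mirrors `reverse_auc` with +1 as the positive class
# (hypothetical reconstruction; in a single module it would have to be
# defined before the __main__ block above):
def forward_auc(labels, predictions):
    target_one = [1 if x == 1 else 0 for x in labels]
    score = metrics.auc(target_one, predictions)
    return score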
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()
    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train)
    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)
    print("Saving the classifier")
    #data_io.save_model((both_classifier, A_classifier, B_classifier, none_classifier))
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features]
    #csv_fea = csv.writer(open('features.csv', 'wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
    #             key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0], fea[1]])

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)