def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add_noise.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Additive noise model AB', ['A', 'B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A', 'B'], f.add_noise_model_BA)]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
def merge_data():
    # print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print list(zip(train_pairs, list(xrange(0, 4))))
    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        # print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix

        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)

        if "X_merged" not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
        print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
def ext():
    # Read the pairs
    print "Read Pairs"
    print "Read Train"
    train = d.read_train_pairs()
    print "Read Valid"
    valid = d.read_valid_pairs()
    print "Read Sup1"
    sup1 = d.read_sup1_train_pairs()
    print "Read Sup2"
    sup2 = d.read_sup2_train_pairs()
    print "Read Sup3"
    sup3 = d.read_sup3_train_pairs()

    # Get the feature extractor
    combined = feat.feature_extractor()

    # Extract the features
    print 'Extract the features'
    print "Extract Train"
    train_att = combined.fit_transform(train)
    print "Extract Valid"
    valid_att = combined.fit_transform(valid)
    print "Extract Sup1"
    sup1_att = combined.fit_transform(sup1)
    print "Extract Sup2"
    sup2_att = combined.fit_transform(sup2)
    print "Extract Sup3"
    sup3_att = combined.fit_transform(sup3)

    print "Join"
    total_new_att = np.vstack((train_att, valid_att, sup1_att, sup2_att, sup3_att))

    # Save extracted data
    np.save('total_new_att.npy', total_new_att)
def main(argv):
    n = None
    try:
        opts, args = getopt.getopt(argv, "ht:s:", ["train=", "settings="])
    except getopt.GetoptError:
        print 'test.py -t <train number> -s <settings file>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -t <train number>'
            sys.exit()
        elif opt in ("-t", "--train"):
            n = int(arg)
        elif opt in ("-s", "--settings"):
            settings = arg

    print("Reading in the training data")
    train = data_io.read_train_pairs(settings)
    target = data_io.read_train_target(settings)

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier, settings)
def extrair_tudo():
    combined = new_features1()

    print "Train"
    train = d.read_train_pairs()
    train_att = combined.fit_transform(train)
    np.save(open("train_att.npy", "wb"), train_att)

    print "Valid"
    valid = d.read_valid_pairs()
    valid_att = combined.fit_transform(valid)
    np.save(open("valid_att.npy", "wb"), valid_att)

    print "Sup1"
    sup1 = d.read_sup1_train_pairs()
    sup1_att = combined.fit_transform(sup1)
    np.save(open("sup1_att.npy", "wb"), sup1_att)

    print "Sup2"
    sup2 = d.read_sup2_train_pairs()
    sup2_att = combined.fit_transform(sup2)
    np.save(open("sup2_att.npy", "wb"), sup2_att)

    print "Sup3"
    sup3 = d.read_sup3_train_pairs()
    sup3_att = combined.fit_transform(sup3)
    np.save(open("sup3_att.npy", "wb"), sup3_att)
def extract_train_features():
    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading in the training data")
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    #X = X.iloc[1:7]
    #y = y.iloc[1:7]

    print("Extracting features: " + str(X.shape))
    extracted = features.fit_transform(X, y, type_map=data_io.read_train_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)
    return X
def getTrainingDataset(self):
    print "Reading in the training data"
    train = data_io.read_train_pairs()
    print "Reading the information about the training data"
    train2 = data_io.read_train_info()
    train["A type"] = train2["A type"]
    train["B type"] = train2["B type"]
    return train
def getDataset(self):
    if self.getTrain:
        readData = data_io.read_train_pairs()
        readData2 = data_io.read_train_info()
    else:
        readData = data_io.read_valid_pairs()
        readData2 = data_io.read_valid_info()
    readData["A type"] = readData2["A type"]
    readData["B type"] = readData2["B type"]
    return readData
def main():
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()
    ## for row in train.iloc[0:4].iterrows():
    for ind in train.index[0:20]:
        pl.scatter(train['A'][ind], train['B'][ind], marker=".")
    pl.show()
def main():
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A', 'B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),  # Apologies for this weird feature definition mechanism - it is a quick hack to prevent duplicated computation
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A', 'B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
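# Hypothetical sketch of how the 'derived' entries above might be evaluated;
# the real f.apply_features is defined elsewhere, and the per-row output[key]
# list and the use of eval() here are assumptions inferred from the expression
# strings (e.g. 'abs(output[key][2])' refers to the third feature computed for
# the same row), which is what the inline "quick hack" comment alludes to.
def apply_features_sketch(df, features):
    output = {}
    for key, row in df.iterrows():
        output[key] = []
        for name, columns, fn in features:
            if columns == 'derived':
                # Derived features reference earlier results for this row
                value = eval(fn)
            elif isinstance(columns, list):
                value = fn(row[columns[0]], row[columns[1]])
            else:
                value = fn(row[columns])
            output[key].append(value)
    return output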
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
def useEMMeans():
    data = data_io.read_train_pairs()
    output = data_io.read_train_target()
    y = np.array(output)

    n_components = 5
    covariance_type = 'full'  # alternatives 'diag', 'spherical'
    num_datas = len(data)

    #means = np.zeros((num_datas, n_components, 2))
    #for i in range(num_datas):
    #    X = np.array([data.A[i], data.B[i]]).T
    #    g = GMM(n_components=n_components)
    #    g.fit(X)
    #    means[i, :, :] = g.means_
    means = np.load('gmm_means.npy')

    X = means[:, :, 1]
    y = y[:, 0]

    from sklearn.linear_model import LinearRegression, Perceptron
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier

    k = 20
    n = range(0, len(X) / 2 + k)
    npp = range(len(X) / 2 + k, len(X))

    def dostuff(csfr, X, y, n, npp):
        csfr.fit(X[n, :], y[n])
        yhat = csfr.predict(X[npp, :])
        print 1.0 * sum(yhat == y[npp]) / len(yhat)

    linreg = LinearRegression(normalize=True)
    dostuff(linreg, X, y, n, npp)
    p = Perceptron()
    dostuff(p, X, y, n, npp)
    dt = DecisionTreeClassifier()
    dostuff(dt, X, y, n, npp)
    knn = KNeighborsClassifier(n_neighbors=2)
    dostuff(knn, X, y, n, npp)

    r = np.random.randint(-1, 3, len(y[npp]))
    r[r == 2] = 0
    print 1.0 * sum(r == y[npp]) / len(r)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()
    print train

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    features = [x[0] for x in classifier.steps[0][1].features]
    csv_fea = csv.writer(open('features.csv', 'wb'))
    imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
                 key=lambda tup: tup[1], reverse=True)
    for fea in imp:
        print fea[0], fea[1]
        csv_fea.writerow([fea[0], fea[1]])

    oob_score = classifier.steps[1][1].oob_score_
    print "oob score:", oob_score
    logger = open("run_log.txt", "a")
    logger.write("\n" + str(oob_score) + "\n")

    print("Saving the classifier")
    data_io.save_model(classifier)

    print("Predicting the train set")
    train_predict = classifier.predict(train)
    train_predict = train_predict.flatten()
    data_io.write_submission(train_predict, 'train_set', run='train')

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
def main():
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()
    X, y, info = exploit_symmetries(X, y, info)

    print X.shape, y.shape
    print "-1", len(y[y['Target'] == -1])
    print "0", len(y[y['Target'] == 0])
    print "1", len(y[y['Target'] == 1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]
    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('ICGI entropy AB', ['A', 'B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A', 'B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A', 'B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A', 'B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]#,
                #('ICGI entropy AB PIT', ['A', 'B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A', 'B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A', 'B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A', 'B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
def main():
    numRows = 10
    targetVal = 0
    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:t:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-t":
            targetVal = int(a)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print "Reading " + str(numRows) + " rows in the training data"
    train = data_io.read_train_pairs(numRows)
    target = data_io.read_train_target(numRows)

    train.A = train.A.div(train.A.apply(max) - train.A.apply(min))
    train.B = train.B.div(train.B.apply(max) - train.B.apply(min))
    train = train.convert_objects(convert_numeric=True)
    # train = train.to_numeric()

    for i in range(1, numRows):
        if target.Target[i] == targetVal:
            A = train.iloc[i, :].A
            B = train.iloc[i, :].B
            plt.figure(i)
            plt.plot(range(len(A)), A)
            plt.plot(range(len(B)), B)
            plt.savefig('plots/' + str(targetVal) + '_' + str(i) + '.png')
def main():
    extractor = feature_extractor()

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    print("Extracting features from training data")
    train_features = extractor.fit_transform(train[:])

    print("Reading in the ensemble training data")
    ensemble_train = data_io.read_ensemble_train_pairs()
    print("Extracting features from ensemble training data")
    ensemble_train_features = extractor.fit_transform(ensemble_train[:])

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()
    print("Extracting features from validation data")
    valid_features = extractor.fit_transform(valid[:])

    all_features = np.concatenate((train_features, ensemble_train_features, valid_features))

    print("Concatenating names")
    train_names = [train.irow(i).name for i in range(len(train))]
    ensemble_train_names = [ensemble_train.irow(i).name for i in range(len(ensemble_train))]
    valid_names = [valid.irow(i).name for i in range(len(valid))]
    all_names = train_names + ensemble_train_names + valid_names

    print("Writing feature file")
    feature_names = ['Number of Samples',
                     'A: Number of Unique Samples',
                     'B: Number of Unique Samples',
                     'A: Normalized Entropy',
                     'B: Normalized Entropy',
                     'Pearson R',
                     'Pearson R Magnitude',
                     'Entropy Difference']
    data_io.write_real_features('benchmark_features', all_names, all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Kendall tau', ['A', 'B'], f.kendall),
                ('Kendall tau p', ['A', 'B'], f.kendall_p),
                ('Mann Whitney', ['A', 'B'], f.mannwhitney),
                ('Mann Whitney p', ['A', 'B'], f.mannwhitney_p),
                #('Wilcoxon', ['A', 'B'], f.wilcoxon),
                #('Wilcoxon p', ['A', 'B'], f.wilcoxon_p),
                ('Kruskal', ['A', 'B'], f.kruskal),
                ('Kruskal p', ['A', 'B'], f.kruskal_p),
                ]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Injectivity 10', ['A', 'B'], f.injectivity_10),
                ('Injectivity 15', ['A', 'B'], f.injectivity_15),
                ('Injectivity 20', ['A', 'B'], f.injectivity_20),
                ('Injectivity 25', ['A', 'B'], f.injectivity_25),
                ('Injectivity 30', ['A', 'B'], f.injectivity_30),
                ('Injectivity 35', ['A', 'B'], f.injectivity_35),
                ('Injectivity 40', ['A', 'B'], f.injectivity_40)]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
    proba_nz = proba[np.nonzero(proba)]
    probb_nz = probb[np.nonzero(probb)]
    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == "__main__":
    print "Reading in {} data...".format(DATA)
    if DATA == "train":
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == "valid":
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError

    print "Saving coded info matrix..."
    codes = np.zeros(info.values.shape)
    lookup = {"Numerical": 1, "Categorical": 2, "Binary": 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]
    savemat("matlab/{}info.mat".format(DATA), {"codes": codes}, oned_as="column")
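# Side note on the hpos fragment at the top of this file: assuming proba and
# probb are normalized probability vectors, -sum(p*q*log(p*q)) over their outer
# product is the entropy of the independent joint distribution, which
# decomposes as H(A) + H(B); dropping zero entries first leaves the sum
# unchanged while avoiding log(0). A small illustrative check (hypothetical
# helper, not used by the code above):
def _joint_entropy_check():
    p_a = np.array([0.5, 0.25, 0.25])
    p_b = np.array([0.7, 0.3])
    joint = np.outer(p_a, p_b)
    h_joint = -np.sum(joint * np.log(joint))
    h_a = -np.sum(p_a * np.log(p_a))
    h_b = -np.sum(p_b * np.log(p_b))
    assert np.isclose(h_joint, h_a + h_b)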
def get_pipeline():
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify", RandomForestRegressor(n_estimators=50,
                                                verbose=2,
                                                n_jobs=2,
                                                min_samples_split=10,
                                                random_state=1,
                                                compute_importances=True))]
    return Pipeline(steps)


if __name__ == "__main__":
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    info['iindex'] = range(4050)
    train = train_raw.join(info)

    classifier = get_pipeline()

    ### FOLDS CODE
    # folds = cval.KFold(len(train), n_folds=2, indices=False)
    #
    # results = []
    # for i, fold in enumerate(folds):
    #     print("Extracting features and training model for fold " + str(i))
    #     traincv, testcv = fold
def main():
    global cf
    start = time.clock()
    numRows = None
    cv = False
    nfold = 10
    clf_keys = ["rfg"]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:c:m:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-c":
            cv = True
            nfold = int(a)
        elif o == "-m":
            if a == "all":
                clf_keys = []
                for clf_key in cf.get_all_keys():
                    clf_keys.append(clf_key)
            elif cf.is_valid_key(a):
                clf_keys = [a]
            else:
                print "ERROR: wrong classifier name: " + a
        elif o == "-h":
            print 'options:'
            print "\t -n [number of rows]"
            print "\t -c [number of folds]"
            print "\t -m [classifier key | all]"
            sys.exit(0)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print("Reading in the training data")
    train = data_io.read_train_pairs(numRows)
    trainInfo = data_io.read_train_info(numRows)
    train['A type'] = trainInfo['A type']
    train['B type'] = trainInfo['B type']
    target = data_io.read_train_target(numRows)

    if cv:
        data = {}
        data['train'] = train
        data['target'] = target
        for clf_key in clf_keys:
            print "Initiating " + str(nfold) + " fold cross validation with classifier " + cf.get_classifier_name(clf_key)
            crossvalidate(data, nfold, clf_key)
    else:
        for clf_key in clf_keys:
            start_train = time.clock()
            print("Extracting features and training model")
            classifier = get_pipeline(clf_key)
            classifier.fit(train, target.Target)
            print("Saving the classifier")
            data_io.save_model(classifier, clf_key)
            end_train = time.clock()
            print 'time taken:', end_train - start_train, 'seconds'

    end = time.clock()
    print 'Execution time:', round(end - start, 2)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train = train.append(sup)
        target = target.append(sup_target)

    print "Train size = ", str(train.shape)

    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train)

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target, random_state=1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)

    print("Saving the classifier")
    data_io.save_model(classifier)

    testX = numpy.nan_to_num(testX)
    print "Score on held-out test data ->", classifier.score(testX, testY)

    #features = [x[0] for x in classifier.steps[0][1].features]
    #csv_fea = csv.writer(open('features.csv', 'wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0], fea[1]])

    feature_importance = classifier.feature_importances_
    logger = open("feature_importance.csv", "a")
    for fi in feature_importance:
        logger.write(str(fi))
        logger.write("\n")

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()
    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)

    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train)
    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features]
    #csv_fea = csv.writer(open('features.csv', 'wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0], fea[1]])

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)