scores = cross_val_score(clf, X_new, y_2d, cv=5, scoring='accuracy') print "accuracy scores with 5 fold cross validation with reduced features for ", to_predict, scores print "mean of accuracy", scores.mean() if __name__ == '__main__': # raw_path_string = raw_input("Enter path where data is located (Location of accession number dirs): ") # csv_path = raw_input("Enter path of directory to store csv files: ") # train_path = raw_input("Enter path of train csv file (Path upto p1_train.csv): ") slash = "/" raw_path_string = '/home/rasika/Documents/Computational Biology/Project/Data' csv_path = '/home/rasika/Documents/Computational Biology/Project/Result' train_path = '/home/rasika/Documents/Computational Biology/Project/p1_train_pop_lab.csv' # make csv files from quant.sf files make_csv.make_csv_files(raw_path_string + slash, csv_path, slash) colnames1 = ['TPM'] classifier_input = list() label_dict = {} # store the labels from train file in a dictionary train_data = pd.read_csv(train_path, sep=',', header=0, dtype='unicode') for i, row in train_data.iterrows(): label_dict[row[0]] = (row[1], row[2]) classifier_input = list() print "Starting reading csv files" print datetime.datetime.now()
print "accuracy scores with 5 fold cross validation for Population DT", scores print "mean of accuracy", scores.mean() if __name__ == '__main__': raw_path_string = raw_input("Enter path where data is located (Location of accession number dirs): ") csv_path = raw_input("Enter path of directory to store csv files: ") train_path = raw_input("Enter path of train csv file (Path upto p1_train.csv): ") slash = "\\" # raw_path_string = '/home/rasika/Documents/Computational Biology/Project/Data' # csv_path = '/home/rasika/Documents/Computational Biology/Project/Result' # train_path = '/home/rasika/Documents/Computational Biology/Project/p1_train_pop_lab.csv' # make csv files from quant.sf files colnames1 = ['TPM','Length'] make_csv.make_csv_files(raw_path_string + slash, csv_path, slash, ['Name'] + colnames1) classifier_input = list() label_dict = {} # store the labels from train file in a dictionary train_data = pd.read_csv(train_path, sep=',', header=0, dtype='unicode') for i, row in train_data.iterrows(): label_dict[row[0]] = (row[1], row[2]) classifier_input = list() print "Starting reading csv files" print datetime.datetime.now() files = listdir(csv_path)
def main():
    """Build the feature dataframe from command-line arguments and score it.

    Positional values are read from ``sys.argv``:

    1. ``model_dump_path``    -- stored in the module-level global.
    2. ``raw_path_string``    -- path prefix handed to
       ``make_csv.make_csv_files`` (with ``slash`` appended).
    3. ``csv_path``           -- stored globally; directory that is listed
       for csv files.
    4. ``train_path``         -- stored globally; csv file read for labels.
    5. ``slash``              -- stored globally; separator appended to the
       raw path.
    6. ``eq_class``           -- the exact string ``'False'`` selects
       ``create_dataframe``; any other value selects
       ``parse_eq_classes.create_dataframe_with_eq_class``.
    7. ``dataframe_csv_path`` -- forwarded to the equivalence-class builder.

    Returns:
        dict with the keys ``'Population'``, ``'Sequence Center'`` and
        ``'Population and Sequence Center'``; each value is the 2-tuple
        (f1 score, accuracy) unpacked from the matching predictor call.

    Raises:
        IndexError: if fewer than seven arguments follow the script name.
    """
    global model_dump_path, slash, csv_path, train_path

    argv = sys.argv
    model_dump_path = argv[1]
    raw_path_string = argv[2]
    csv_path = argv[3]
    train_path = argv[4]
    slash = argv[5]
    eq_class = argv[6]
    dataframe_csv_path = argv[7]

    value_columns = ['TPM', 'Length']

    # Convert the quant.sf files into csv files before anything is read back.
    make_csv.make_csv_files(
        raw_path_string + slash, csv_path, slash, ['Name'] + value_columns)

    # Map the first column of each training row to the pair formed by its
    # second and third columns (the labels looked up later).
    train_data = pd.read_csv(train_path, sep=',', header=0, dtype='unicode')
    label_dict = {
        row[0]: (row[1], row[2]) for _, row in train_data.iterrows()
    }

    print("Started reading csv files")
    print(datetime.datetime.now())

    # The directory listing is taken in both modes, although only the plain
    # (non equivalence-class) builder receives it.
    files = listdir(csv_path)
    if eq_class != 'False':
        df = parse_eq_classes.create_dataframe_with_eq_class(
            raw_path_string, csv_path, train_path, slash, label_dict,
            dataframe_csv_path)
    else:
        df = create_dataframe(
            files, ['TPM', 'Length'], value_columns, label_dict)

    print("Read all csv files, created dataframe")
    print(datetime.datetime.now())

    # Expected dataframe layout, per the original author's notes
    # (NOTE(review): not verifiable from this function alone):
    #   Name       TPM_1  TPM_2  ...  TPM_199324  label
    #   ERR188021  value  value  ...  value       TSI
    #   ERR188022  .      .      ...  .           CEU

    # Run the three predictors in a fixed order and collect their results.
    predictors = (
        ('Population', predict_population),
        ('Sequence Center', predict_sequence_center),
        ('Population and Sequence Center', predict_population_seq_center),
    )
    scores = {}
    for target, predict in predictors:
        f1_value, accuracy_value = predict(df)
        scores[target] = (f1_value, accuracy_value)
    return scores