# Debug: show which records contributed examples, and the total example count.
print("all rec_name array is: " + str(all_rec_name_array))
print("size of all rec name array i.e num of egs is : " + str(len(all_rec_name_array)))

# Generate boolean class labels per source database; the combined label vector
# follows the same ordering as the concatenated record-name arrays above.
# Earlier integer-label variants kept for reference:
# y_afpdb=cl.generate_labels("afpdb",rec_name_array_afpdb)
# y_nsrdb=cl.generate_labels("nsrdb",rec_name_array_nsrdb)
# y_afdb=cl.generate_labels("afdb",rec_name_array_afdb)
y_aftdb = cl.generate_labels_bool("aftdb", rec_name_array_aftdb)
y_afpdb_patient = cl.generate_labels_bool("afdb", rec_name_array_afpdb_patient)
y_afpdb_normal = cl.generate_labels_bool("nsrdb", rec_name_array_afpdb_normal)
y_all = np.array(y_aftdb + y_afpdb_patient + y_afpdb_normal)
#y_all=cl.generate_labels("afpdb", rec_name_array)
#print ("all label array is: " + str(y_all))

# Convert the list-of-lists feature representation into a matrix, then remove
# feature column index 3 (axis=1). NOTE(review): the dropped column's meaning
# is not visible here — confirm against the feature-extraction step.
all_feature_matrix_old = cl.covert_array_to_matrix(all_features)
print("shape of all feature matrix is: " + str(all_feature_matrix_old.shape))
all_feature_matrix = np.delete(all_feature_matrix_old, 3, 1)
print("shape of all feature matrix is: " + str(all_feature_matrix.shape))

#################### SEPARATING EVALUATION DATA #########################
#X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(all_feature_matrix, y, test_size=0.2, random_state=0)

###############################################################################
# Classification
# Run classifier with cross-validation and plot ROC curves
# NOTE(review): StratifiedKFold(y, n_folds=...) is the pre-0.18 scikit-learn
# signature — confirm the pinned sklearn version before upgrading.
folds = 10
cv = StratifiedKFold(y_all, n_folds=folds, shuffle=True)
# Restore the record-name list pickled during feature extraction.
rec_name_array = rw.read_features_frm_file(output_folder, "rec_name_array_pickle.txt")

##################### change key value pairs of global vocab ####################
# Invert the vocabulary so it maps feature-index -> feature-name, then dump
# the feature-name list to disk for inspection.
inv_global_vocab = dict(zip(global_vocab.values(), global_vocab.keys()))
#print type(inv_global_vocab.values())
all_features_list = inv_global_vocab.values()
np.savetxt(output_folder + "all_features_list.txt", all_features_list, fmt="%s", delimiter=',', newline='\n')

# Class labels, one per record.
y = np.array(cl.generate_labels(rec_name_array))
print("label array is: " + str(y))

# Densify: rows = examples, columns = vocabulary slots (max index + 1).
all_feature_matrix = cl.covert_array_to_matrix(all_features, len(all_features), max(global_vocab.values()) + 1)
#print all_feature_matrix
#print ("type of all feature matrix is: " + str(type(all_feature_matrix)))

#################### SEPARATING EVALUATION DATA #########################
# Hold out 20% for evaluation; random_state pinned so the split is repeatable.
X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(
    all_feature_matrix, y, test_size=0.2, random_state=0)

# Mean/variance normalisation applied separately to each partition.
X_cv_normalized_matrix = cl.normalise_mean_var(X_cv)
X_eval_normalized_matrix = cl.normalise_mean_var(X_eval)

############## with normalisation ######################
# Classification
normalised = " "
##################### change key value pairs of global vocab ####################
# Build the reverse vocabulary (feature-index -> feature-name) and persist the
# ordered feature-name list for later inspection.
inv_global_vocab = {v: k for k, v in global_vocab.items()}
#print type(inv_global_vocab.values())
all_features_list = inv_global_vocab.values()
np.savetxt(output_folder + "all_features_list.txt", all_features_list,
           fmt="%s", delimiter=',', newline='\n')

# One class label per record.
y = np.array(cl.generate_labels(rec_name_array))
print("label array is: " + str(y))

# Dense feature matrix: len(all_features) rows, one column per vocab slot.
all_feature_matrix = cl.covert_array_to_matrix(
    all_features, len(all_features), max(global_vocab.values()) + 1)
#print all_feature_matrix
#print ("type of all feature matrix is: " + str(type(all_feature_matrix)))

#################### SEPARATING EVALUATION DATA #########################
# 20% evaluation hold-out with a fixed seed for reproducibility.
X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(
    all_feature_matrix, y, test_size=0.2, random_state=0)

# Normalise each partition independently (mean/variance scaling).
X_cv_normalized_matrix = cl.normalise_mean_var(X_cv)
X_eval_normalized_matrix = cl.normalise_mean_var(X_eval)

############## with normalisation ######################
# Classification
normalised = " "
# Earlier per-database label variants, kept for reference:
#y_aftdb=cl.generate_labels("aftdb",rec_name_array_aftdb)
#y_afpdb_patient=cl.generate_labels("afdb",rec_name_array_afpdb_patient)
#y_afpdb_normal=cl.generate_labels("nsrdb",rec_name_array_afpdb_normal)
#y_all=np.array(y_afpdb_patient+y_afpdb_normal)
y_all = cl.generate_labels("afpdb", rec_name_array)
print("all label array is: " + str(y_all))

##################### change key value pairs of global vocab ####################
# Reverse vocabulary: feature-index -> feature-name; dump the names to disk.
inv_global_vocab = dict(zip(global_vocab.values(), global_vocab.keys()))
#print type(inv_global_vocab.values())
all_features_list = inv_global_vocab.values()
np.savetxt(output_folder + "all_features_list.txt", all_features_list, fmt="%s", delimiter=',', newline='\n')

# Combine lists and convert the list of lists into one big matrix.
all_feature_matrix = cl.covert_array_to_matrix(all_features)
#all_feature_matrix=cl.covert_array_to_matrix(all_features_afpdb_patient+all_features_afpdb_normal);
#normalized_matrix=cl.normalise_mean_var(all_feature_matrix)
#print all_feature_matrix
print("shape of all feature matrix is: " + str(all_feature_matrix.shape))

#################### GENERATE TRAIN TEST INDICES FOR SHUFFLE SPLIT #########################
#X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(all_feature_matrix, y, test_size=0.2, random_state=0)
# One shuffled 80/20 train/test index split, seeded for repeatability.
# NOTE(review): cross_validation.ShuffleSplit is the pre-0.18 scikit-learn
# API — confirm the pinned sklearn version before upgrading.
cv_shufflesplit = cross_validation.ShuffleSplit(len(y_all), 1, test_size=0.2, train_size=None, random_state=0)

#################### Save feaures and y_all to csv file #########################
# Column indexes in vocabulary order; header list is filled in later.
csv_indexes = sorted(inv_global_vocab.keys())
#print(csv_indexes)
csv_header = []
##################### change key value pairs of global vocab ####################
# Invert the vocabulary (feature-index -> feature-name) and write the ordered
# feature-name list out for inspection.
inv_global_vocab = dict(zip(global_vocab.values(), global_vocab.keys()))
#print type(inv_global_vocab.values())
all_features_list = inv_global_vocab.values()
np.savetxt(output_folder + "all_features_list.txt", all_features_list, fmt="%s", delimiter=',', newline='\n')

# One class label per record.
y = np.array(cl.generate_labels(rec_name_array))
print("label array is: " + str(y))

# Densify the list-of-lists features into a matrix with one row per example.
all_feature_matrix = cl.covert_array_to_matrix(all_features, len(all_features))
#print all_feature_matrix
print("shape of all feature matrix is: " + str(all_feature_matrix.shape))

#################### SEPARATING EVALUATION DATA #########################
#X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(all_feature_matrix, y, test_size=0.2, random_state=0)
# Deliberate early stop: everything below this point is disabled for this run.
exit()

############## with normalisation ######################
# Classification
# normalised=" "
# normalized_matrix=cl.normalise_mean_var(all_feature_matrix)
# rw.write_df_to_csv(normalized_matrix, csv_header, output_folder, "features_normalised_test.csv")
# exit()