def classification(df, n_estimators, k_fold_times, n_features,
                   experience_name, data_folder, output_folder):
    """Train and evaluate a Random Forest on a subject x bicluster matrix.

    Fits an RF classifier on the bicluster-presence features, reports
    classification metrics with and without stratified k-fold cross
    validation, computes permutation importance (MLxtend) for the top
    features, and exports the most frequent pattern of each of the most
    important biclusters to a tsv file.

    Parameters
    ----------
    df : pandas.DataFrame
        First column is the subject id, last column is the target class;
        the columns in between are bicluster presence features.
    n_estimators : int
        Number of trees for the Random Forest.
    k_fold_times : int
        Number of folds for the stratified cross-validation metrics.
    n_features : int
        Number of top features to report from the permutation importance.
    experience_name : str
        Name of the experience whose biclusters are analysed.
    data_folder, output_folder : path-like
        Input data location and destination for the generated files.

    NOTE(review): also reads the module-level settings `mlxtend_features`,
    `number_perm_runs` and `width_perm_imp_plot`.
    """
    # Convert the dataframe to a matrix; the last column is the target class.
    X = df.values
    y = X[:, -1].astype(int)
    # Drop first (subject id) and last (target class) columns of the data
    # matrix to keep only the feature values (idiomatic slice instead of
    # building an explicit index list with range()).
    X = X[:, 1:-1].astype(int)
    # Feature names, also without the first and last columns.
    feature_names = list(df)[1:-1]

    # Split into training and test set (75% / 25%) with stratification.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0, stratify=y)
    printShapes(X_train, X_test, y_train, y_test)

    # Fit the RF classifier.
    rf = fitRFClassifier(X_train, y_train, n_estimators)
    # Classification metrics without cross validation.
    getClassificationMetrics(rf, X_train, X_test, y_train, y_test)
    # Classification metrics with stratified k-fold cross validation.
    getClassificationMetricsCV(rf, k_fold_times, X, y)

    # Output image file name for the most important features
    # (Random Forest Permutation Importance metric, MLxtend lib).
    mlxtend_features_file_name = mlxtend_features + "_merged_data"
    # Permutation importance values for the top features.
    most_important_features_perm = getPermutationImportanceMLxtend(
        number_perm_runs, rf, X_test, y_test, feature_names,
        width_perm_imp_plot, output_folder, mlxtend_features_file_name,
        n_features)

    # Strip the "Bic_"/"Bic_0" prefixes to recover plain bicluster ids.
    most_important_features_perm = [
        str(int(x.split("_")[1]))
        for x in most_important_features_perm if x.startswith("Bic_")
    ]
    # Dummy experience list so the shared helpers below can be reused.
    exp_list = [[experience_name, most_important_features_perm]]

    # Purest biclusters by experience (from translated_labels files).
    list_bic_file_names = getTranslatedBiclusterFileNames(data_folder,
                                                          exp_list)
    # Bicluster text contents per experience.
    bics_exp = getBiclusterContentsFlatList(exp_list, list_bic_file_names,
                                            False)
    # Replace the bicluster contents on the dictionary with the most
    # frequent pattern for each bicluster.
    bics_exp = getBiclustersMostFreqPatterns(bics_exp)
    # Export the most important N feature (bicluster/meta-feature)
    # patterns to a tsv file.
    writeBiclustersPatternsOutput(bics_exp, output_folder,
                                  mlxtend_features_file_name + ".txt")
def start_processing():
    """Run the discriminative-bicluster classification pipeline.

    Reads the experiences metrics spreadsheet, balances the number of
    discriminative biclusters across classes by random sampling, exports
    the most frequent pattern of each bicluster, and finally runs the RF
    classification restricted to the discriminative biclusters.

    NOTE(review): relies on module-level configuration: `output_folder`,
    `data_folder`, `xlsx_metrics_file_name`, `sheet_name_or_index`,
    `list_classes`, `folder_date`, `num_purest_bics_per_exp_filename`,
    `experience_name`, `patterns_output_filename`, `classif_file_name`,
    `n_estimators`, `k_fold_times` and `n_features`.
    """
    # Create the output folder if it doesn't exist.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Read XLSX with the experience's metrics (sheet PurestBiclusters).
    df = readMetricsExcelData(data_folder, xlsx_metrics_file_name,
                              sheet_name_or_index)
    # Experience names and the ids of their purest biclusters.
    exp_list = getExperiencesWithPurestBiclusters(df, list_classes,
                                                  data_folder, folder_date)

    # Write the number of purest biclusters by experience to a tsv file.
    bics_per_exp = getNumberOfBiclustersPerExperience(exp_list)
    writeNumBicsPerExpOutput(bics_per_exp, output_folder,
                             num_purest_bics_per_exp_filename)

    # Keep only the selected experience (e.g. Exp_14).
    exp_list = [l for l in exp_list if l[0] == experience_name]

    # Minimum number of discriminative biclusters between all classes
    # (min() builtin instead of manual sys.maxsize tracking).
    min_bics = min(len(exp_list[0][1][c]) for c in list_classes)
    for c in list_classes:
        print("class", c, "->", len(exp_list[0][1][c]), "biclusters")

    # Randomly sample the biclusters from the classes with more
    # discriminative biclusters to equalize the number of biclusters
    # considered from each class.
    for c in list_classes:
        if len(exp_list[0][1][c]) > min_bics:
            print("Sampling class", c)
            df_to_sample = pd.DataFrame(exp_list[0][1][c], columns=['Samp'])
            # Sampling fixed seed -> Fibonacci prime number 1597.
            df_to_sample = df_to_sample.sample(n=min_bics, random_state=1597)
            # Sort the bicluster ids numerically, keep them as strings.
            df_to_sample = (df_to_sample.astype(int)
                            .sort_values('Samp')
                            .astype(str))
            exp_list[0][1][c] = df_to_sample['Samp'].values.tolist()
            print("done")

    # Purest biclusters by experience (from translated_labels files).
    list_bic_file_names = getTranslatedBiclusterFileNames(data_folder,
                                                          exp_list)

    print("Getting Bicluster Contents...")
    # Bicluster text contents per experience.
    bics_exp = getBiclusterContents(exp_list, list_bic_file_names, False)
    print("done")

    print("Getting Bicluster Most Frequent Patterns...")
    # Replace the bicluster contents on the dictionary with the most
    # frequent pattern for each bicluster.
    bics_exp = getBiclustersMostFreqPatterns(bics_exp)
    print("done")

    # Output results to a tsv file.
    writeBiclustersPatternsOutput(bics_exp, output_folder,
                                  patterns_output_filename)

    # All discriminative biclusters from the experience.
    list_all_disc_bics = getAllBiclustersFromExperience(exp_list,
                                                        experience_name)
    # Re-add the subject id and target class columns, plus the 'Bic_'
    # prefix (zero-padded when the id is below 10).
    list_features_to_keep = ['Subject ID'] + [
        ('Bic_0' + x) if int(x) < 10 else ('Bic_' + x)
        for x in list_all_disc_bics
    ] + ['group']

    # Run the scikit-learn RF classifier for the discriminative biclusters
    # in the classification file (subject ids x bicluster presence matrix).
    print(
        "********* Classification (Discriminative Bicluster features only) *********"
    )
    # Dataframe from the subject ids x bicluster presence matrix file,
    # restricted to the discriminative biclusters.
    file_path = data_folder / classif_file_name
    df_discriminative = load_to_merge_matrix_data_no_missings(file_path)
    df_discriminative = df_discriminative[list_features_to_keep]

    classification(df_discriminative, n_estimators, k_fold_times,
                   n_features, experience_name, data_folder, output_folder)
def processData():
    """Prepare the experience's bicluster data for the SPMF pattern miner.

    Balances the number of discriminative biclusters per class, extracts
    the bicluster feature|value transactions, builds the original<->SPMF
    integer translation maps (written to temp files), and exports the
    translated transactions in SPMF format.

    NOTE(review): relies on module-level configuration: `xlsx_data_folder`,
    `xlsx_metrics_file_name`, `sheet_name_or_index`, `list_classes`,
    `folder_date`, `experience_name`, `output_folder`,
    `translation_map_original_spmf_file_name`,
    `translation_map_spmf_original_file_name` and
    `processed_data_file_name`.
    """
    # Read XLSX with the experience's metrics (sheet PurestBiclusters).
    df = readMetricsExcelData(xlsx_data_folder, xlsx_metrics_file_name,
                              sheet_name_or_index)
    # Experience names and the ids of their purest biclusters.
    exp_list = getExperiencesWithPurestBiclusters(df, list_classes,
                                                  xlsx_data_folder,
                                                  folder_date)
    # Keep only the selected experience (e.g. Exp_14).
    exp_list = [l for l in exp_list if l[0] == experience_name]

    # Minimum number of discriminative biclusters between all classes
    # (min() builtin instead of manual sys.maxsize tracking).
    min_bics = min(len(exp_list[0][1][c]) for c in list_classes)
    for c in list_classes:
        print("class", c, "->", len(exp_list[0][1][c]), "biclusters")

    # Randomly sample the biclusters from the classes with more
    # discriminative biclusters to equalize the number of biclusters
    # considered from each class.
    for c in list_classes:
        if len(exp_list[0][1][c]) > min_bics:
            print("Sampling class", c)
            df_to_sample = pd.DataFrame(exp_list[0][1][c], columns=['Samp'])
            # Sampling fixed seed -> Fibonacci prime number 1597.
            df_to_sample = df_to_sample.sample(n=min_bics, random_state=1597)
            # Sort the bicluster ids numerically, keep them as strings.
            df_to_sample = (df_to_sample.astype(int)
                            .sort_values('Samp')
                            .astype(str))
            exp_list[0][1][c] = df_to_sample['Samp'].values.tolist()

    # Purest biclusters by experience (from translated_labels files).
    list_bic_file_names = getTranslatedBiclusterFileNames(xlsx_data_folder,
                                                          exp_list)

    # Bicluster features and values per experience (feature|value).
    # Each set of bicluster features is n transactions (n = bicluster
    # height), including the class target + class value as one of the
    # items: group|1 for Patients, or group|2 for Controls.
    bics_features_values = getBiclusterFeaturesAndValues(
        exp_list, list_bic_file_names)

    # Translation maps for the SPMF algorithm, written to temp files.
    translation_map_original_spmf, translation_map_spmf_original = \
        getTranslationMapsSPMF(bics_features_values[experience_name])
    with open(output_folder / translation_map_original_spmf_file_name,
              "w") as outFile:
        for key, value in translation_map_original_spmf.items():
            outFile.write(key + '\t' + value + '\n')
    with open(output_folder / translation_map_spmf_original_file_name,
              "w") as outFile:
        for key, value in translation_map_spmf_original.items():
            outFile.write(key + '\t' + value + '\n')

    # Translate original data to SPMF format:
    # 1) items are represented by integers
    # 2) features/items are lexicographically ordered (e.g. 1, 10, 2, 3, ...)
    # 3) items are separated by spaces
    translated_data = translateDataOriginalToSPMF(
        bics_features_values[experience_name],
        translation_map_original_spmf)

    # Export the translated data, one space-separated transaction per row.
    with open(output_folder / processed_data_file_name, "w") as outFile:
        for feature_list in translated_data:
            outFile.write(" ".join(feature_list) + "\n")