def start_processing():
    """Run the baseline feature-selection experiment on the original data set.

    Loads the csv data (with missing values), performs a stratified 75/25
    train/test split, fits a Random Forest, reports hold-out and k-fold CV
    metrics, and writes an MLxtend permutation-importance plot for all
    features to the output folder.

    NOTE(review): relies on module-level globals (output_folder, data_folder,
    dataset_file_name, n_estimators, k_fold_times, mlxtend_features,
    number_perm_runs, width_perm_imp_plot) — confirm they are defined before
    this is called.
    """
    # Make sure the output folder exists before any file is written.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    source_path = data_folder / dataset_file_name

    # X matrix (values), y column (target class) and feature names from csv.
    X, y, feature_names = load_original_data_with_missings(source_path)

    # Stratified 75% / 25% split, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0, stratify=y)
    printShapes(X_train, X_test, y_train, y_test)

    # Fit the Random Forest classifier on the training partition.
    model = fitRFClassifier(X_train, y_train, n_estimators)

    # Hold-out metrics, then stratified k-fold cross-validation metrics.
    getClassificationMetrics(model, X_train, X_test, y_train, y_test)
    getClassificationMetricsCV(model, k_fold_times, X, y)

    # Output image name for the permutation-importance plot (MLxtend).
    importance_plot_name = mlxtend_features + "_baseline_fs"

    # Permutation-importance values for ALL features (no top-N cutoff here).
    getPermutationImportanceMLxtend(
        number_perm_runs, model, X_test, y_test, feature_names,
        width_perm_imp_plot, output_folder, importance_plot_name)
def classification(df, n_estimators, k_fold_times, n_features, experience_name, data_folder, output_folder):
    """Classify the merged (bicluster meta-feature) data set and export patterns.

    Builds X/y from ``df`` (first column = subject id, last column = target
    class, middle columns = integer features), runs a stratified 75/25
    train/test split, fits a Random Forest, reports hold-out and k-fold CV
    metrics, plots the top-``n_features`` permutation importances (MLxtend),
    and writes the most-frequent pattern of each important bicluster to a
    tsv/txt file in ``output_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; columns are [subject_id, feature_1..feature_k, target].
    n_estimators : int
        Number of trees for the Random Forest.
    k_fold_times : int
        Number of folds for stratified cross-validation.
    n_features : int
        How many top features to keep in the permutation-importance plot.
    experience_name : str
        Experiment identifier used to locate translated_labels files.
    data_folder, output_folder : path-like
        Input data location and destination for generated files.
    """
    # Convert the data frame to a raw matrix once; derive y and X from it.
    values = df.values
    # y: last column is the target class, cast to int in one step.
    y = values[:, -1].astype(int)
    # X: drop first (subject id) and last (target) columns.
    # Idiomatic slice replaces the original list(range(1, len(df.columns)-1))
    # fancy-indexing — same columns, but a view instead of an index-list copy.
    X = values[:, 1:-1].astype(int)
    # Feature names, also dropping the first and last columns.
    feature_names = list(df)[1:-1]

    # Stratified 75% / 25% split, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0, stratify=y)
    printShapes(X_train, X_test, y_train, y_test)

    # Fit the Random Forest classifier.
    rf = fitRFClassifier(X_train, y_train, n_estimators)

    # Hold-out metrics, then stratified k-fold cross-validation metrics.
    getClassificationMetrics(rf, X_train, X_test, y_train, y_test)
    getClassificationMetricsCV(rf, k_fold_times, X, y)

    # Output image name for the permutation-importance plot (MLxtend).
    mlxtend_features_file_name = mlxtend_features + "_merged_data"

    # Permutation-importance values for the top-n_features features.
    most_important_features_perm = getPermutationImportanceMLxtend(
        number_perm_runs, rf, X_test, y_test,
        feature_names, width_perm_imp_plot,
        output_folder, mlxtend_features_file_name, n_features)

    # Strip the "Bic_" prefix (and any zero padding) to get bicluster ids;
    # non-bicluster features are dropped.
    most_important_features_perm = [
        str(int(x.split("_")[1]))
        for x in most_important_features_perm
        if x.startswith("Bic_")
    ]

    # Single-experience list in the shape the helper functions expect.
    exp_list = [[experience_name, most_important_features_perm]]

    # Locate the translated_labels files holding the bicluster contents.
    list_bic_file_names = getTranslatedBiclusterFileNames(data_folder, exp_list)

    # Bicluster text contents per experience.
    bics_exp = getBiclusterContentsFlatList(exp_list, list_bic_file_names, False)

    # Replace each bicluster's contents with its most frequent pattern.
    bics_exp = getBiclustersMostFreqPatterns(bics_exp)

    # Export the most important feature (bicluster) patterns to a tsv file.
    writeBiclustersPatternsOutput(
        bics_exp, output_folder, mlxtend_features_file_name + ".txt")