Example #1
import os

from sklearn.model_selection import train_test_split


def start_processing():

    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    data_source = data_folder / dataset_file_name
    # Read the X matrix (values), y column (target class) and feature names from the CSV file
    X, y, feature_names = load_original_data_with_missings(data_source)
    # Split data set into training and test set (75% and 25%) with stratification
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=y)
    printShapes(X_train, X_test, y_train, y_test)

    # Fit RF classifier
    rf = fitRFClassifier(X_train, y_train, n_estimators)
    # Get classification metrics without cross validation
    getClassificationMetrics(rf, X_train, X_test, y_train, y_test)
    # Get classification metrics with stratified k-fold cross validation
    getClassificationMetricsCV(rf, k_fold_times, X, y)

    # Output image file name for the most important features (Random Forest permutation importance, MLxtend library)
    mlxtend_features_file_name = mlxtend_features + "_baseline_fs"

    # Get permutation importance values for all features
    getPermutationImportanceMLxtend(number_perm_runs, rf, X_test, y_test, feature_names, width_perm_imp_plot,
                                    output_folder, mlxtend_features_file_name)
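
The helper functions used in this example (fitRFClassifier, getClassificationMetrics, getClassificationMetricsCV) are not shown here. Below is a minimal sketch of how they could be implemented with scikit-learn; the metric choices (accuracy and macro-F1 on the hold-out splits, stratified k-fold accuracy for cross-validation) and the fixed random_state are assumptions, not taken from the original code.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score


def fitRFClassifier(X_train, y_train, n_estimators):
    # Fit a Random Forest with a fixed seed so results are reproducible (assumed)
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)
    rf.fit(X_train, y_train)
    return rf


def getClassificationMetrics(rf, X_train, X_test, y_train, y_test):
    # Hold-out metrics on the training and test splits (assumed: accuracy and macro-F1)
    for split_name, X_split, y_split in (("train", X_train, y_train),
                                         ("test", X_test, y_test)):
        y_pred = rf.predict(X_split)
        print(f"{split_name}: accuracy={accuracy_score(y_split, y_pred):.3f}, "
              f"macro-F1={f1_score(y_split, y_pred, average='macro'):.3f}")


def getClassificationMetricsCV(rf, k_fold_times, X, y):
    # Stratified k-fold cross-validated accuracy on the full data set
    cv = StratifiedKFold(n_splits=k_fold_times, shuffle=True, random_state=0)
    scores = cross_val_score(rf, X, y, cv=cv, scoring="accuracy")
    print(f"{k_fold_times}-fold CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")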
Example #2
from sklearn.model_selection import train_test_split


def classification(df, n_estimators, k_fold_times, n_features, experience_name,
                   data_folder, output_folder):
    # Read the X matrix (values), y column (target class) and feature names from the dataframe
    # Convert the dataframe to a matrix X
    X = df.values
    # y: last column of the matrix, the target class column
    y = X[:, -1]
    # Cast all y elements to int
    y = y.astype(int)
    # Remove the first (subject id) and last (target class) columns of the data matrix
    # to keep only the feature values
    X = X[:, 1:-1]
    # Cast all X elements to int
    X = X.astype(int)
    # Get the list of feature names (also dropping the first and last columns)
    feature_names = list(df.columns)[1:-1]
    # Split data set into training and test set (75% and 25%) with stratification
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=y)
    printShapes(X_train, X_test, y_train, y_test)

    # Fit RF classifier
    rf = fitRFClassifier(X_train, y_train, n_estimators)
    # Get classification metrics without cross validation
    getClassificationMetrics(rf, X_train, X_test, y_train, y_test)
    # Get classification metrics with stratified k-fold cross validation
    getClassificationMetricsCV(rf, k_fold_times, X, y)

    # Output image file name for the most important features (Random Forest permutation importance, MLxtend library)
    mlxtend_features_file_name = mlxtend_features + "_merged_data"

    # Get permutation importance values for the top n_features features
    most_important_features_perm = getPermutationImportanceMLxtend(number_perm_runs, rf, X_test, y_test,
                                                                   feature_names, width_perm_imp_plot,
                                                                   output_folder, mlxtend_features_file_name,
                                                                   n_features)

    # Get the patterns of the most important biclusters into a separate file
    # Strip the "Bic_" prefix (and any leading zeros) from the feature names to get the bicluster ids
    most_important_features_perm = [
        str(int(x.split("_")[1])) for x in most_important_features_perm
        if x.startswith("Bic_")
    ]
    # Build a single-entry exp_list pairing the experience name with its bicluster ids
    exp_list = [[experience_name, most_important_features_perm]]
    # Get purest biclusters by experience (from translated_labels files)
    list_bic_file_names = getTranslatedBiclusterFileNames(
        data_folder, exp_list)
    # Get Bicluster text contents per experience
    bics_exp = getBiclusterContentsFlatList(exp_list, list_bic_file_names,
                                            False)
    # Replace the bicluster contents in the dictionary with the most frequent
    # pattern for each bicluster
    bics_exp = getBiclustersMostFreqPatterns(bics_exp)
    # Export the patterns of the N most important features (biclusters/meta-features) to a tab-separated file
    writeBiclustersPatternsOutput(bics_exp, output_folder,
                                  mlxtend_features_file_name + ".txt")
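
A minimal usage sketch for classification(), assuming the merged data set is a tab-separated file whose first column is the subject id and whose last column is the target class. The file name, folder paths and hyperparameter values below are placeholders chosen for illustration, and the module-level settings the function relies on (mlxtend_features, number_perm_runs, width_perm_imp_plot) are assumed to be defined elsewhere.

from pathlib import Path

import pandas as pd

# Hypothetical paths and hyperparameters, for illustration only
data_folder = Path("data")
output_folder = Path("output")
merged_file = data_folder / "merged_dataset.tsv"  # placeholder file name

df = pd.read_csv(merged_file, sep="\t")
classification(df,
               n_estimators=100,         # number of trees in the Random Forest
               k_fold_times=10,          # folds for stratified cross-validation
               n_features=30,            # how many top features to report
               experience_name="exp_1",  # placeholder experience id
               data_folder=data_folder,
               output_folder=output_folder)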