Пример #1
0
def preprocess(df):
    """
    Build gender-wise standardised training and testing arrays from ``df``.

    :param df: data frame which contains mean activation values in 116 brain areas; it must have a ``gender``
               column (1 = male, 2 = female) and may have an ``age`` column
    :return: ``x_train, y_train, x_test, y_test`` with the male rows stacked before the female rows. Each test
             subset is standardised with the mean and variance learnt from the training subset of the same gender.
    """
    male_code, female_code = 1, 2
    meta_columns = ['gender', 'age']

    def _features_labels_for(frame, gender_code):
        # Keep the rows of one gender, then strip the columns that are not brain-region activations
        # (a missing column is tolerated because of errors='ignore').
        subset = frame.loc[frame["gender"] == gender_code]
        subset = subset.drop(meta_columns, axis=1, errors='ignore')
        return mlu.get_features_labels(subset)

    # The split ratio is decided inside mlu.train_test_split (the original comment states 80% / 20%).
    train, test = mlu.train_test_split(df)

    # Feature / label arrays for every (split, gender) combination.
    x_train_male, y_train_male = _features_labels_for(train, male_code)
    x_train_female, y_train_female = _features_labels_for(train, female_code)
    x_test_male, y_test_male = _features_labels_for(test, male_code)
    x_test_female, y_test_female = _features_labels_for(test, female_code)

    # One scaler per gender, fitted on that gender's training data only ...
    train_sets = ((x_train_male, y_train_male), (x_train_female, y_train_female))
    scalers = [StandardScaler() for _ in train_sets]
    scaled_train = [
        scaler.fit_transform(features, labels)
        for scaler, (features, labels) in zip(scalers, train_sets)
    ]

    # ... and then re-used to standardise the test data of the same gender.
    scaled_test = [
        scaler.transform(features)
        for scaler, features in zip(scalers, (x_test_male, x_test_female))
    ]

    # Stack male first, female second, for both features and labels.
    x_train = np.concatenate(scaled_train)
    y_train = np.concatenate((y_train_male, y_train_female))
    x_test = np.concatenate(scaled_test)
    y_test = np.concatenate((y_test_male, y_test_female))

    return x_train, y_train, x_test, y_test
Пример #2
0
def run_perm_test(df, contrast_name, classifier_no, out, n_iterations):
    """
    Train and score several classifiers repeatedly, then run a permutation test in the last round.

    In each of the ``n_iterations`` rounds the data is split and pre-processed by
    ``mlu.preprocess_remove_gender``; every model is fitted on the training part and scored with
    ``mlu.balanced_accuracy`` on the test part. After each model a result table is written to
    ``<out>permutation_result_<contrast_name>_<classifier_no>.csv``. In the final round
    ``mlu.permutation_test`` is additionally run for each model on the full data and its permutation
    scores are saved to ``<out><first and last character of contrast_name>_<classifier_no>_<model>.csv``.

    The function also reads the module-level ``options.input`` (it is not a parameter).

    :param df: data frame handed to ``mlu.get_features_labels`` and ``mlu.preprocess_remove_gender``
    :param contrast_name: non-empty string naming the contrast; used in result rows and file names
    :param classifier_no: classifier identifier; used in result rows and file names
    :param out: path prefix (string) that the output file names are appended to
    :param n_iterations: number of split / train / evaluate rounds
    :return: None
    """
    models = ["svm_kernel_default", "svm_kernel_tuned", "rfc", 'logistic_regression']
    X, y = mlu.get_features_labels(df)
    result_file = out + "permutation_result_%s_%s.csv" % (contrast_name, classifier_no)

    # The grid search only ever sees X, y and a fixed grid, none of which change inside the loop,
    # so it is run at most once (lazily) and its best parameters are re-used by later rounds.
    best_param = None

    for i in range(n_iterations):
        x_train, y_train, x_test, y_test = mlu.preprocess_remove_gender(df)
        for model_name in models:
            if model_name == "svm_kernel_default":
                model = svm.SVC(kernel='rbf', C=4, gamma=2 ** -5)
            elif model_name == "svm_kernel_tuned":
                if best_param is None:
                    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 2 ** -5, 2 ** -10, 2 ** 5],
                                  'kernel': ['rbf']}
                    # NOTE(review): `iid` is not accepted by recent scikit-learn releases - confirm the
                    # pinned version before upgrading. The search is fitted on the full data set (X, y),
                    # which includes rows that later end up in the test split.
                    grid = GridSearchCV(svm.SVC(), param_grid, refit=True, cv=10, iid=False)
                    grid.fit(X, y)
                    best_param = grid.best_params_
                model = svm.SVC(kernel=best_param['kernel'], C=best_param['C'], gamma=best_param['gamma'])
            elif model_name == "rfc":
                model = RandomForestClassifier(n_estimators=200)
            elif model_name == "logistic_regression":
                model = LogisticRegression(solver="liblinear", multi_class='auto')

            trained_model = model.fit(x_train, y_train)
            score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)

            # NOTE(review): previous results are read from options.input but the table is written to
            # result_file; unless both point at the same file, rows of earlier rounds are not carried
            # over - verify against the caller.
            if os.path.isfile(options.input):
                df_res = pd.read_csv(options.input)
            else:
                df_res = pd.DataFrame(
                    columns=['contrast', 'class', 'Model', 'original_accuracy'])

            # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
            new_row = pd.DataFrame([{'contrast': contrast_name, 'class': classifier_no,
                                     'Model': model_name, 'original_accuracy': score}])
            df_res = pd.concat([df_res, new_row], ignore_index=True)
            df_res.to_csv(result_file, index=False)

            # Only in the last iteration is the permutation test run (the original comment states
            # 10000 permutations); its scores, together with the mean non-permuted accuracy over
            # n_iterations, allow a p-value to be calculated afterwards.
            if i == n_iterations - 1:
                _, permutation_scores, _ = mlu.permutation_test(X, y, model, 10000, 10)
                # %-formatting (instead of "+") also accepts a non-string classifier_no.
                performance_file = "%s%s_%s_%s" % (contrast_name[0], contrast_name[-1],
                                                   classifier_no, model_name)
                np.savetxt(out + "%s.csv" % performance_file, permutation_scores, fmt="%10.18f")
Пример #3
0
def main():
    """
    Fit an RBF-kernel SVM on the "Faces_con_0001.mat" contrast, tune it and plot the test scores.

    The data is loaded through ``tools.data_extraction`` from ``../Data``, missing values are replaced
    by the column means, and a default SVM is trained and reported (confusion matrix and
    classification report are printed). A grid search over ``C`` and ``gamma`` follows; the best
    parameters are printed, reported the same way, and used to train a final model whose train and
    test accuracy are collected in a score table. Finally the test scores are drawn as box plots on
    a 2x2 grid of axes. The figure is neither shown nor saved inside this function.

    :return: None
    """
    data_dir = "../Data"  # renamed from `input` so the builtin is not shadowed
    df, contrast_name = tools.data_extraction(data_dir, 3, "Faces_con_0001.mat")
    df.fillna(df.mean(), inplace=True)

    scoresdf = pd.DataFrame(columns=['Score', 'Type', 'Model', 'Classifier'])

    for _ in range(1):
        train, test = mlu.train_test_split(df)
        X, y = mlu.get_features_labels(train)
        tX, ty = mlu.get_features_labels(test)

        # Baseline: SVM with fixed hyper-parameters.
        model = svm.SVC(kernel='rbf', C=4, gamma=2**-5)
        model.fit(X, y)
        predictions = model.predict(tX)
        print(len(ty))
        print(confusion_matrix(ty, predictions))
        print(classification_report(ty, predictions))

        # Hyper-parameter tuning with 10-fold cross-validation on the training data.
        param_grid = {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma':
            [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 2**-5, 2**-10, 2**5],
            'kernel': ['rbf']
        }
        grid = GridSearchCV(svm.SVC(),
                            param_grid,
                            refit=True,
                            verbose=3,
                            cv=10)
        grid.fit(X, y)
        best_param = grid.best_params_
        print(best_param)
        grid_predictions = grid.predict(tX)
        print(confusion_matrix(ty, grid_predictions))
        print(classification_report(ty, grid_predictions))

        # Scores after hyper-parameter tuning.
        model = svm.SVC(kernel=best_param['kernel'],
                        C=best_param['C'],
                        gamma=best_param['gamma'])
        model.fit(X, y)
        train_score = model.score(X, y)
        test_score = model.score(tX, ty)

        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
        # 'Contrast_name' is not among the initial columns; it is added by the concatenation.
        new_rows = pd.DataFrame([
            {
                'Score': score,
                'Type': score_type,
                'Model': 'svm_kernel',
                'Classifier': 123,
                'Contrast_name': contrast_name
            }
            for score_type, score in (('train', train_score), ('test', test_score))
        ])
        scoresdf = pd.concat([scoresdf, new_rows], ignore_index=True)

    _fig, axes = plt.subplots(nrows=2, ncols=2)
    # The selection does not depend on the axis, so it is computed once; the same box plot is
    # drawn on each of the four axes.
    test_scores = scoresdf[(scoresdf['Type'] == 'test')
                           & (scoresdf['Model'] == 'svm_kernel')]
    for ax in axes.ravel():
        sns.boxplot(x='Model', y='Score', data=test_scores, ax=ax)
Пример #4
0
def run_basic_ml(df, options, n, scoresdf, contrast_name):
    """
    Train and evaluate one or all basic models several times and collect their scores.

    For each of ``options.number_iterations`` rounds the data is split with ``mlu.train_test_split``
    and each selected model is fitted through ``mlu.model_fitting``. A 'train' and a 'test' row
    (accuracy plus balanced accuracy) are added to the score table per model and round.

    :param df: data frame that is split into training and testing data
    :param options: object providing ``number_iterations``, ``model`` ("all" or a single model name),
                    ``kFold`` and ``normalize``
    :param n: classifier identifier stored in the 'Classifier' column
    :param scoresdf: score table the new rows are appended to (the passed object is not modified)
    :param contrast_name: contrast name; printed and stored in the 'Contrast_name' column
    :return: a new data frame consisting of ``scoresdf`` followed by the new rows
    """
    # Function-scope import: the block needs pandas.concat, and DataFrame.append no longer exists
    # in pandas 2.x.
    import pandas as pd

    print(contrast_name)
    models = [
        "svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
        "decision_tree", "rfc", 'logistic_regression'
    ]

    def _with_score_rows(frame, model_name, train_scores, test_scores):
        # Return `frame` plus one 'train' and one 'test' row; each *_scores is (accuracy, balanced accuracy).
        rows = [
            {
                'Score': score,
                'Type': score_type,
                'Model': model_name,
                'Classifier': n,
                'Contrast_name': contrast_name,
                'Balanced_accuracy': balanced_score
            }
            for score_type, (score, balanced_score) in (('train', train_scores),
                                                        ('test', test_scores))
        ]
        return pd.concat([frame, pd.DataFrame(rows)], ignore_index=True)

    for _ in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)

        # "all" runs every model with the user's normalisation choice; a single model is always
        # fitted with normalisation switched on (as in the original call).
        if options.model == "all":
            runs = [(model_name, options.normalize) for model_name in models]
        else:
            runs = [(options.model, True)]

        for model_name, normalize in runs:
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                model_name, x_train, y_train, options.kFold, normalize)

            # Scale into a fresh variable for every model: overwriting x_test would feed already
            # scaled data into the next model's scaler. The test data has to go through the same
            # scaler the model was trained with.
            x_eval = min_max_scaler.transform(x_test) if normalize else x_test

            test_score = trained_model.score(x_eval, y_test)
            test_balanced_score = mlu.balanced_accuracy(
                trained_model.predict(x_eval), y_test)

            scoresdf = _with_score_rows(scoresdf, model_name,
                                        (train_score, train_balanced_score),
                                        (test_score, test_balanced_score))

    return scoresdf
Пример #5
0
def run_gender_cor(df, options, n, scoresdf, contrast_name, label):
    """
    Predict gender (classification) or age (regression) from the data and collect the scores.

    The column named by ``label`` becomes the target: the existing 'label' column and the other
    demographic column are dropped and ``label`` is renamed to 'label'. For each of
    ``options.number_iterations`` rounds the data is split, the selected model(s) are fitted through
    ``mlu.model_fitting`` and a 'train' and a 'test' row are added to the score table. For
    classification the features are standardised first and the 'Balanced_accuracy' column of the
    test row holds the balanced accuracy; for regression with ``options.model == 'all'`` it holds the
    mean squared error instead.

    :param df: data frame with 'label', 'gender' and 'age' columns; it is not modified
    :param options: object providing ``number_iterations``, ``model`` ('all' or a single model name),
                    ``kFold`` and ``normalize``
    :param n: classifier identifier stored in the 'Classifier' column
    :param scoresdf: score table the new rows are appended to (the passed object is not modified)
    :param contrast_name: contrast name stored in the 'Contrast_name' column
    :param label: 'gender' or 'age' - the column to predict
    :return: a new data frame consisting of ``scoresdf`` followed by the new rows
    :raises ValueError: if ``options.model`` is 'all' and ``label`` is neither 'gender' nor 'age'
    """
    # Function-scope import: the block needs pandas.concat, and DataFrame.append no longer exists
    # in pandas 2.x.
    import pandas as pd

    classification = True
    models = None
    # Drop into a new frame instead of in place, so the caller's data frame stays intact and the
    # function can be called again on the same data.
    if label == 'gender':
        df = df.drop(['label', 'age'], axis=1)
        models = ["svm_kernel_default", "svm_kernel_tuned", "naive_bayes", "decision_tree", "rfc",
                  'logistic_regression']
    elif label == 'age':
        df = df.drop(['label', 'gender'], axis=1)
        models = ['svr_kernel_default', 'svr_kernel_tuned', 'gpr_default']
        classification = False

    if options.model == 'all' and models is None:
        raise ValueError("label must be 'gender' or 'age' when options.model is 'all', got %r" % (label,))

    df = df.rename(columns={label: 'label'})

    def _with_score_rows(frame, model_name, train_scores, test_scores):
        # Return `frame` plus one 'train' and one 'test' row; each *_scores is (score, balanced score).
        rows = [
            {'Score': score, 'Type': score_type, 'Model': model_name, 'Classifier': n,
             'Contrast_name': contrast_name, 'Balanced_accuracy': balanced_score}
            for score_type, (score, balanced_score) in (('train', train_scores), ('test', test_scores))
        ]
        return pd.concat([frame, pd.DataFrame(rows)], ignore_index=True)

    for _ in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)
        if classification:
            # Standardise with statistics of the training data only.
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train, y_train)
            x_test = scaler.transform(x_test)

        if options.model == 'all':
            for model_name in models:
                train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                    model_name, x_train, y_train, options.kFold, options.normalize)

                # Scale into a fresh variable for every model: overwriting x_test would feed already
                # scaled data into the next model's scaler.
                x_eval = min_max_scaler.transform(x_test) if options.normalize else x_test

                test_score = trained_model.score(x_eval, y_test)

                if classification:
                    test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_eval), y_test)
                else:
                    if model_name == "gpr_default":
                        # The standard deviation is requested as in the original call but not used.
                        pred, _ = trained_model.predict(x_eval, return_std=True)
                    else:
                        pred = trained_model.predict(x_eval)
                    # For regression the mean squared error is stored in the balanced-accuracy column.
                    test_balanced_score = mean_squared_error(y_test, pred, multioutput='raw_values')

                scoresdf = _with_score_rows(scoresdf, model_name,
                                            (train_score, train_balanced_score),
                                            (test_score, test_balanced_score))
        else:
            # NOTE(review): kFold / normalize are left to the defaults of mlu.model_fitting here, and
            # balanced accuracy is used even when label == 'age' - confirm this is intended.
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                options.model, x_train, y_train)
            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)
            scoresdf = _with_score_rows(scoresdf, options.model,
                                        (train_score, train_balanced_score),
                                        (test_score, test_balanced_score))

    return scoresdf