# Example 1
def run_no_gender_ml(df, options, n, scoresdf, contrast_name):
    """
    Run the classifier suite on gender-corrected data and record scores.

    :param df: Dataframe containing mean activation values in 116 brain areas
    :param options: info passed via command argument which contains details like file paths, etc
    :param n: represents classes considered. 123- for all, 12- Bipolar&schizo, 23- Schizo&Control, 31- Control&Bipolar
    :param scoresdf: Results dataframe containing scores
    :param contrast_name: Contrast Name
    :return: Results dataframe containing scores after elimination of gender effect on the data
    """
    print(contrast_name)
    models = [
        "svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
        "decision_tree", "rfc", 'logistic_regression'
    ]

    # Collect rows in a plain list and concatenate once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and appending row-by-row copies the frame on every call (O(n^2)).
    rows = []
    for _ in range(options.number_iterations):
        # mlu.preprocess_remove_gender is expected to return a gender-corrected
        # train/test split of the feature matrix and labels.
        x_train, y_train, x_test, y_test = mlu.preprocess_remove_gender(df)

        for model_name in models:
            # 10-fold CV, normalization off — matches the original hard-coded args.
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                model_name, x_train, y_train, 10, False)

            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(
                trained_model.predict(x_test), y_test)

            rows.append({'Score': train_score, 'Type': 'train',
                         'Model': model_name, 'Classifier': n,
                         'Contrast_name': contrast_name,
                         'Balanced_accuracy': train_balanced_score})
            rows.append({'Score': test_score, 'Type': 'test',
                         'Model': model_name, 'Classifier': n,
                         'Contrast_name': contrast_name,
                         'Balanced_accuracy': test_balanced_score})

    return pd.concat([scoresdf, pd.DataFrame(rows)], ignore_index=True)
# Example 2
def run_perm_test(df, contrast_name, classifier_no, out, n_iterations):
    """
    Train several classifiers for n_iterations on gender-corrected splits,
    accumulating balanced accuracies into a CSV; on the final iteration run a
    10000-round permutation test per model so a p-value can later be computed
    from the permutation scores and the mean non-permuted accuracy.

    :param df: Dataframe with features and labels
    :param contrast_name: Contrast Name
    :param classifier_no: class-pair identifier (e.g. 12, 23, 31, 123)
    :param out: output directory/prefix for the result CSVs
    :param n_iterations: number of train/evaluate repetitions
    """
    models = ["svm_kernel_default", "svm_kernel_tuned", "rfc", 'logistic_regression']
    X, y = mlu.get_features_labels(df)

    # The accumulating results file. The original read a global `options.input`
    # (not a parameter of this function) while always writing this path, so
    # results never round-tripped; read back the same file we write.
    result_path = out + "permutation_result_%s_%s.csv" % (contrast_name, classifier_no)

    for i in range(n_iterations):
        # Gender-corrected train/test split for this iteration.
        x_train, y_train, x_test, y_test = mlu.preprocess_remove_gender(df)
        for model_name in models:
            if model_name == "svm_kernel_default":
                model = svm.SVC(kernel='rbf', C=4, gamma=2 ** -5)
            elif model_name == "svm_kernel_tuned":
                # NOTE(review): grid search is fitted on the FULL data (X, y),
                # not the training split — kept as in the original, but this
                # leaks test data into hyperparameter selection; confirm intent.
                # NOTE(review): `iid` was removed in scikit-learn 0.24; kept for
                # compatibility with the pinned version this project uses.
                param_grid = {'C': [0.1, 1, 10, 100, 1000],
                              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 2 ** -5, 2 ** -10, 2 ** 5], 'kernel': ['rbf']}
                grid = GridSearchCV(svm.SVC(), param_grid, refit=True, cv=10, iid=False)
                grid.fit(X, y)
                best_param = grid.best_params_
                model = svm.SVC(kernel=best_param['kernel'], C=best_param['C'], gamma=best_param['gamma'])
            elif model_name == "rfc":
                model = RandomForestClassifier(n_estimators=200)
            else:
                # "logistic_regression" — guarded `else` so `model` can never
                # be unbound if the model list changes.
                model = LogisticRegression(solver="liblinear", multi_class='auto')

            trained_model = model.fit(x_train, y_train)
            balanced_acc = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)

            if os.path.isfile(result_path):
                df_res = pd.read_csv(result_path)
            else:
                df_res = pd.DataFrame(
                    columns=['contrast', 'class', 'Model', 'original_accuracy'])

            # DataFrame.append was removed in pandas 2.0 — use concat.
            new_row = pd.DataFrame([{'contrast': contrast_name, 'class': classifier_no,
                                     'Model': model_name, 'original_accuracy': balanced_acc}])
            df_res = pd.concat([df_res, new_row], ignore_index=True)
            df_res.to_csv(result_path, index=False)

            # Only at the last iteration: run the permutation test 10000 times;
            # with these scores and the mean non-permuted accuracy of
            # n_iterations runs, a p-value can be calculated downstream.
            if i == n_iterations - 1:
                # Renamed first return value so it does not shadow the
                # per-iteration accuracy above.
                perm_base_score, permutation_scores, p_value = mlu.permutation_test(X, y, model, 10000, 10)
                performance_file = contrast_name[0] + contrast_name[-1] + "_" + classifier_no + "_" + model_name
                np.savetxt(out + "%s.csv" % performance_file, permutation_scores, fmt="%10.18f")
# Example 3
def run_basic_ml(df, options, n, scoresdf, contrast_name):
    """
    Train/evaluate classifiers on plain (non-gender-corrected) splits and
    record train/test scores.

    :param df: Dataframe containing features and labels
    :param options: command-line options (number_iterations, model, kFold, normalize)
    :param n: class-pair identifier. 123- all, 12- Bipolar&Schizo, 23- Schizo&Control, 31- Control&Bipolar
    :param scoresdf: Results dataframe that scores are appended to
    :param contrast_name: Contrast Name
    :return: Results dataframe containing the accumulated scores
    """
    print(contrast_name)
    models = [
        "svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
        "decision_tree", "rfc", 'logistic_regression'
    ]

    # Collected rows, concatenated once at the end (DataFrame.append is
    # removed in pandas 2.0 and row-wise append is O(n^2)).
    rows = []

    def _record(score, split_type, model_name, balanced):
        # One result row per (model, split) pair.
        rows.append({'Score': score, 'Type': split_type, 'Model': model_name,
                     'Classifier': n, 'Contrast_name': contrast_name,
                     'Balanced_accuracy': balanced})

    for _ in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)

        if options.model == "all":
            for model_name in models:
                train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                    model_name, x_train, y_train, options.kFold,
                    options.normalize)

                # BUGFIX: the original rebound x_test to the scaled array, so
                # every subsequent model in this loop re-scaled already-scaled
                # data. Scale into a separate variable instead.
                x_eval = min_max_scaler.transform(x_test) if options.normalize else x_test

                test_score = trained_model.score(x_eval, y_test)
                test_balanced_score = mlu.balanced_accuracy(
                    trained_model.predict(x_eval), y_test)

                _record(train_score, 'train', model_name, train_balanced_score)
                _record(test_score, 'test', model_name, test_balanced_score)

        else:
            # Single named model; the original hard-coded normalize=True here.
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                options.model, x_train, y_train, options.kFold, True)

            # BUGFIX: the model is trained on normalized features, so the test
            # features must be transformed too (the original scored raw x_test).
            x_eval = min_max_scaler.transform(x_test)

            test_score = trained_model.score(x_eval, y_test)
            # BUGFIX: test_balanced_score was referenced below but never
            # computed in this branch (NameError on the first iteration).
            test_balanced_score = mlu.balanced_accuracy(
                trained_model.predict(x_eval), y_test)

            _record(train_score, 'train', options.model, train_balanced_score)
            _record(test_score, 'test', options.model, test_balanced_score)

    return pd.concat([scoresdf, pd.DataFrame(rows)], ignore_index=True)
# Example 4
def run_gender_cor(df, options, n, scoresdf, contrast_name, label):
    """
    Quantify how well gender (classification) or age (regression) can be
    predicted from the brain-activation features — i.e. measure the confound's
    correlation with the data.

    :param df: Dataframe with features plus 'label', 'gender' and 'age' columns
    :param options: command-line options (number_iterations, model, kFold, normalize)
    :param n: class-pair identifier
    :param scoresdf: Results dataframe that scores are appended to
    :param contrast_name: Contrast Name
    :param label: confound to predict — 'gender' or 'age'
    :return: Results dataframe containing the accumulated scores
    """
    classification = True
    if label == 'gender':
        # Predict gender: drop the diagnosis label and the other confound.
        df.drop(['label', 'age'], axis=1, inplace=True)
        models = ["svm_kernel_default", "svm_kernel_tuned", "naive_bayes", "decision_tree", "rfc",
                  'logistic_regression']
    elif label == 'age':
        # Predict age: regression models.
        # NOTE(review): the original first assigned
        # ['linear_reg', 'lasso', 'polynomial_reg'] and immediately overwrote
        # it; the dead assignment is removed here.
        df.drop(['label', 'gender'], axis=1, inplace=True)
        models = ['svr_kernel_default', 'svr_kernel_tuned', 'gpr_default']
        classification = False

    # The confound column becomes the prediction target.
    df = df.rename(columns={label: 'label'})

    # Collected rows, concatenated once (DataFrame.append removed in pandas 2.0).
    rows = []

    for _ in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)
        if classification:
            # Standardize features; fit on train only, apply to test.
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train, y_train)
            x_test = scaler.transform(x_test)

        if options.model == 'all':
            for model_name in models:
                train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                    model_name, x_train, y_train, options.kFold, options.normalize)

                # BUGFIX: the original rebound x_test to the min-max-scaled
                # array, so later models in this loop re-scaled already-scaled
                # data. Use a per-model evaluation matrix instead.
                x_eval = min_max_scaler.transform(x_test) if options.normalize else x_test

                test_score = trained_model.score(x_eval, y_test)
                test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_eval), y_test)

                if not classification:
                    # For regression, replace balanced accuracy with MSE.
                    if model_name == "gpr_default":
                        # GPR can also return the predictive std; sigma is unused
                        # here but kept as in the original call signature.
                        pred, sigma = trained_model.predict(x_eval, return_std=True)
                    else:
                        pred = trained_model.predict(x_eval)
                    test_balanced_score = mean_squared_error(y_test, pred, multioutput='raw_values')

                rows.append({'Score': train_score, 'Type': 'train', 'Model': model_name, 'Classifier': n,
                             'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score})
                rows.append({'Score': test_score, 'Type': 'test', 'Model': model_name, 'Classifier': n,
                             'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score})
        else:
            # NOTE(review): unlike every other call site, this passes neither
            # kFold nor normalize — it relies on mlu.model_fitting defaults;
            # confirm those defaults match the intended CV/normalization.
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                options.model, x_train, y_train)
            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)
            rows.append({'Score': train_score, 'Type': 'train', 'Model': options.model, 'Classifier': n,
                         'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score})
            rows.append({'Score': test_score, 'Type': 'test', 'Model': options.model, 'Classifier': n,
                         'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score})

    return pd.concat([scoresdf, pd.DataFrame(rows)], ignore_index=True)