import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import mlu  # project-local ML utilities (train/test split, model fitting, metrics)


def run_no_gender_ml(df, options, n, scoresdf, contrast_name):
    """
    :param df: dataframe containing mean activation values in 116 brain areas
    :param options: info passed via command-line arguments (file paths, iteration count, etc.)
    :param n: classes considered: 123 - all, 12 - Bipolar & Schizo, 23 - Schizo & Control, 31 - Control & Bipolar
    :param scoresdf: results dataframe that scores are appended to
    :param contrast_name: contrast name
    :return: results dataframe containing scores after elimination of the gender effect on the data
    """
    print(contrast_name)
    models = ["svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
              "decision_tree", "rfc", "logistic_regression"]

    for i in range(options.number_iterations):
        # Remove the gender effect from the features, then split into train/test.
        x_train, y_train, x_test, y_test = mlu.preprocess_remove_gender(df)
        for model_name in models:
            # Fixed 10-fold CV and no min-max normalization for this battery.
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                model_name, x_train, y_train, 10, False)
            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)
            scoresdf = scoresdf.append(
                {'Score': train_score, 'Type': 'train', 'Model': model_name, 'Classifier': n,
                 'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score},
                ignore_index=True)
            scoresdf = scoresdf.append(
                {'Score': test_score, 'Type': 'test', 'Model': model_name, 'Classifier': n,
                 'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score},
                ignore_index=True)
    return scoresdf
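
# For context: a minimal, hypothetical driver for run_no_gender_ml. The CSV path and
# the fields on the options namespace are illustrative assumptions, not part of the
# original pipeline; the scoresdf columns mirror the fields appended above.
def _example_run_no_gender_ml():
    from argparse import Namespace
    options = Namespace(number_iterations=10)      # assumed minimal options object
    df = pd.read_csv("contrast_activations.csv")   # hypothetical input file
    scoresdf = pd.DataFrame(columns=['Score', 'Type', 'Model', 'Classifier',
                                     'Contrast_name', 'Balanced_accuracy'])
    return run_no_gender_ml(df, options, '123', scoresdf, 'some_contrast')
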
def run_perm_test(df, contrast_name, classifier_no, out, n_iterations):
    """Run n_iterations of classification and, on the last iteration, a permutation
    test (10000 permutations, 10-fold CV) whose scores, together with the mean
    non-permuted accuracy, allow a p-value to be calculated."""
    models = ["svm_kernel_default", "svm_kernel_tuned", "rfc", "logistic_regression"]
    X, y = mlu.get_features_labels(df)
    res_file = out + "permutation_result_%s_%s.csv" % (contrast_name, classifier_no)

    for i in range(n_iterations):
        # Remove the gender effect from the features, then split into train/test.
        x_train, y_train, x_test, y_test = mlu.preprocess_remove_gender(df)
        for model_name in models:
            if model_name == "svm_kernel_default":
                model = svm.SVC(kernel='rbf', C=4, gamma=2 ** -5)
            elif model_name == "svm_kernel_tuned":
                # Tune the RBF-SVM hyperparameters on the full dataset.
                param_grid = {'C': [0.1, 1, 10, 100, 1000],
                              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001,
                                        2 ** -5, 2 ** -10, 2 ** 5],
                              'kernel': ['rbf']}
                grid = GridSearchCV(svm.SVC(), param_grid, refit=True, cv=10)
                grid.fit(X, y)
                best_param = grid.best_params_
                model = svm.SVC(kernel=best_param['kernel'], C=best_param['C'],
                                gamma=best_param['gamma'])
            elif model_name == "rfc":
                model = RandomForestClassifier(n_estimators=200)
            elif model_name == "logistic_regression":
                model = LogisticRegression(solver="liblinear", multi_class='auto')

            trained_model = model.fit(x_train, y_train)
            score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)

            # Append this iteration's non-permuted accuracy to the running results file.
            if os.path.isfile(res_file):
                df_res = pd.read_csv(res_file)
            else:
                df_res = pd.DataFrame(columns=['contrast', 'class', 'Model', 'original_accuracy'])
            df_res = df_res.append({'contrast': contrast_name, 'class': classifier_no,
                                    'Model': model_name, 'original_accuracy': score},
                                   ignore_index=True)
            df_res.to_csv(res_file, index=False)

            # Only on the last iteration: run the permutation test 10000 times. A
            # p-value can then be calculated from these permutation scores and the
            # mean non-permuted accuracy over the n_iterations runs.
            if i == n_iterations - 1:
                score, permutation_scores, p_value = mlu.permutation_test(X, y, model, 10000, 10)
                performance_file = "%s%s_%s_%s" % (contrast_name[0], contrast_name[-1],
                                                   classifier_no, model_name)
                np.savetxt(out + "%s.csv" % performance_file, permutation_scores, fmt="%10.18f")
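
# mlu.permutation_test is called as (X, y, model, n_permutations, cv) and returns
# (score, permutation_scores, p_value), which matches sklearn's permutation_test_score.
# A minimal sketch under that assumption (the actual mlu implementation is not shown
# in this module and may differ):
def _sketch_permutation_test(X, y, model, n_permutations=10000, cv=10):
    from sklearn.model_selection import permutation_test_score
    # Refit the model on label-permuted data n_permutations times; the p-value is the
    # fraction of permuted scores that reach the true-label score.
    score, permutation_scores, p_value = permutation_test_score(
        model, X, y, cv=cv, n_permutations=n_permutations)
    return score, permutation_scores, p_value


# As the comment in run_perm_test notes, a p-value can also be computed afterwards
# from the saved permutation scores and the mean non-permuted accuracy over the
# n_iterations runs. A sketch of that calculation (file names as produced above):
def _p_value_from_saved_scores(perm_scores_csv, mean_original_accuracy):
    permutation_scores = np.loadtxt(perm_scores_csv)
    # Empirical p-value with the standard +1 correction.
    return (np.sum(permutation_scores >= mean_original_accuracy) + 1.0) / (len(permutation_scores) + 1.0)
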
def run_basic_ml(df, options, n, scoresdf, contrast_name):
    """Train and test the standard model battery (or the single model named in
    options.model) on the raw features and append train/test scores to scoresdf."""
    print(contrast_name)
    models = ["svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
              "decision_tree", "rfc", "logistic_regression"]

    for i in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)

        if options.model == "all":
            for model_name in models:
                train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                    model_name, x_train, y_train, options.kFold, options.normalize)
                if options.normalize:
                    # Scale the test set with the scaler fitted on this model's training
                    # data; keep x_test untouched so later models do not re-scale it.
                    x_test_eval = min_max_scaler.transform(x_test)
                else:
                    x_test_eval = x_test
                test_score = trained_model.score(x_test_eval, y_test)
                test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test_eval), y_test)
                scoresdf = scoresdf.append(
                    {'Score': train_score, 'Type': 'train', 'Model': model_name, 'Classifier': n,
                     'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score},
                    ignore_index=True)
                scoresdf = scoresdf.append(
                    {'Score': test_score, 'Type': 'test', 'Model': model_name, 'Classifier': n,
                     'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score},
                    ignore_index=True)
        else:
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                options.model, x_train, y_train, options.kFold, options.normalize)
            if options.normalize:
                x_test = min_max_scaler.transform(x_test)
            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)
            scoresdf = scoresdf.append(
                {'Score': train_score, 'Type': 'train', 'Model': options.model, 'Classifier': n,
                 'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score},
                ignore_index=True)
            scoresdf = scoresdf.append(
                {'Score': test_score, 'Type': 'test', 'Model': options.model, 'Classifier': n,
                 'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score},
                ignore_index=True)
    return scoresdf
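
# The mlu helpers used above are defined elsewhere in the project. Minimal sketches of
# what they presumably do, inferred from their call sites (assumptions, not the actual
# implementations): get_features_labels splits off the 'label' column (consistent with
# the rename in run_gender_cor below), and balanced_accuracy -- called with predictions
# first, true labels second -- plausibly wraps sklearn's balanced_accuracy_score.
def _sketch_get_features_labels(df):
    y = df['label'].values
    X = df.drop(columns=['label']).values
    return X, y


def _sketch_balanced_accuracy(predictions, y_true):
    from sklearn.metrics import balanced_accuracy_score
    return balanced_accuracy_score(y_true, predictions)
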
def run_gender_cor(df, options, n, scoresdf, contrast_name, label):
    """Predict gender (classification) or age (regression) from the brain-area
    features and append train/test scores to scoresdf."""
    classification = True
    if label == 'gender':
        df.drop(['label', 'age'], axis=1, inplace=True)
        models = ["svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
                  "decision_tree", "rfc", "logistic_regression"]
    elif label == 'age':
        df.drop(['label', 'gender'], axis=1, inplace=True)
        # models = ['linear_reg', 'lasso', 'polynomial_reg']  # earlier regression battery, superseded below
        models = ['svr_kernel_default', 'svr_kernel_tuned', 'gpr_default']
        classification = False
    # Make the chosen target the 'label' column expected by the mlu helpers.
    df = df.rename(columns={label: 'label'})

    for i in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)
        if classification:
            # Standardize features for the classifiers; the scaler is fitted on the
            # training split only and then applied to the test split.
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)

        if options.model == 'all':
            for model_name in models:
                train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                    model_name, x_train, y_train, options.kFold, options.normalize)
                if options.normalize:
                    # Scale the test set with the scaler fitted on this model's training
                    # data; keep x_test untouched so later models do not re-scale it.
                    x_test_eval = min_max_scaler.transform(x_test)
                else:
                    x_test_eval = x_test
                test_score = trained_model.score(x_test_eval, y_test)
                if classification:
                    test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test_eval), y_test)
                else:
                    # For the regressors, report mean squared error in the
                    # Balanced_accuracy column instead of balanced accuracy.
                    if model_name == "gpr_default":
                        pred, sigma = trained_model.predict(x_test_eval, return_std=True)
                    else:
                        pred = trained_model.predict(x_test_eval)
                    test_balanced_score = mean_squared_error(y_test, pred, multioutput='raw_values')
                scoresdf = scoresdf.append(
                    {'Score': train_score, 'Type': 'train', 'Model': model_name, 'Classifier': n,
                     'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score},
                    ignore_index=True)
                scoresdf = scoresdf.append(
                    {'Score': test_score, 'Type': 'test', 'Model': model_name, 'Classifier': n,
                     'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score},
                    ignore_index=True)
        else:
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                options.model, x_train, y_train, options.kFold, options.normalize)
            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)
            scoresdf = scoresdf.append(
                {'Score': train_score, 'Type': 'train', 'Model': options.model, 'Classifier': n,
                 'Contrast_name': contrast_name, 'Balanced_accuracy': train_balanced_score},
                ignore_index=True)
            scoresdf = scoresdf.append(
                {'Score': test_score, 'Type': 'test', 'Model': options.model, 'Classifier': n,
                 'Contrast_name': contrast_name, 'Balanced_accuracy': test_balanced_score},
                ignore_index=True)
    return scoresdf
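
# Hypothetical usage of run_gender_cor for both targets. Note that the function drops
# columns from df in place, so each call gets its own copy; the CSV path, contrast
# name, and options fields are illustrative assumptions.
def _example_gender_age_analysis():
    from argparse import Namespace
    options = Namespace(number_iterations=10, kFold=10, normalize=False, model='all')
    # Hypothetical input with 'label', 'gender', and 'age' columns plus features.
    df = pd.read_csv("contrast_activations.csv")
    scoresdf = pd.DataFrame(columns=['Score', 'Type', 'Model', 'Classifier',
                                     'Contrast_name', 'Balanced_accuracy'])
    scoresdf = run_gender_cor(df.copy(), options, '123', scoresdf, 'some_contrast', 'gender')
    scoresdf = run_gender_cor(df.copy(), options, '123', scoresdf, 'some_contrast', 'age')
    return scoresdf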