def test_permutation_test_score_allow_nans():
    """Check that permutation_test_score allows input data with NaNs.

    A 10-row matrix with one all-NaN row is pushed through a pipeline that
    imputes the missing values before a mock classifier; the test passes if
    the call completes without raising.
    """
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    # Floor division keeps the repeat count an integer; true division yields
    # a float under Python 3, which np.repeat cannot safely cast to a count.
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
def test_permutation_score():
    """Exercise permutation_test_score on the iris data with a linear SVC.

    Covers dense input, a sparse (COO) copy of the same data, a custom
    scorer built with make_scorer, and finally targets replaced by a
    repeating 0/1/2 pattern.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    X_sparse = coo_matrix(X)
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    # Keyword arguments shared by the two "constant labels" runs below.
    labelled_run = dict(n_permutations=30, scoring="accuracy",
                        labels=np.ones(y.size), random_state=0)

    score_label, _, pvalue_label = permutation_test_score(
        svm, X, y, cv=cv, **labelled_run)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_label, _, pvalue_label = permutation_test_score(
        svm_sparse, X_sparse, y, cv=cv_sparse, **labelled_run)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        # (hits - misses) divided by the number of samples
        hits = (y_true == y_pred).sum()
        misses = (y_true != y_pred).sum()
        return (hits - misses) / y_true.shape[0]

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)
    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def permutation(self):
    """Run an accuracy permutation test on the stored data and plot it.

    Uses self.estimator, self.X, self.y, self.cv and self.n_permutation.
    Prints the observed score with its p-value, then shows a histogram of
    the permutation scores with dashed vertical lines at the observed score
    and at 1 / (number of distinct values in self.y).
    """
    observed, null_scores, p_val = permutation_test_score(
        self.estimator, self.X, self.y,
        scoring="accuracy", cv=self.cv,
        n_permutations=self.n_permutation)
    print("Classification score %s (pvalue : %s)" % (observed, p_val))

    chance = 1. / np.unique(self.y).size

    # View histogram of permutation scores
    plt.hist(null_scores, 20, label='Permutation scores', edgecolor='black')
    y_limits = plt.ylim()
    plt.plot([observed, observed], y_limits, '--g', linewidth=3,
             label='Classification Score (pvalue %s)' % p_val)
    plt.plot([chance, chance], y_limits, '--k', linewidth=3, label='Luck')
    # Restore the limits captured before the vertical lines were drawn.
    plt.ylim(y_limits)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
def _p_value_from_permutation(X, y):
    """Permutation-test a linear SVM on standardised features.

    X is scaled with StandardScaler over the whole matrix, then a linear SVC
    is evaluated with 2-fold stratified CV, macro-F1 scoring and 100
    permutations (n_jobs is fixed at 6).

    Returns:
        (pvalue, score) -- note the order: p-value first.
    """
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import permutation_test_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    # scale input to unit var and zero mean
    standardized = StandardScaler().fit_transform(X)

    # linear SVM evaluated with two stratified folds
    classifier = SVC(kernel='linear')
    folds = StratifiedKFold(2)

    # run permutation test; the individual permutation scores are not used
    observed, _, p_val = permutation_test_score(
        classifier, standardized, y,
        scoring="f1_macro", cv=folds, n_permutations=100, n_jobs=6)
    return p_val, observed
def final_classif(pipeline, cv, X, y, groups, model, norm, n_perms=1000):
    """Permutation-test a pipeline, refit it on all data and collect results.

    Args:
        pipeline: estimator passed to permutation_test_score and then fit.
        cv: cross-validation specification forwarded unchanged.
        X, y, groups: data, targets and group labels.
        model: "RF", "LR" or "SVM"; any other value stores no importances.
        norm: when equal to 1 the fitted estimator is looked up as
            pipeline["classifier"], otherwise `pipeline` itself is used.
        n_perms: number of permutations.

    Returns:
        dict with "acc_score", "acc_pscores", "acc_pvalue" (computed with
        scoring="roc_auc"), "DA_train" and, for the models listed above,
        "feature_importances".
    """
    observed, null_scores, p_val = permutation_test_score(
        pipeline, X, y,
        groups=groups, cv=cv, n_permutations=n_perms,
        n_jobs=-1, scoring="roc_auc",
    )
    results = {
        "acc_score": observed,
        "acc_pscores": null_scores,
        "acc_pvalue": p_val,
    }

    # Get DA train and feature importance
    pipeline.fit(X, y)
    results["DA_train"] = pipeline.score(X, y)

    if model == "RF":
        fitted = pipeline["classifier"] if norm == 1 else pipeline
        results["feature_importances"] = fitted.feature_importances_
    elif model in ("LR", "SVM"):
        fitted = pipeline["classifier"] if norm == 1 else pipeline
        results["feature_importances"] = fitted.coef_.squeeze()

    return results
def fit_huber(data, targets, permute=True):
    """Grid-search a HuberRegressor and optionally permutation-test it.

    The search covers 20 `epsilon` values in [1, 3] and 10 log-spaced
    `alpha` values in [1e-10, 1], with 3-fold CV, and is fit on
    `data.values`.

    Returns:
        (best_params_, best_score_, p_value); p_value is -1 when the
        permutation test is skipped.
    """
    search_space = {
        "epsilon": np.linspace(1, 3, 20),
        "alpha": np.logspace(-10, 0, 10),
    }
    search = GridSearchCV(
        HuberRegressor(),
        param_grid=search_space,
        n_jobs=3,
        error_score=0,
        verbose=0,
        cv=3,
    )
    search.fit(data.values, targets)

    p_value = -1
    # Equality test kept on purpose: only True (or 1) enables the test.
    if permute == True:
        # The whole search object is the estimator under test (default cv).
        outcome = permutation_test_score(
            search, data, targets,
            n_jobs=3,
            n_permutations=1000,
        )
        p_value = outcome[-1]
    return search.best_params_, search.best_score_, p_value
def plot_permutation(model, test_data, test_class):
    """Permutation-test `model` for accuracy and show the null histogram.

    Uses 2-fold stratified CV and 100 permutations. The observed score and
    p-value are printed; the figure contains the permutation-score histogram
    and a dashed line at 0.5 labelled 'Luck'. The observed score itself is
    not drawn.
    """
    folds = StratifiedKFold(2)
    observed, null_scores, p_val = permutation_test_score(
        model, test_data, test_class,
        scoring="accuracy", cv=folds, n_permutations=100, n_jobs=1)
    print("Classification score %s (pvalue : %s)" % (observed, p_val))

    # View histogram of permutation scores
    plt.figure()
    plt.hist(null_scores, 20, label='Permutation scores', edgecolor='green')
    y_limits = plt.ylim()
    chance = 1. / 2  # chance level is hard-coded for two classes
    plt.plot([chance, chance], y_limits, '--k', linewidth=3, label='Luck')
    plt.ylim(y_limits)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
def fit_svm(X, y, comment, use_x_normalization, kernel=None):
    """Cross-validate an SVC over 10 shuffled folds and print accuracies.

    For every fold an SVC is fit on the training part, a 10-permutation
    accuracy test is run on that training part (re-using the same KFold
    object as `cv`), and train/validation accuracies (in percent) are
    recorded. Mean accuracies are printed at the end. Nothing is returned.

    Args:
        X, y: indexable feature matrix and targets.
        comment: text printed between two separator lines.
        use_x_normalization: if truthy, X is standardised up front.
        kernel: SVC kernel; None keeps the SVC default.
    """
    separator = "------------------------------"
    print(separator)
    print(comment)
    print(separator)

    # Seeds NumPy's global RNG; the KFold below is given no random_state.
    np.random.seed(1)

    if use_x_normalization:
        X = StandardScaler().fit_transform(X)

    svc_options = {} if kernel is None else {"kernel": kernel}
    train_scores = []
    val_scores = []

    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        clf = svm.SVC(**svc_options)
        clf.fit(X_train, y_train)

        print('start to calculate p value')
        outcome = permutation_test_score(
            clf, X_train, y_train,
            scoring="accuracy", cv=kf, n_permutations=10, n_jobs=1)
        score, permutation_scores, pvalue = outcome
        print(score, permutation_scores, pvalue)

        train_scores.append(clf.score(X_train, y_train) * 100)
        val_scores.append(clf.score(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
def fit_rfc(data, targets, permute=True):
    """Randomised search over a 50-tree random forest, with optional
    permutation test.

    100 parameter settings are sampled and scored with
    make_scorer(roc_auc_score) under 3-fold CV; the search is fit on
    `data.values`.

    Returns:
        (best_params_, best_score_, p_value); p_value is -1 when the
        permutation test is skipped.
    """
    candidates = {
        # depths 5..49 plus None as an extra candidate
        "max_depth": np.append(np.arange(5, 50), None),
        "min_samples_split": np.arange(2, 15),
        "min_samples_leaf": np.arange(1, 10),
        "max_features": np.arange(1, data.shape[1]),
    }
    search = RandomizedSearchCV(
        RandomForestClassifier(n_estimators=50),
        param_distributions=candidates,
        n_jobs=3,
        error_score=0,
        n_iter=100,
        verbose=1,
        cv=3,
        scoring=make_scorer(roc_auc_score)
    )
    search.fit(data.values, targets)

    p_value = -1
    # Equality test kept on purpose: only True (or 1) enables the test.
    if permute == True:
        outcome = permutation_test_score(
            search, data, targets,
            n_jobs=3,
            n_permutations=1000,
        )
        p_value = outcome[-1]
    return search.best_params_, search.best_score_, p_value
def fit_svc(data, targets, permute=True):
    """Grid-search a LinearSVC over C, with optional permutation test.

    C takes 16 log-spaced values in [1e-10, 1e5]; scoring is weighted
    ROC AUC under 3-fold CV. The search is fit on standardised
    `data.values`.

    Returns:
        (best_params_, best_score_, p_value); p_value is -1 when the
        permutation test is skipped.
    """
    search = GridSearchCV(
        LinearSVC(dual=False),
        param_grid={"C": np.logspace(-10, 5, 16)},
        n_jobs=3,
        error_score=0,
        scoring=make_scorer(roc_auc_score, average="weighted"),
        verbose=1,
        cv=3,
    )
    standardized = StandardScaler().fit_transform(data.values)
    search.fit(standardized, targets)

    p_value = -1
    # Equality test kept on purpose: only True (or 1) enables the test.
    if permute == True:
        # NOTE(review): the search above is fit on standardised values but
        # the permutation test receives the raw `data` -- confirm intended.
        outcome = permutation_test_score(
            search, data, targets,
            n_jobs=3,
            n_permutations=1000,
        )
        p_value = outcome[-1]
    return search.best_params_, search.best_score_, p_value
def fit_svr(data, targets, permute=True):
    """Grid-search a LinearSVR over C and epsilon, with optional
    permutation test.

    Both parameters take 16 log-spaced values in [1e-10, 1e5]; the
    regressor uses the squared epsilon-insensitive loss with dual=False
    and 3-fold CV. The search is fit on standardised `data.values`.

    Returns:
        (best_params_, best_score_, p_value); p_value is -1 when the
        permutation test is skipped.
    """
    search_space = {
        "C": np.logspace(-10, 5, 16),
        "epsilon": np.logspace(-10, 5, 16),
    }
    search = GridSearchCV(
        LinearSVR(dual=False, loss="squared_epsilon_insensitive"),
        param_grid=search_space,
        n_jobs=3,
        error_score=0,
        verbose=1,
        cv=3,
    )
    standardized = StandardScaler().fit_transform(data.values)
    search.fit(standardized, targets)

    p_value = -1
    # Equality test kept on purpose: only True (or 1) enables the test.
    if permute == True:
        # NOTE(review): the search above is fit on standardised values but
        # the permutation test receives the raw `data` -- confirm intended.
        outcome = permutation_test_score(
            search, data, targets,
            n_jobs=3,
            n_permutations=1000,
        )
        p_value = outcome[-1]
    return search.best_params_, search.best_score_, p_value
def permutation_test(X, y, group, clf, num_permutation=1000):
    """ Helper function to validate that a classifier is performing higher than chance

        Splits are produced by leave-one-group-out over `group`, and the
        permutation test runs under the joblib 'loky' backend on all cores.

        Args:
            X (numpy matrix): this is the feature matrix with row being a data point
            y (numpy vector): this is the label vector with row belonging to a data point
            group (numpy vector): this is the group vector (which is a the participant id)
            clf (sklearn classifier): this is a classifier made in sklearn with fit, transform and predict functionality
            num_permutation (int): the number of time to permute y; the same
                value is also forwarded as the `verbose` level

        Returns:
            tuple: the three values returned by permutation_test_score
                (observed score, permutation scores, p-value)
    """
    splitter = LeaveOneGroupOut()
    folds = splitter.split(X, y, group)

    with joblib.parallel_backend('loky'):
        outcome = permutation_test_score(
            clf, X, y,
            groups=group, cv=folds,
            n_permutations=num_permutation,
            verbose=num_permutation, n_jobs=-1)

    accuracies, permutation_scores, p_value = outcome
    return accuracies, permutation_scores, p_value
def cross_session_NB2(train_session, test_session, bin_length=2,
                      predictor='traces', neurons=None):
    """Permutation-test a naive Bayes decoder across two sessions.

    Data from both sessions are stacked; samples are tagged with group 0
    (train session) or 1 (test session) and evaluated with
    leave-one-group-out CV, accuracy scoring and 1000 permutations.

    Args:
        predictor: 'traces' (StandardScaler + GaussianNB) or 'events'
            (MultinomialNB).

    Returns:
        (score, permutation_scores, p_value)

    Raises:
        ValueError: if `predictor` is neither 'traces' nor 'events'.
    """
    X_train, X_test, y_train, y_test = preprocess_NB_cross_session(
        train_session, test_session,
        bin_length=bin_length, predictor=predictor, neurons=neurons)

    X = np.concatenate((X_train, X_test))
    # presumably both label containers are lists, so `+` concatenates them;
    # verify against preprocess_NB_cross_session
    y = y_train + y_test

    # Group 0 marks training-session samples, group 1 test-session samples.
    groups = np.concatenate((np.zeros(len(y_train), dtype=int),
                             np.ones(len(y_test), dtype=int)))
    cv = LeaveOneGroupOut()

    if predictor == 'traces':
        classifier = make_pipeline(StandardScaler(), GaussianNB())
    elif predictor == 'events':
        classifier = make_pipeline(MultinomialNB())
    else:
        raise ValueError('Predictor incorrectly defined.')

    outcome = permutation_test_score(
        classifier, X, y,
        scoring='accuracy', groups=groups, cv=cv,
        n_permutations=1000, n_jobs=1)
    score, permutation_scores, p_value = outcome
    return score, permutation_scores, p_value
def y_randomization(rf_best, X_train, y_train, descritor, algoritimo):
    """Run a 20-permutation y-randomisation test and save/show a histogram.

    The test uses 5-fold CV, balanced-accuracy scoring and random_state=24.
    The figure is written to
    'figures/y_randomization-<descritor>X<algoritimo>.png' and then shown.
    Nothing is returned.
    """
    permutations = 20
    outcome = permutation_test_score(
        rf_best, X_train, y_train,
        cv=5, scoring='balanced_accuracy',
        n_permutations=permutations,
        n_jobs=-1, verbose=1, random_state=24)
    score, permutation_scores, pvalue = outcome

    print('True score = ', score.round(2),
          '\n Média per. = ', np.mean(permutation_scores).round(2),
          '\np-value = ', pvalue.round(4))

    # View histogram of permutation scores (rounded to two decimals)
    pl.subplots(figsize=(10, 6))
    pl.hist(permutation_scores.round(2), label='Permutation scores')
    ylim = pl.ylim()
    low, high = ylim[0], ylim[1]
    pl.vlines(score, low, high, linestyle='--', color='g', linewidth=3,
              label='Classification Score (pvalue %s)' % pvalue.round(4))
    # chance level is hard-coded to 0.5
    pl.vlines(1.0 / 2, low, high, linestyle='--', color='k', linewidth=3,
              label='Luck')
    pl.ylim(ylim)
    pl.legend()
    pl.xlabel('Score')
    pl.title('Aleatoriarização da variável Y ' + algoritimo + 'X' + descritor,
             fontsize=12)
    target_path = 'figures/y_randomization-' + descritor + 'X' + algoritimo + '.png'
    pl.savefig(target_path, bbox_inches='tight', transparent=False,
               format='png', dpi=300)
    pl.show()
def compute_p_value(path, orientations, repetitions, kernel, cost, gamma, degree):
    """Permutation-test a scaled SVC on a CSV data set and plot the result.

    The CSV at `path` has no header; every column but the last is a feature
    and the last column is the label.

    Args:
        path: CSV file to read.
        orientations: None to use every row, otherwise an indexable whose
            first two entries are the label values to keep.
        repetitions: number of permutations.
        kernel, cost, gamma, degree: forwarded to SVC (cost is passed as C).

    Returns:
        None. A histogram of permutation scores is shown, with dashed lines
        at the observed accuracy and at chance level.
    """
    data = pd.read_csv(path, header=None)
    X = data.iloc[:, :-1].values
    Y = data.iloc[:, -1].values

    if orientations is not None:
        # Build the row mask once and reuse it for features and labels.
        keep = np.logical_or(Y == orientations[0], Y == orientations[1])
        new_x, new_y = X[keep], Y[keep]
    else:
        new_x, new_y = X, Y

    cv = StratifiedKFold(5)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('SVM', SVC(kernel=kernel, C=cost, gamma=gamma, degree=degree)),
    ])
    score, permutation_scores, pvalue = permutation_test_score(
        pipeline, new_x, new_y,
        scoring="accuracy", cv=cv, n_permutations=repetitions, n_jobs=-1)

    # Chance level must reflect the classes actually classified (new_y), not
    # every class present in the file, otherwise the 'Luck' line is too low
    # whenever `orientations` restricts the problem to two classes.
    chance = 1. / np.unique(new_y).shape[0]

    plt.hist(permutation_scores, 20, label='Permutation scores',
             edgecolor='black')
    ylim = plt.ylim()
    plt.plot(2 * [score], ylim, '--g', linewidth=3,
             label='Classification Score'
             ' (pvalue %s)' % pvalue)
    plt.plot(2 * [chance], ylim, '--k', linewidth=3, label='Luck')
    plt.ylim(ylim)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
def run_cv_voxel(v, model, features, fmri_data, cv, groups, scoring,
                 permutations=None):
    """Score `model` on column `v` of `fmri_data` with grouped CV.

    Args:
        v: column index into fmri_data.
        cv: splitter exposing split(); it is called with `groups` both as
            the second positional argument and as the groups keyword.
        permutations: if truthy, the number of permutations to run.

    Returns:
        (score, pvalue) when `permutations` is truthy, otherwise the mean
        of the cross-validation scores.
    """
    target = fmri_data[:, v]
    folds = cv.split(features, groups, groups=groups)

    if not permutations:
        fold_scores = cross_val_score(
            model, features, target,
            groups=groups, scoring=scoring, cv=folds, n_jobs=1)
        return np.mean(fold_scores)

    observed, _, p_val = permutation_test_score(
        model, features, target,
        groups=groups, scoring=scoring, cv=folds,
        n_permutations=permutations, n_jobs=1)
    return observed, p_val
def test_permutation_test_score_pandas():
    """Check permutation_test_score doesn't destroy pandas dataframe.

    Runs with the MockDataFrame stand-in and, when pandas can be imported,
    with real Series/DataFrame objects. A CheckingClassifier verifies the
    container types it receives.
    """
    container_pairs = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
    except ImportError:
        pass
    else:
        container_pairs.append((Series, DataFrame))

    for TargetType, InputFeatureType in container_pairs:
        # X dataframe, y series
        iris = load_iris()
        X_df = InputFeatureType(iris.data)
        y_ser = TargetType(iris.target)

        def check_df(x):
            return isinstance(x, InputFeatureType)

        def check_series(x):
            return isinstance(x, TargetType)

        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        permutation_test_score(clf, X_df, y_ser)
def feature_importance(data):
    """Permutation-test a linear SVC that predicts 'booking_bool'.

    A fixed set of columns is standardised and evaluated with 2-fold
    stratified CV, accuracy scoring and 100 permutations. The score and
    p-value are printed and a histogram of permutation scores is shown with
    lines at the observed score and at 1 / (number of classes).
    """
    # Columns used as predictors; other candidate columns are left out.
    labels = [
        'srch_id',
        'site_id',
        'prop_id',
        'prop_starrating',
        'prop_review_score',
        'prop_brand_bool',
        'prop_location_score1',
        'prop_location_score2',
        'position',
        'price_usd',
        'promotion_flag',
    ]

    y = data['booking_bool']
    X = StandardScaler().fit_transform(data[labels])
    chance = 1. / np.unique(y).size

    classifier = SVC(kernel='linear')
    folds = StratifiedKFold(2)
    observed, null_scores, p_val = permutation_test_score(
        classifier, X, y,
        scoring="accuracy", cv=folds, n_permutations=100, n_jobs=1)
    print("Classification score %s (pvalue : %s)" % (observed, p_val))

    plt.hist(null_scores, 20, label='Permutation scores')
    y_limits = plt.ylim()
    plt.plot([observed, observed], y_limits, '--g', linewidth=3,
             label='Classification Score (pvalue %s)' % p_val)
    plt.plot([chance, chance], y_limits, '--k', linewidth=3, label='Luck')
    plt.ylim(y_limits)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
def NB_session_permutation(X, Y):
    """Permutation-test a StandardScaler + GaussianNB pipeline.

    Uses 2-fold stratified CV, accuracy scoring and 500 permutations.

    Returns:
        (score, permutation_scores, p_value)
    """
    # Scaled Gaussian naive Bayes evaluated on two stratified folds.
    model = make_pipeline(StandardScaler(), GaussianNB())
    folds = StratifiedKFold(2)

    # Classify and permutation tests.
    outcome = permutation_test_score(
        model, X, Y,
        scoring='accuracy', cv=folds, n_permutations=500, n_jobs=1)
    score, permutation_scores, p_value = outcome
    return score, permutation_scores, p_value
def permutation_test(X, y, estimator, n_permutations, kFold):
    """Balanced-accuracy permutation test with stratified k-fold CV.

    Args:
        X, y: features and targets.
        estimator: estimator handed to permutation_test_score.
        n_permutations: number of label permutations.
        kFold: value passed to StratifiedKFold.

    Returns:
        (score, permutation_scores, p_value)
    """
    folds = StratifiedKFold(kFold)
    outcome = permutation_test_score(
        estimator=estimator,
        X=X,
        y=y,
        scoring='balanced_accuracy',
        cv=folds,
        n_permutations=n_permutations,
        n_jobs=1,
    )
    score, permutation_scores, p_value = outcome
    return score, permutation_scores, p_value
def getMLModelPerf(ml_df,roi_cols,covar_continuous_cols,covar_cat_cols,outcome_col,model_type,ml_model,rank_features,n_splits=10,n_repeats=10,n_jobs=1):
    """ Takes a model (classification or regression) instance and computes cross val scores.
        Uses repeated stratified KFold for classification and ShuffeSplit for regression.

        Args:
            ml_df: DataFrame holding predictors, covariates and the outcome.
            roi_cols: list of predictor column names (copied, not mutated).
            covar_continuous_cols: continuous covariate columns, appended as-is.
            covar_cat_cols: categorical covariate columns, dummy-encoded.
            outcome_col: name of the outcome column.
            model_type: 'classification' or 'regression' (case-insensitive).
            ml_model: estimator to evaluate.
            rank_features: if truthy, feature ranks are computed through
                get_feature_importance.
            n_splits, n_repeats: cross-validation sizes.
            n_jobs: parallelism forwarded to the scoring calls.

        Returns:
            (scores_df, null_df, pvalue, feature_ranks_df)

        Raises:
            ValueError: if model_type is neither classification nor regression.
    """
    X = ml_df[roi_cols].values
    X_col_names = roi_cols.copy()

    # Check input var types and create dummy vars if needed
    if len(covar_continuous_cols) > 0:
        X_continuous_covar = ml_df[covar_continuous_cols].values
        print('Using {} continuous covar'.format(len(covar_continuous_cols)))
        X = np.hstack((X, X_continuous_covar))
        X_col_names += list(covar_continuous_cols)

    if len(covar_cat_cols) > 0:
        X_cat_covar_df = pd.get_dummies(ml_df[covar_cat_cols])
        X_cat_covar = X_cat_covar_df.values
        print('Using {} categorical cols as {} cat covar (dummies)'.format(covar_cat_cols,X_cat_covar.shape[1]))
        X = np.hstack((X, X_cat_covar))
        X_col_names += list(X_cat_covar_df.columns)

    print('n of input columns: {}'.format(len(X_col_names)))

    model_kind = model_type.lower()
    if model_kind == 'classification':
        # First dummy column of the outcome serves as the binary target.
        y = pd.get_dummies(ml_df[outcome_col]).values[:,0]
        print('Data shapes X {}, y {} ({})'.format(X.shape, len(y), list(ml_df[outcome_col].value_counts())))
        perf_metric = 'roc_auc'
        cv = RepeatedStratifiedKFold(n_splits=n_splits,n_repeats=n_repeats,random_state=0)
    elif model_kind == 'regression':
        y = ml_df[outcome_col].values
        print('Data shapes X {}, y {} ({:3.2f}m, {:3.2f}sd)'.format(X.shape, len(y), np.mean(y),np.std(y)))
        perf_metric = 'neg_mean_squared_error'
        cv = ShuffleSplit(n_splits=n_splits*n_repeats, random_state=0)
    else:
        # Fail explicitly: continuing would hit undefined y / perf_metric / cv.
        raise ValueError('unknown model type {} (needs to be classification or regression)'.format(model_type))

    print('Using {} model with perf metric {}'.format(model_type, perf_metric))
    perf = cross_val_score(ml_model, X, y, scoring=perf_metric,cv=cv, n_jobs=n_jobs)
    scores_df = pd.DataFrame(columns=[perf_metric])
    scores_df[perf_metric] = perf
    print(' Perf mean:{:4.3f}, \nsd:{:4.3f}'.format(np.mean(perf),np.std(perf)))

    # Null model
    null_cv = ShuffleSplit(n_splits=n_repeats, random_state=0) #10x10xn_permutations are too many.
    _, permutation_scores, pvalue = permutation_test_score(ml_model, X, y, scoring=perf_metric, cv=null_cv, n_permutations=10, n_jobs=n_jobs)
    null_df = pd.DataFrame()
    null_df[perf_metric] = permutation_scores

    # Feature ranks based on RFECV
    feature_ranks_df = pd.DataFrame()
    if rank_features:
        feature_ranks, feature_grid_scores = get_feature_importance(ml_model, X, y, perf_metric, cv=cv, n_jobs=n_jobs)
        feature_ranks_df['predictor'] = X_col_names
        feature_ranks_df['rank'] = feature_ranks
        feature_ranks_df['grid_scores'] = feature_grid_scores

    return scores_df, null_df, pvalue, feature_ranks_df
def permutation_test(dataset, clf, num_permutation):
    """Permutation-test `clf` on a dataset object using project-made splits.

    Reads dataset.X, dataset.y and dataset.I (passed as groups); splits come
    from man.generate_train_test_splits(dataset). `num_permutation` is used
    both as the permutation count and as the verbose level.

    Returns:
        (accuracy, permutation_scores, p_value)
    """
    folds = man.generate_train_test_splits(dataset)
    outcome = permutation_test_score(
        clf, dataset.X, dataset.y,
        groups=dataset.I,
        cv=folds,
        n_permutations=num_permutation,
        verbose=num_permutation,
        n_jobs=-1)
    accuracy, permutation_scores, p_value = outcome
    return (accuracy, permutation_scores, p_value)
def test_permutation_test_score(self):
    """ModelFrame's permutation_test_score accessor must match the direct
    ms.permutation_test_score call on the iris data (linear SVC, cv=5)."""
    import sklearn.svm as svm

    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)
    classifier = svm.SVC(kernel=str('linear'), C=1)

    actual = frame.model_selection.permutation_test_score(classifier, cv=5)
    wanted = ms.permutation_test_score(classifier, iris.data,
                                       y=iris.target, cv=5)

    # Three-part result: score, permutation scores, p-value.
    self.assertEqual(len(actual), 3)
    self.assertEqual(actual[0], wanted[0])
    self.assert_numpy_array_almost_equal(actual[1], wanted[1])
    self.assertEqual(actual[2], wanted[2])
def knn_testing(principalDf, labels):
    """Fit a 2-nearest-neighbour classifier on two principal components.

    The data are split 80/20 (random_state=2). Test accuracy, a
    100-permutation accuracy test on the training split, and a
    row-normalised confusion matrix are computed; raw and normalised
    confusion-matrix plots are drawn and their matrices printed.

    Returns:
        (accuracy, score, pvalue, confusion_matrix, fig_matrix) where
        fig_matrix is the `plt` module object itself.
    """
    component_columns = ['principal component 1', 'principal component 2']
    features = principalDf[component_columns].to_numpy()

    # create train, test sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels.to_numpy(), test_size=0.2, random_state=2)

    # Create and train the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=2)
    knn.fit(X_train, y_train.ravel())

    # Predict the response for test dataset
    y_pred = knn.predict(X_test)

    # Model accuracy, how often is the classifier correct?
    accuracy = metrics.accuracy_score(y_test, y_pred)

    outcome = permutation_test_score(
        knn, X_train, y_train.ravel(),
        scoring="accuracy", n_permutations=100, n_jobs=1)
    score, permutation_scores, pvalue = outcome

    confusion_matrix = metrics.confusion_matrix(y_test, y_pred,
                                                normalize='true')
    np.set_printoptions(precision=2)

    # Plot non-normalized and normalized confusion matrices
    titles_options = [
        ("Confusion matrix, without normalization", None),
        ("Normalized confusion matrix", 'true'),
    ]
    for title, normalize in titles_options:
        disp = metrics.plot_confusion_matrix(
            knn, X_test, y_test,
            display_labels=['1', '2'],
            cmap=plt.cm.Blues,
            normalize=normalize)
        disp.ax_.set_title(title)
        print(title)
        print(disp.confusion_matrix)

    fig_matrix = plt
    return accuracy, score, pvalue, confusion_matrix, fig_matrix
def print_permutation_plots(clf, cv, X_test, y_test):
    """Plot the permutation-score distribution for an accuracy test.

    Runs 1000 permutations on all cores, then draws a 20-bin density
    histogram of the permutation scores, a dashed red vertical line at the
    observed score, and a text label placed at data coordinates (0.7, 10).
    Nothing is returned and the figure is not shown here.
    """
    outcome = permutation_test_score(
        clf, X_test, y_test,
        scoring="accuracy", cv=cv, n_permutations=1000, n_jobs=-1)
    score_dataset, perm_scores_dataset, pvalue_dataset = outcome

    fig, ax = plt.subplots()
    ax.hist(perm_scores_dataset, bins=20, density=True)
    ax.axvline(score_dataset, ls="--", color="r")

    score_label = f"Score on original\ndata: {score_dataset:.2f}\n(p-value: {pvalue_dataset:.3f})"
    ax.text(0.7, 10, score_label, fontsize=12)

    ax.set_xlabel("Accuracy score")
    ax.set_ylabel("Probability")
def ROC(x, y, n_perm=None, clf=None):
    """
    Perform ROC analysis with optional permutation test.

    y values have to be 0 and 1 for calc_auc()!

    Relies on module-level `min_sample_size`, `n_folds`, `n_jobs` and
    `calc_auc`.

    Returns:
        (auc, pvalue). auc is NaN and pvalue None when there are too few
        samples or y does not hold exactly two distinct values; pvalue is
        None when no permutation test is requested.
    """
    # Remove NaN values.
    valid = np.logical_and(~np.isnan(x), ~np.isnan(y))
    x, y = np.array(x[valid]), np.array(y[valid])

    # Insufficient sample size or not exactly two values to classify.
    n_yvals = len(np.unique(y))
    if (min(len(x), len(y)) < min_sample_size) or (n_yvals != 2):
        if n_yvals > 2:
            print('More than two values to classify:' + str(np.unique(y)))
        return np.nan, None

    # Format x into array of arrays (one single-feature row per sample).
    x = np.array(x, ndmin=2).T

    # Default classifier.
    if clf is None:
        clf = LogisticRegression()

    # Calculate AUC of true data.
    true_auc = calc_auc(clf, x, y)

    # No permutation test requested.
    if n_perm is None or not n_perm > 0:
        return true_auc, None

    # Test significance of classification with cross-validated permutation.
    # Only the p-value of the result is reported.
    folds = StratifiedKFold(n_folds)
    _, _, pvalue = permutation_test_score(
        clf, x, y,
        scoring='accuracy', cv=folds,
        n_permutations=n_perm, n_jobs=n_jobs)
    return true_auc, pvalue
def evaluate_model(estimator, eval_x, eval_y, cv):
    """Report the selected features and a ROC-AUC permutation test.

    Up to 10 features are picked with SelectFromModel on the prefit
    estimator; their names come from
    estimator.named_steps["adaptor"].columns.

    Returns:
        dict with "best_features", "ROC_AUC_score", "pval", "perm_scores"
        and, if the estimator has a `threshold` attribute, "threshold".
    """
    # NOTE(review): a single permutation is run; the trailing "#00" in the
    # original suggests 100 may have been intended -- confirm.
    n_permutations = 1

    selector = SelectFromModel(estimator=estimator, prefit=True,
                               max_features=10, threshold=-np.inf)
    # The transformed matrix is discarded; only the support mask is read.
    selector.transform(estimator._transform(eval_x))
    adaptor_columns = np.asarray(estimator.named_steps["adaptor"].columns)
    best_features = adaptor_columns[selector.get_support()]

    outcome = permutation_test_score(
        estimator, eval_x, eval_y,
        scoring="roc_auc", cv=cv,
        n_permutations=n_permutations, n_jobs=-1)
    true_score, perm_scores, pval = outcome

    LOG.info("Permutation test scores:\nFor {} permutations, p-value : {}\n".format(n_permutations, pval))
    LOG.info("Best features : {}".format(best_features))

    res = {
        "best_features": np.array2string(best_features),
        "ROC_AUC_score": true_score,
        "pval": pval,
        "perm_scores": np.array2string(perm_scores),
    }
    if hasattr(estimator, "threshold"):
        res["threshold"] = estimator.threshold
    return res
def fit_elasticnet(data, targets, permute=True):
    """Fit an ElasticNetCV on standardised data, with optional permutation
    test.

    Returns:
        (params, score, p_value) where params holds the selected "alpha"
        and "l1_ratio", score is model.score on the standardised training
        data, and p_value is -1 when the permutation test is skipped.
    """
    model = ElasticNetCV()
    fit_inputs = StandardScaler().fit_transform(data.values)
    model.fit(fit_inputs, targets)

    params = {"alpha": model.alpha_, "l1_ratio": model.l1_ratio_}

    # The inputs are standardised afresh for scoring, as for fitting.
    score_inputs = StandardScaler().fit_transform(data.values)
    score = model.score(score_inputs, targets)

    p_value = -1
    # Equality test kept on purpose: only True (or 1) enables the test.
    if permute == True:
        # NOTE(review): the model is fit/scored on standardised values but
        # the permutation test receives the raw `data` -- confirm intended.
        outcome = permutation_test_score(
            model, data, targets,
            n_jobs=3,
            n_permutations=1000,
        )
        p_value = outcome[-1]
    return params, score, p_value
def fit(self, X, y, run_labels=None):
    """Validate the inputs and run cross-validation, optionally with a
    permutation test.

    Args:
        X: input data, passed through _check_input_data together with
            self.mask_img.
        y: targets.
        run_labels: group labels; required when self.cross_val_scheme is
            'run' and ignored (reset to None) otherwise.

    Raises:
        ValueError: if cross_val_scheme is 'run' and run_labels is None.

    Sets self.accuracies_, self.permutation_scores_ and self.pval_ (the
    latter two are None when self.n_permutations is None).
    """
    self.X = _check_input_data(X, mask_img=self.mask_img,
                               return_first_element=True)
    self.y = y
    self.run_labels = run_labels

    # scale within each pattern if specified
    if self.scaling_direction in ('pattern', 'both'):
        # Scale the validated data (self.X), not the raw argument, so the
        # result of _check_input_data is not thrown away.
        self.X = self.scaler.fit_transform(self.X.T).T

    if self.cross_val_scheme == 'run':
        if self.run_labels is None:
            raise ValueError("run_labels must not be None if 'run' is"
                             " selected for cross_val_scheme")
    else:
        # ensure that data is not grouped
        self.run_labels = None

    cross_validator = _get_cross_val_scheme(self.cross_val_scheme)

    if self.n_permutations is not None:
        res = permutation_test_score(self.pipeline,
                                     X=self.X, y=self.y,
                                     groups=self.run_labels,
                                     cv=cross_validator,
                                     n_permutations=self.n_permutations)
        self.accuracies_, self.permutation_scores_, self.pval_ = res
    else:
        self.accuracies_ = cross_val_score(self.pipeline,
                                           X=self.X, y=self.y,
                                           groups=self.run_labels,
                                           cv=cross_validator)
        self.permutation_scores_ = None
        self.pval_ = None

    self.__fit_status = True
def optimize_and_cv(features_orig_norm, labels_orig_bin, groups_orig, Clist,permut=True):
    """Tune C for a hinge-loss LinearSVC with grouped CV, then evaluate it.

    Args:
        features_orig_norm: feature matrix.
        labels_orig_bin: labels (flattened with np.ravel before use).
        groups_orig: group labels (flattened) for GroupKFold.
        Clist: parameter grid handed to GridSearchCV.
        permut: if truthy, also run a 100-permutation macro-F1 test.

    Returns:
        (test_f1micro, test_f1macro, Cnum, d_fin, pvalue) where the first
        two are column means of the cross_validate results ('test_accuracy'
        and 'test_f1macro'), Cnum is the selected C, d_fin the per-fold
        results as a DataFrame, and pvalue is 0 when the test is skipped.
    """
    y = np.ravel(labels_orig_bin)
    groups = np.ravel(groups_orig)

    print('GridSearchCV')
    classifier = svm.LinearSVC(loss='hinge', max_iter=20000,
                               class_weight='balanced')
    gfk = GroupKFold(n_splits=10)
    clf = GridSearchCV(classifier, Clist, cv=gfk,
                       scoring=['f1_macro', 'f1_micro'], refit=False,
                       return_train_score=False)
    # `groups` is passed by keyword: it is keyword-only in recent
    # scikit-learn releases and accepted by keyword in older ones.
    clf.fit(features_orig_norm, y, groups=groups)

    # Pick C from the first parameter set ranked best on macro-F1.
    GridResults = pd.DataFrame(clf.cv_results_)
    Cdict = GridResults.loc[GridResults['rank_test_f1_macro'] == 1]['params']
    Cnum = Cdict.iloc[0].get('C')

    clf = make_pipeline(svm.LinearSVC(C=Cnum, max_iter=50000, loss='hinge',
                                      class_weight='balanced'))
    gfk = GroupKFold(n_splits=10)
    scoring = {'f1macro': 'f1_macro', 'accuracy': 'accuracy'}
    print('Crossvalidating starts')
    scores = cross_validate(clf, features_orig_norm, y, groups=groups,
                            cv=gfk, scoring=scoring,
                            return_train_score=True)

    if permut:
        print('Permutaiton starts')
        # NOTE(review): this tests the untuned base `classifier` with plain
        # cv=10 (no groups), not the tuned pipeline with GroupKFold --
        # confirm that is intended.
        score, permuation_scores, pvalue = permutation_test_score(
            classifier, features_orig_norm, y,
            scoring='f1_macro', cv=10, n_permutations=100)
    else:
        pvalue = 0

    d_fin = pd.DataFrame(scores)
    # Column-wise mean; np.mean(DataFrame) reduces over both axes to a
    # single scalar on pandas >= 2, which would break the .loc lookups.
    final_results = d_fin.mean()
    print(final_results)
    # presumably accuracy is used as micro-F1 because the two coincide for
    # single-label classification -- verify
    test_f1micro = final_results.loc['test_accuracy']
    test_f1macro = final_results.loc['test_f1macro']
    return test_f1micro, test_f1macro, Cnum, d_fin, pvalue
import pandas as pd
import os
# Script fragment: decodes manual labels from epochs of one recording with a
# scaled, class-balanced SVC, then permutation-tests consecutive time slices.
# Names such as mne, np and Filter_based_and_thresholding are not imported in
# this fragment and must come from elsewhere in the file.

# Work from the directory holding the recording and annotation files.
os.chdir('D:\\NING - spindle\\training set')
raw_file = 'suj8_d2_nap.fif'
a_file = 'suj8_d2final_annotations.txt'
annotations = pd.read_csv(a_file)
raw = mne.io.read_raw_fif(raw_file,)

# Project helper object: receives the raw data and annotations and exposes
# `epochs` and `manual_labels`.
a=Filter_based_and_thresholding()
a.get_raw(raw)
a.get_epochs()
a.get_annotation(annotations)
# NOTE(review): "mauanl_label" looks like a misspelling of "manual_label";
# it is a method defined elsewhere, so it is left untouched here.
a.mauanl_label()
epochs = a.epochs
labels = a.manual_labels

# Shared CV splitter and classifier, both with fixed random_state.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=12345)
clf = make_pipeline(StandardScaler(),SVC(class_weight='balanced',random_state=12345))

# Time-resolved decoding with ROC-AUC scoring (step and length of 0.05).
td = mne.decoding.TimeDecoding(cv=cv,clf=clf,scorer='roc_auc',times={'step':0.05,'length':0.05},n_jobs=4)
td.fit(epochs,labels,)
td.score(epochs,labels)
td.plot()

# Epoch data with the last index of the third axis dropped.
data = epochs.get_data()[:,:,:-1]
# Consecutive (start, stop) pairs built from np.arange(0, 3.05, 0.05).
chunk = np.array(list(zip(np.arange(0,3.05,0.05)[:-1],np.arange(0,3.05,0.05)[1:])))
results = {'scores':[],'sig':[]}
# The pairs are multiplied by epochs.info['sfreq'] and cast to int --
# presumably converting seconds to sample indices; verify.
for slices in (chunk* epochs.info['sfreq']).astype(int):
    temp_data = data[:,:,slices[0]:slices[1]]
    # Vectorizer output is what the classifier pipeline receives.
    temp_data = mne.decoding.Vectorizer().fit_transform(temp_data)
    score,_,pValue = permutation_test_score(clf,temp_data,labels,cv=cv,random_state=12345,scoring='roc_auc',n_jobs=4)
    results['scores'].append(score)
    results['sig'].append(pValue)
# Script fragment: permutation-tests a linear SVC on iris features padded with
# 2200 columns of Gaussian noise, then starts plotting the result.
# `iris` is defined outside this fragment.
X = iris.data
y = iris.target
n_classes = np.unique(y).size

# Some noisy data not correlated
random = np.random.RandomState(seed=0)
E = random.normal(size=(len(X), 2200))

# Add noisy data to the informative features to make the task harder
X = np.c_[X, E]

svm = SVC(kernel="linear")
cv = StratifiedKFold(2)

# Accuracy permutation test: 100 permutations, single job.
score, permutation_scores, pvalue = permutation_test_score(
    svm, X, y, scoring="accuracy", cv=cv, n_permutations=100, n_jobs=1
)

print("Classification score %s (pvalue : %s)" % (score, pvalue))

###############################################################################
# View histogram of permutation scores
plt.hist(permutation_scores, 20, label="Permutation scores")
# Limits are captured so the vertical line can span the histogram height.
ylim = plt.ylim()
# BUG: vlines(..., linestyle='--') fails on older versions of matplotlib
# plt.vlines(score, ylim[0], ylim[1], linestyle='--',
#          color='g', linewidth=3, label='Classification Score'
#          ' (pvalue %s)' % pvalue)
# plt.vlines(1.0 / n_classes, ylim[0], ylim[1], linestyle='--',
#          color='k', linewidth=3, label='Luck')
# Dashed green line at the observed score, drawn with plot() instead.
plt.plot(2 * [score], ylim, "--g", linewidth=3, label="Classification Score" " (pvalue %s)" % pvalue)
# Script fragment: permutation-tests an averaged logistic regression for each
# entry of a saved GAT model's estimators_. `cls_all`, `pln_all` and
# `data_path` are defined outside this fragment.
data_cls = np.asarray(cls_all)
data_pln = np.asarray(pln_all)

# Load GAT model
gat = joblib.load(data_path + "decode_time_gen/gat_cp.jl")

# Setup data for epochs and cross validation
X = np.vstack([data_cls, data_pln])
# Labels: 0 for rows from data_cls, 1 for rows from data_pln.
y = np.concatenate([np.zeros(len(data_cls)), np.ones(len(data_pln))])
# Shuffled 7-fold splitter without a fixed random_state.
cv = StratifiedKFold(n_splits=7, shuffle=True)

perm_score_results = []

# NOTE(review): the original indentation was lost; the inner loop is assumed
# to cover only the construction of lr_mean, with one permutation test per j.
# Confirm against the original file.
for j, est in enumerate(gat.estimators_):
    # `tmp` is never used: each pass rebuilds the same averaged model.
    for tmp in est:
        lr_mean = LogisticRegression(C=0.0001)
        # Average coefficients and intercepts over the estimators in `est`.
        lr_mean.coef_ = np.asarray([lr.coef_ for lr in est]).mean(
            axis=0).squeeze()
        lr_mean.intercept_ = np.asarray([lr.intercept_ for lr in est]).mean()

    # NOTE(review): permutation_test_score is documented to clone and refit
    # the estimator, so the averaged coef_/intercept_ set above are likely
    # not what gets scored -- confirm.
    score, perm_score, pval = permutation_test_score(
        lr_mean, X[:, :, j], y, cv=cv, scoring="roc_auc", n_permutations=2000)
    perm_score_results.append({
        "score": score,
        "perm_score": perm_score,
        "pval": pval
    })

# Results are written with joblib despite the .npy suffix in the file name.
joblib.dump(perm_score_results,
            data_path + "decode_time_gen/perm_score_results_cp.npy")