Example #1
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] // 2)  # // keeps the repeat count an integer
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
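The snippet above targets an older scikit-learn, where Imputer lived in sklearn.preprocessing and MockClassifier is a stub from the test suite. A minimal sketch of the same NaN check against the modern API (assuming scikit-learn >= 0.20, with LogisticRegression standing in for the mock):

# Sketch, not the original test: SimpleImputer replaces the removed Imputer.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import permutation_test_score
from sklearn.pipeline import Pipeline

X = np.arange(200, dtype=np.float64).reshape(10, -1)
X[2, :] = np.nan  # one row of missing values for the imputer to fill
y = np.repeat([0, 1], X.shape[0] // 2)
p = Pipeline([
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('classifier', LogisticRegression()),
])
permutation_test_score(p, X, y, cv=5)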
Example #3
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_label, _, pvalue_label = permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) /
                y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
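This test also predates the rename of the labels keyword; in current sklearn.model_selection the argument is called groups. A minimal sketch of the equivalent call on a recent scikit-learn:

# Sketch: same check, modern argument name.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, permutation_test_score
from sklearn.svm import SVC

iris = load_iris()
score, perm_scores, pvalue = permutation_test_score(
    SVC(kernel='linear'), iris.data, iris.target, n_permutations=30,
    cv=StratifiedKFold(2), scoring="accuracy",
    groups=np.ones(iris.target.size), random_state=0)
assert score > 0.9  # linear SVC separates iris well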
Example #5
    def permutation(self):
        score, permutation_scores, pvalue = permutation_test_score(
            self.estimator,
            self.X,
            self.y,
            scoring="accuracy",
            cv=self.cv,
            n_permutations=self.n_permutation)

        print("Classification score %s (pvalue : %s)" % (score, pvalue))
        n_classes = np.unique(self.y).size
        # View histogram of permutation scores
        plt.hist(permutation_scores,
                 20,
                 label='Permutation scores',
                 edgecolor='black')
        ylim = plt.ylim()
        plt.plot(2 * [score],
                 ylim,
                 '--g',
                 linewidth=3,
                 label='Classification Score'
                 ' (pvalue %s)' % pvalue)
        plt.plot(2 * [1. / n_classes], ylim, '--k', linewidth=3, label='Luck')

        plt.ylim(ylim)
        plt.legend()
        plt.xlabel('Score')
        plt.show()
Example #6
def _p_value_from_permutation(X, y):
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import permutation_test_score

    # classify with a linear SVM
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    # scale input to unit var and zero mean
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # run permutation test
    score, permutation_scores, pvalue = permutation_test_score(
        svm,
        X_scaled,
        y,
        scoring="f1_macro",
        cv=cv,
        n_permutations=100,
        n_jobs=6)

    return pvalue, score
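A hypothetical invocation on synthetic data (the shapes are assumptions; any (n_samples, n_features) matrix with a binary label vector works):

# Hypothetical usage on synthetic data.
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(40, 5))  # 40 samples, 5 features
y = np.repeat([0, 1], 20)     # two balanced classes
pvalue, score = _p_value_from_permutation(X, y)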
Example #7
def final_classif(pipeline, cv, X, y, groups, model, norm, n_perms=1000):
    score, permutation_scores, pvalue = permutation_test_score(
        pipeline,
        X,
        y,
        groups=groups,
        cv=cv,
        n_permutations=n_perms,
        n_jobs=-1,
        scoring="roc_auc",
    )
    results = {
        "acc_score": score,
        "acc_pscores": permutation_scores,
        "acc_pvalue": pvalue,
    }
    # Get DA train and feature importance
    pipeline.fit(X, y)
    results["DA_train"] = pipeline.score(X, y)
    if model == "RF":
        if norm == 1:
            results["feature_importances"] = pipeline[
                "classifier"].feature_importances_
        else:
            results["feature_importances"] = pipeline.feature_importances_
    elif model == "LR" or model == "SVM":
        if norm == 1:
            results["feature_importances"] = pipeline[
                "classifier"].coef_.squeeze()
        else:
            results["feature_importances"] = pipeline.coef_.squeeze()
    return results
Example #8
def fit_huber(data, targets, permute=True):
    """
    Huber regression
    """
    cv = GridSearchCV(
        HuberRegressor(),
        param_grid={
            "epsilon":np.linspace(1, 3, 20),
            "alpha":np.logspace(-10, 0, 10),
        },
        n_jobs=3,
        error_score=0,
        verbose=0,
        cv=3,
    )
    cv.fit(data.values, targets)
    if permute:
        p = permutation_test_score(
            cv,
            data,
            targets,
            # cv=10,
            n_jobs=3,
            n_permutations=1000,
        )
        return cv.best_params_, cv.best_score_, p[-1]
    else:
        return cv.best_params_, cv.best_score_, -1
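A hypothetical call on synthetic regression data, assuming the snippet's module-level imports (GridSearchCV, HuberRegressor, np) are in scope; permute=False skips the thousand-permutation test that dominates the runtime:

# Hypothetical usage with a small synthetic regression problem.
import pandas as pd
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=60, n_features=4, noise=5.0, random_state=0)
best_params, best_score, pval = fit_huber(pd.DataFrame(X), y, permute=False)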
Example #9
def plot_permutation(model, test_data, test_class):
    cv = StratifiedKFold(2)
    score, permutation_scores, pvalue = permutation_test_score(
        model,
        test_data,
        test_class,
        scoring="accuracy",
        cv=cv,
        n_permutations=100,
        n_jobs=1)

    print("Classification score %s (pvalue : %s)" % (score, pvalue))

    # #############################################################################
    # View histogram of permutation scores
    plt.figure()
    plt.hist(permutation_scores,
             20,
             label='Permutation scores',
             edgecolor='green')
    ylim = plt.ylim()

    #plt.plot(2 * [score], ylim, '--b', linewidth=1, label="Classification Score = {0:.4f}".format(score))
    plt.plot(2 * [1. / 2], ylim, '--k', linewidth=3, label='Luck')

    plt.ylim(ylim)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
Example #10
def fit_svm(X, y, comment, use_x_normalization, kernel=None):
    print("------------------------------")
    print(comment)
    print("------------------------------")
    np.random.seed(1)
    if use_x_normalization:
        X = StandardScaler().fit_transform(X)

    train_scores = np.array([])
    val_scores = np.array([])
    
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        if kernel is None:
            clf = svm.SVC()
        else:
            clf = svm.SVC(kernel=kernel)
        clf.fit(X_train, y_train)

        print('start to calculate p value')
        score, permutation_scores, pvalue = permutation_test_score(clf, X_train, y_train, scoring="accuracy", cv=kf, n_permutations=10, n_jobs=1)
        print(score, permutation_scores, pvalue)
        
        train_scores = np.append(train_scores, clf.score(X_train, y_train) * 100)
        val_scores = np.append(val_scores, clf.score(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
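A hypothetical call on the iris data, assuming the snippet's module-level imports (np, svm, KFold, StandardScaler, permutation_test_score):

# Hypothetical usage on iris.
from sklearn.datasets import load_iris

iris = load_iris()
fit_svm(iris.data, iris.target, 'iris, default RBF kernel',
        use_x_normalization=True)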
Example #11
def fit_rfc(data, targets, permute=True):
    """
    Random forest classifier
    """
    cv = RandomizedSearchCV(
        RandomForestClassifier(n_estimators=50),
        param_distributions={
            "max_depth": np.append(np.arange(5, 50), None),
            "min_samples_split": np.arange(2, 15),
            "min_samples_leaf": np.arange(1, 10),
            "max_features": np.arange(1, data.shape[1]),
        },
        n_jobs=3,
        error_score=0,
        n_iter=100,
        verbose=1,
        cv=3,
        scoring=make_scorer(roc_auc_score)
    )
    cv.fit(data.values, targets)
    if permute:
        p = permutation_test_score(
            cv,
            data,
            targets,
            # cv=10,
            n_jobs=3,
            n_permutations=1000,
        )
        return cv.best_params_, cv.best_score_, p[-1]
    else:
        return cv.best_params_, cv.best_score_, -1
Example #12
def fit_svc(data, targets, permute=True):
    """
    Huber regression
    """
    cv = GridSearchCV(
        LinearSVC(dual=False),
        param_grid={
            "C":np.logspace(-10,5,16),
        },
        n_jobs=3,
        error_score=0,
        scoring=make_scorer(roc_auc_score, average="weighted"),
        verbose=1,
        cv=3,
    )
    cv.fit(StandardScaler().fit_transform(data.values), targets)
    if permute:
        p = permutation_test_score(
            cv,
            data,
            targets,
            # cv=10,
            n_jobs=3,
            n_permutations=1000,
        )
        return cv.best_params_, cv.best_score_, p[-1]
    else:
        return cv.best_params_, cv.best_score_, -1
Example #13
def fit_svr(data, targets, permute=True):
    """
    Huber regression
    """
    cv = GridSearchCV(
        LinearSVR(dual=False, loss="squared_epsilon_insensitive"),
        param_grid={
            "C":np.logspace(-10,5,16),
            "epsilon":np.logspace(-10,5,16),
        },
        n_jobs=3,
        error_score=0,
        verbose=1,
        cv=3,
    )
    cv.fit(StandardScaler().fit_transform(data.values), targets)
    if permute:
        p = permutation_test_score(
            cv,
            data,
            targets,
            # cv=10,
            n_jobs=3,
            n_permutations=1000,
        )
        return cv.best_params_, cv.best_score_, p[-1]
    else:
        return cv.best_params_, cv.best_score_, -1
Example #14
def permutation_test(X, y, group, clf, num_permutation=1000):
    """ Helper function to validate that a classifier is performing higher than chance

        Args:
            X (numpy matrix): this is the feature matrix with row being a data point
            y (numpy vector): this is the label vector with row belonging to a data point
            group (numpy vector): this is the group vector (which is a the participant id)
            clf (sklearn classifier): this is a classifier made in sklearn with fit, transform and predict functionality
            num_permutation (int): the number of time to permute y
            random_state (int): this is used for reproducible output
        Returns:
            f1s (list): the f1 at for each leave one out participant

    """

    logo = LeaveOneGroupOut()
    train_test_splits = logo.split(X, y, group)

    with joblib.parallel_backend('loky'):
        (accuracies, permutation_scores,
         p_value) = permutation_test_score(clf,
                                           X,
                                           y,
                                           groups=group,
                                           cv=train_test_splits,
                                           n_permutations=num_permutation,
                                           verbose=num_permutation,
                                           n_jobs=-1)

    return accuracies, permutation_scores, p_value
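A hypothetical usage with three synthetic participants, assuming joblib, LeaveOneGroupOut and permutation_test_score are imported as in the snippet:

# Hypothetical usage: three participants, balanced binary labels.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.normal(size=(30, 4))
y = np.tile([0, 1], 15)           # balanced labels within each participant
group = np.repeat([1, 2, 3], 10)  # three participants, 10 samples each
acc, perm_scores, p = permutation_test(X, y, group, LogisticRegression(),
                                       num_permutation=100)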
Example #15
def cross_session_NB2(train_session,
                      test_session,
                      bin_length=2,
                      predictor='traces',
                      neurons=None):
    X_train, X_test, y_train, y_test = \
        preprocess_NB_cross_session(train_session, test_session,
                                    bin_length=bin_length,
                                    predictor=predictor,
                                    neurons=neurons)

    X = np.concatenate((X_train, X_test))
    y = y_train + y_test

    train_label = np.zeros(len(y_train), dtype=int)
    test_label = np.ones(len(y_test), dtype=int)
    groups = np.concatenate((train_label, test_label))

    cv = LeaveOneGroupOut()

    if predictor == 'traces':
        classifier = make_pipeline(StandardScaler(), GaussianNB())
    elif predictor == 'events':
        classifier = make_pipeline(MultinomialNB())
    else:
        raise ValueError('Predictor incorrectly defined.')

    score, permutation_scores, p_value = \
        permutation_test_score(classifier, X, y, scoring='accuracy',
                               groups=groups, cv=cv, n_permutations=1000,
                               n_jobs=1)

    return score, permutation_scores, p_value
Example #16
def y_randomization(rf_best, X_train, y_train, descritor, algoritimo):
    permutations = 20
    score, permutation_scores, pvalue = permutation_test_score(rf_best, X_train, y_train,
                                                               cv=5, scoring='balanced_accuracy',
                                                               n_permutations=permutations,
                                                               n_jobs=-1,
                                                               verbose=1,
                                                               random_state=24)
    print('True score = ', score.round(2),
          '\nPerm. mean = ', np.mean(permutation_scores).round(2),
          '\np-value = ', pvalue.round(4))

    ###############################################################################
    # View histogram of permutation scores
    pl.subplots(figsize=(10,6))
    pl.hist(permutation_scores.round(2), label='Permutation scores')
    ylim = pl.ylim()
    pl.vlines(score, ylim[0], ylim[1], linestyle='--',
              color='g', linewidth=3, label='Classification Score'
              ' (pvalue %s)' % pvalue.round(4))
    pl.vlines(1.0 / 2, ylim[0], ylim[1], linestyle='--',
              color='k', linewidth=3, label='Luck')
    pl.ylim(ylim)
    pl.legend()
    pl.xlabel('Score')
    pl.title('Y randomization '+algoritimo+'X'+descritor, fontsize=12)
    pl.savefig('figures/y_randomization-'+descritor+'X'+algoritimo+'.png', bbox_inches='tight', transparent=False, format='png', dpi=300)
    pl.show()
Example #17
def compute_p_value(path, orientations, repetitions, kernel, cost, gamma, degree):
    data = pd.read_csv(path, header=None)
    X = data.iloc[:, :-1].values
    Y = data.iloc[:, -1].values

    if orientations is not None:
        new_x = X[np.logical_or(Y == orientations[0], Y == orientations[1])]
        new_y = Y[np.logical_or(Y == orientations[0], Y == orientations[1])]
    else:
        new_x = X
        new_y = Y

    cv = StratifiedKFold(5)
    pipeline = Pipeline([('scaler', StandardScaler()), ('SVM', SVC(kernel=kernel, C=cost, gamma=gamma, degree=degree))])
    score, permutation_scores, pvalue = permutation_test_score(
        pipeline, new_x, new_y, scoring="accuracy", cv=cv, n_permutations=repetitions, n_jobs=-1)

    plt.hist(permutation_scores, 20, label='Permutation scores',
             edgecolor='black')
    ylim = plt.ylim()
    plt.plot(2 * [score], ylim, '--g', linewidth=3,
             label='Classification Score'
                   ' (pvalue %s)' % pvalue)
    plt.plot(2 * [1. / np.unique(Y).shape[0]], ylim, '--k', linewidth=3, label='Luck')

    plt.ylim(ylim)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
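A hypothetical invocation; features.csv is an assumed path to a file storing one sample per row with the class label in the last column:

# Hypothetical usage; 'features.csv' is an assumed path.
compute_p_value('features.csv', orientations=None, repetitions=100,
                kernel='rbf', cost=1.0, gamma='scale', degree=3)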
Example #18
def run_cv_voxel(v,
                 model,
                 features,
                 fmri_data,
                 cv,
                 groups,
                 scoring,
                 permutations=None):
    cv_splits = cv.split(features, groups, groups=groups)
    if permutations:
        score, _, pvalue = permutation_test_score(model,
                                                  features,
                                                  fmri_data[:, v],
                                                  groups=groups,
                                                  scoring=scoring,
                                                  cv=cv_splits,
                                                  n_permutations=permutations,
                                                  n_jobs=1)
        return score, pvalue
    else:
        score = np.mean(
            cross_val_score(model,
                            features,
                            fmri_data[:, v],
                            groups=groups,
                            scoring=scoring,
                            cv=cv_splits,
                            n_jobs=1))
        return score
Example #19
def test_permutation_test_score_pandas():
    # check permutation_test_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        iris = load_iris()
        X, y = iris.data, iris.target
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        permutation_test_score(clf, X_df, y_ser)
Example #21
def feature_importance(data):
    labels = [
        'srch_id',
        'site_id',
        'prop_id',
        'prop_starrating',
        'prop_review_score',
        'prop_brand_bool',
        'prop_location_score1',
        'prop_location_score2',
        'position',
        'price_usd',
        'promotion_flag',
        # 'srch_saturday_night_bool'
        # 'random_bool',
        # 'click_bool',
        # 'booking_bool',
        # 'price_usd_normalized',
        # 'consumer'
        # 'Pclass'
        # 'score'
    ]

    # data = data.apply(lambda x: pd.factorize(x)[0])

    y = (data['booking_bool'])

    x = data[labels]

    X = StandardScaler().fit_transform(x)

    n_classes = np.unique(y).size

    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, permutation_scores, pvalue = permutation_test_score(
        svm, X, y, scoring="accuracy", cv=cv, n_permutations=100, n_jobs=1)

    print("Classification score %s (pvalue : %s)" % (score, pvalue))

    plt.hist(permutation_scores, 20, label='Permutation scores')
    ylim = plt.ylim()
    plt.plot(2 * [score],
             ylim,
             '--g',
             linewidth=3,
             label='Classification Score'
             ' (pvalue %s)' % pvalue)
    plt.plot(2 * [1. / n_classes], ylim, '--k', linewidth=3, label='Luck')

    plt.ylim(ylim)
    plt.legend()
    plt.xlabel('Score')
    plt.show()
Example #22
def NB_session_permutation(X, Y):
    # Build classifier and cross-validation object.
    classifier = make_pipeline(StandardScaler(), GaussianNB())
    cv = StratifiedKFold(2)

    # Classify and permutation tests.
    score, permutation_scores, p_value = \
        permutation_test_score(classifier, X, Y, scoring='accuracy',
                               cv=cv, n_permutations=500, n_jobs=1)

    return score, permutation_scores, p_value
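A hypothetical call on synthetic session data, assuming the snippet's imports (make_pipeline, StandardScaler, GaussianNB, StratifiedKFold, permutation_test_score):

# Hypothetical usage on synthetic data.
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(40, 8))
Y = np.tile([0, 1], 20)
score, perm_scores, p_value = NB_session_permutation(X, Y)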
Example #23
def permutation_test(X, y, estimator, n_permutations, kFold):

    score, permutation_scores, p_value = permutation_test_score(
        estimator=estimator,
        X=X,
        y=y,
        scoring='balanced_accuracy',
        cv=StratifiedKFold(kFold),
        n_permutations=n_permutations,
        n_jobs=1)
    return score, permutation_scores, p_value
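A hypothetical call, assuming StratifiedKFold and permutation_test_score are imported as in the snippet:

# Hypothetical usage with a simple linear classifier.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.normal(size=(60, 5))
y = np.tile([0, 1], 30)
score, perm_scores, p_value = permutation_test(
    X, y, LogisticRegression(), n_permutations=100, kFold=5)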
Example #24
def getMLModelPerf(ml_df,roi_cols,covar_continuous_cols,covar_cat_cols,outcome_col,model_type,ml_model,rank_features,n_splits=10,n_repeats=10,n_jobs=1):
    """ Takes a model (classification or regression) instance and computes cross val scores.
        Uses repeated stratified KFold for classification and ShuffleSplit for regression.
    """     
    X = ml_df[roi_cols].values
    X_col_names = roi_cols.copy()
    # Check input var types and create dummy vars if needed
    if len(covar_continuous_cols) > 0:
        X_continuous_covar = ml_df[covar_continuous_cols].values
        print('Using {} continuous covar'.format(len(covar_continuous_cols)))
        X = np.hstack((X, X_continuous_covar))
        X_col_names += list(covar_continuous_cols)
    if len(covar_cat_cols) > 0:
        X_cat_covar_df = pd.get_dummies(ml_df[covar_cat_cols])
        X_cat_covar = X_cat_covar_df.values
        print('Using {} categorical cols as {} cat covar (dummies)'.format(covar_cat_cols,X_cat_covar.shape[1]))
        X = np.hstack((X, X_cat_covar))
        X_col_names += list(X_cat_covar_df.columns)

    print('n of input columns: {}'.format(len(X_col_names)))
    if model_type.lower() == 'classification':
        y = pd.get_dummies(ml_df[outcome_col]).values[:,0]
        print('Data shapes X {}, y {} ({})'.format(X.shape, len(y), list(ml_df[outcome_col].value_counts())))  
        perf_metric = 'roc_auc'
        cv = RepeatedStratifiedKFold(n_splits=n_splits,n_repeats=n_repeats,random_state=0)
    elif model_type.lower() == 'regression':
        y = ml_df[outcome_col].values
        print('Data shapes X {}, y {} ({:3.2f}m, {:3.2f}sd)'.format(X.shape, len(y), np.mean(y),np.std(y)))   
        perf_metric = 'neg_mean_squared_error'
        cv = ShuffleSplit(n_splits=n_splits*n_repeats, random_state=0)
    else:
        print('unknown model type {} (needs to be classification or regression)'.format(model_type))

    print('Using {} model with perf metric {}'.format(model_type, perf_metric))
    perf = cross_val_score(ml_model, X, y, scoring=perf_metric,cv=cv, n_jobs=n_jobs)
    scores_df = pd.DataFrame(columns=[perf_metric])
    scores_df[perf_metric] = perf
    print(' Perf mean:{:4.3f}, sd:{:4.3f}'.format(np.mean(perf),np.std(perf)))

    # Null model 
    null_cv = ShuffleSplit(n_splits=n_repeats, random_state=0) #10x10xn_permutations are too many. 
    _, permutation_scores, pvalue = permutation_test_score(ml_model, X, y, scoring=perf_metric, cv=null_cv, n_permutations=10, n_jobs=n_jobs)
    null_df = pd.DataFrame()
    null_df[perf_metric] = permutation_scores

    # Feature ranks based on RFECV
    feature_ranks_df = pd.DataFrame()
    if rank_features:
        feature_ranks, feature_grid_scores = get_feature_importance(ml_model, X, y, perf_metric, cv=cv, n_jobs=n_jobs)
        feature_ranks_df['predictor'] = X_col_names
        feature_ranks_df['rank'] = feature_ranks
        feature_ranks_df['grid_scores'] = feature_grid_scores

    return scores_df, null_df, pvalue, feature_ranks_df
Example #25
def permutation_test(dataset, clf, num_permutation):
    train_test_splits = man.generate_train_test_splits(dataset)
    (accuracy, permutation_scores,
     p_value) = permutation_test_score(clf,
                                       dataset.X,
                                       dataset.y,
                                       groups=dataset.I,
                                       cv=train_test_splits,
                                       n_permutations=num_permutation,
                                       verbose=num_permutation,
                                       n_jobs=-1)
    return (accuracy, permutation_scores, p_value)
Example #26
    def test_permutation_test_score(self):
        import sklearn.svm as svm
        iris = datasets.load_iris()

        df = pdml.ModelFrame(iris)
        clf = svm.SVC(kernel=str('linear'), C=1)
        result = df.model_selection.permutation_test_score(clf, cv=5)
        expected = ms.permutation_test_score(clf, iris.data, y=iris.target, cv=5)

        self.assertEqual(len(result), 3)
        self.assertEqual(result[0], expected[0])
        self.assert_numpy_array_almost_equal(result[1], expected[1])
        self.assertEqual(result[2], expected[2])
Example #28
def knn_testing(principalDf, labels):
    features = principalDf[['principal component 1',
                            'principal component 2']].to_numpy()
    #create train, test sets
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels.to_numpy(),
                                                        test_size=0.2,
                                                        random_state=2)
    #Create KNN Classifier
    knn = KNeighborsClassifier(n_neighbors=2)
    #Train the model using the training sets
    knn.fit(X_train, y_train.ravel())
    #Predict the response for test dataset
    y_pred = knn.predict(X_test)
    # Model Accuracy, how often is the classifier correct?

    accuracy = (metrics.accuracy_score(y_test, y_pred))

    score, permutation_scores, pvalue = permutation_test_score(
        knn,
        X_train,
        y_train.ravel(),
        scoring="accuracy",
        n_permutations=100,
        n_jobs=1)

    confusion_matrix = metrics.confusion_matrix(y_test,
                                                y_pred,
                                                normalize='true')

    np.set_printoptions(precision=2)
    # Plot non-normalized and normalized confusion matrices
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = metrics.plot_confusion_matrix(knn,
                                             X_test,
                                             y_test,
                                             display_labels=['1', '2'],
                                             cmap=plt.cm.Blues,
                                             normalize=normalize)
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)
    fig_matrix = plt

    return accuracy, score, pvalue, confusion_matrix, fig_matrix
Example #29
def print_permutation_plots(clf, cv, X_test, y_test):
    score_dataset, perm_scores_dataset, pvalue_dataset = permutation_test_score(
        clf,
        X_test,
        y_test,
        scoring="accuracy",
        cv=cv,
        n_permutations=1000,
        n_jobs=-1)

    fig, ax = plt.subplots()
    ax.hist(perm_scores_dataset, bins=20, density=True)
    ax.axvline(score_dataset, ls="--", color="r")
    score_label = f"Score on original\ndata: {score_dataset:.2f}\n(p-value: {pvalue_dataset:.3f})"
    ax.text(0.7, 10, score_label, fontsize=12)
    ax.set_xlabel("Accuracy score")
    _ = ax.set_ylabel("Probability")
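A hypothetical call on iris (the snippet expects held-out data; the full set is used here only for brevity), assuming matplotlib.pyplot as plt and permutation_test_score are imported:

# Hypothetical usage on iris.
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

iris = load_iris()
print_permutation_plots(SVC(kernel='linear'), StratifiedKFold(2),
                        iris.data, iris.target)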
Example #30
def ROC(x, y, n_perm=None, clf=None):
    """
    Perform ROC analysis with optional permutation test.

    y values have to be 0 and 1 for calc_auc()!
    """

    # Remove NaN values.
    idx = np.logical_and(~np.isnan(x), ~np.isnan(y))
    x, y = np.array(x[idx]), np.array(y[idx])

    # Insufficient sample size or not exactly two values to classify.
    n_yvals = len(np.unique(y))
    if (min(len(x), len(y)) < min_sample_size) or (n_yvals != 2):
        if n_yvals > 2:
            print('More than two values to classify:' + str(np.unique(y)))
        return np.nan, None

    # Format x into array of arrays.
    x = np.array(x, ndmin=2).T

    # Default classifier.
    if clf is None:
        clf = LogisticRegression()

    # Calculate AUC of true data.
    true_auc = calc_auc(clf, x, y)

    # Permutation test.
    pvalue = None
    if n_perm is not None and n_perm > 0:

        cv = StratifiedKFold(n_folds)

        # Test significance of classification with cross-validated permutation.
        res = permutation_test_score(clf,
                                     x,
                                     y,
                                     scoring='accuracy',
                                     cv=cv,
                                     n_permutations=n_perm,
                                     n_jobs=n_jobs)
        score, perm_scores, pvalue = res

    return true_auc, pvalue
Example #31
def evaluate_model(estimator, eval_x, eval_y, cv):
    n_permutations = 1#00
    sfm = SelectFromModel(estimator=estimator, prefit=True, max_features=10, threshold=-np.inf)
    sfm.transform(estimator._transform(eval_x))

    best_features = np.asarray(estimator.named_steps["adaptor"].columns)[sfm.get_support()]

    true_score, perm_scores, pval = permutation_test_score(estimator, eval_x, eval_y, scoring="roc_auc",
                                                           cv=cv, n_permutations=n_permutations, n_jobs=-1)
    LOG.info("Permutation test scores:\nFor {} permutations, p-value : {}\n".format(n_permutations, pval))
    LOG.info("Best features : {}".format(best_features))
    res = {
        "best_features": np.array2string(best_features),
        "ROC_AUC_score": true_score,
        "pval": pval,
        "perm_scores": np.array2string(perm_scores)
            }
    if hasattr(estimator, "threshold"):
        res["threshold"] = estimator.threshold
    return res
Example #32
def fit_elasticnet(data, targets, permute=True):
    """
    Elasticnet regression
    """
    cv = ElasticNetCV()
    cv.fit(StandardScaler().fit_transform(data.values), targets)
    params = {"alpha":cv.alpha_, "l1_ratio":cv.l1_ratio_}
    score = cv.score(StandardScaler().fit_transform(data.values), targets)
    if permute:
        p = permutation_test_score(
            cv,
            data,
            targets,
            # cv=10,
            n_jobs=3,
            n_permutations=1000,
        )
        return params, score, p[-1]
    else:
        return params, score, -1
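A hypothetical call on synthetic data, assuming ElasticNetCV and StandardScaler are imported as in the snippet. Note that the permutation branch hands the already-fitted ElasticNetCV back to permutation_test_score, so every permutation re-runs its internal cross-validation; permute=False avoids that cost:

# Hypothetical usage with synthetic regression data.
import pandas as pd
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=80, n_features=6, noise=3.0, random_state=0)
params, score, pval = fit_elasticnet(pd.DataFrame(X), y, permute=False)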
Example #33
    def fit(self, X, y, run_labels=None):

        self.X = _check_input_data(X,
                                   mask_img=self.mask_img,
                                   return_first_element=True)
        self.y = y
        self.run_labels = run_labels

        # scale within each pattern if specified
        if (self.scaling_direction == 'pattern') | (self.scaling_direction
                                                    == 'both'):
            self.X = self.scaler.fit_transform(X.T).T

        if self.cross_val_scheme == 'run':
            if self.run_labels is None:
                raise ValueError("run_labels must not be None if 'run' is"
                                 " selected for cross_val_scheme")
        else:
            # ensure that data is not grouped
            self.run_labels = None

        cross_validator = _get_cross_val_scheme(self.cross_val_scheme)
        if self.n_permutations is not None:
            res = permutation_test_score(self.pipeline,
                                         X=self.X,
                                         y=self.y,
                                         groups=self.run_labels,
                                         cv=cross_validator,
                                         n_permutations=self.n_permutations)
            self.accuracies_, self.permutation_scores_, self.pval_ = res
        else:
            self.accuracies_ = cross_val_score(self.pipeline,
                                               X=self.X,
                                               y=self.y,
                                               groups=self.run_labels,
                                               cv=cross_validator)
            self.permutation_scores_ = None
            self.pval_ = None

        self.__fit_status = True
Example #34
def optimize_and_cv(features_orig_norm, labels_orig_bin, groups_orig, Clist,permut=True):
    print('GridSearchCV')
    classifier= svm.LinearSVC( loss='hinge', max_iter=20000, class_weight='balanced')
    gfk=GroupKFold(n_splits=10)
    clf=GridSearchCV(classifier, Clist, cv=gfk, scoring=['f1_macro', 'f1_micro'],refit=False, return_train_score=False)
    clf.fit(features_orig_norm, np.ravel(labels_orig_bin),np.ravel(groups_orig))
    
    GridResults=pd.DataFrame(clf.cv_results_)
    Cdict=GridResults.loc[GridResults['rank_test_f1_macro']== 1]['params']
    Cnum=Cdict.iloc[0].get('C')
    
    
    clf = make_pipeline(svm.LinearSVC(C=Cnum , max_iter=50000,loss='hinge', class_weight='balanced'))
    gfk=GroupKFold(n_splits=10)
    
    scoring = {'f1macro': 'f1_macro',
              
                'accuracy': 'accuracy'}
    print('Cross-validation starts')
    
    scores=cross_validate(clf, features_orig_norm,np.ravel(labels_orig_bin),np.ravel(groups_orig) ,cv=gfk, scoring=scoring, return_train_score=True)
    if permut:
        print('Permutation starts')
        score, permutation_scores, pvalue = permutation_test_score(classifier, features_orig_norm, labels_orig_bin, scoring='f1_macro', cv=10, n_permutations=100)
    else:
        pvalue=0
    
    d_fin = pd.DataFrame(scores)
    final_results = np.mean(d_fin)
    
    print(final_results)
    test_f1micro=final_results.loc['test_accuracy']
    test_f1macro=final_results.loc['test_f1macro']
    return test_f1micro, test_f1macro, Cnum ,d_fin, pvalue
Example #35
import pandas as pd
import os
os.chdir('D:\\NING - spindle\\training set')
raw_file = 'suj8_d2_nap.fif'
a_file = 'suj8_d2final_annotations.txt'
annotations = pd.read_csv(a_file)        
raw = mne.io.read_raw_fif(raw_file,)
a=Filter_based_and_thresholding()
a.get_raw(raw)
a.get_epochs()
a.get_annotation(annotations)
a.mauanl_label()
epochs = a.epochs
labels = a.manual_labels
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=12345)
clf = make_pipeline(StandardScaler(),SVC(class_weight='balanced',random_state=12345))
td = mne.decoding.TimeDecoding(cv=cv,clf=clf,scorer='roc_auc',times={'step':0.05,'length':0.05},n_jobs=4)
td.fit(epochs,labels,)
td.score(epochs,labels)
td.plot()
data = epochs.get_data()[:,:,:-1]
chunk = np.array(list(zip(np.arange(0,3.05,0.05)[:-1],np.arange(0,3.05,0.05)[1:])))
results = {'scores':[],'sig':[]}
for slices in (chunk* epochs.info['sfreq']).astype(int):
    temp_data = data[:,:,slices[0]:slices[1]]
    temp_data = mne.decoding.Vectorizer().fit_transform(temp_data)
    score,_,pValue = permutation_test_score(clf,temp_data,labels,cv=cv,random_state=12345,scoring='roc_auc',n_jobs=4)
    results['scores'].append(score)
    results['sig'].append(pValue)

Example #36
X = iris.data
y = iris.target
n_classes = np.unique(y).size

# Some noisy data not correlated
random = np.random.RandomState(seed=0)
E = random.normal(size=(len(X), 2200))

# Add noisy data to the informative features to make the task harder
X = np.c_[X, E]

svm = SVC(kernel="linear")
cv = StratifiedKFold(2)

score, permutation_scores, pvalue = permutation_test_score(
    svm, X, y, scoring="accuracy", cv=cv, n_permutations=100, n_jobs=1
)

print("Classification score %s (pvalue : %s)" % (score, pvalue))

###############################################################################
# View histogram of permutation scores
plt.hist(permutation_scores, 20, label="Permutation scores")
ylim = plt.ylim()
# BUG: vlines(..., linestyle='--') fails on older versions of matplotlib
# plt.vlines(score, ylim[0], ylim[1], linestyle='--',
#          color='g', linewidth=3, label='Classification Score'
#          ' (pvalue %s)' % pvalue)
# plt.vlines(1.0 / n_classes, ylim[0], ylim[1], linestyle='--',
#          color='k', linewidth=3, label='Luck')
plt.plot(2 * [score], ylim, "--g", linewidth=3, label="Classification Score" " (pvalue %s)" % pvalue)
Example #37
data_cls = np.asarray(cls_all)
data_pln = np.asarray(pln_all)

# Load GAT model
gat = joblib.load(data_path + "decode_time_gen/gat_cp.jl")

# Setup data for epochs and cross validation
X = np.vstack([data_cls, data_pln])
y = np.concatenate([np.zeros(len(data_cls)), np.ones(len(data_pln))])
cv = StratifiedKFold(n_splits=7, shuffle=True)

perm_score_results = []
for j, est in enumerate(gat.estimators_):
    for tmp in est:
        lr_mean = LogisticRegression(C=0.0001)
        lr_mean.coef_ = np.asarray([lr.coef_ for lr in est]).mean(
            axis=0).squeeze()
        lr_mean.intercept_ = np.asarray([lr.intercept_ for lr in est]).mean()

    score, perm_score, pval = permutation_test_score(
        lr_mean, X[:, :, j], y, cv=cv, scoring="roc_auc", n_permutations=2000)
    perm_score_results.append({
        "score": score,
        "perm_score": perm_score,
        "pval": pval
    })

joblib.dump(perm_score_results,
            data_path + "decode_time_gen/perm_score_results_cp.npy")