Example #1
def test_return_estimators():
    """Test for consistency in the parameters"""
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    cv = StratifiedKFold(2)

    scores = run_cross_validation(X=X, y=y, data=df_iris, model='svm',
                                  cv=cv, return_estimator=None)

    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' not in scores

    scores, final = run_cross_validation(X=X, y=y, data=df_iris, model='svm',
                                         cv=cv, return_estimator='final')

    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' not in scores
    assert isinstance(final['svm'], svm.SVC)

    scores = run_cross_validation(X=X, y=y, data=df_iris, model='svm',
                                  cv=cv, return_estimator='cv')

    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' in scores

    scores, final = run_cross_validation(X=X, y=y, data=df_iris, model='svm',
                                         cv=cv, return_estimator='all')

    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' in scores
    assert isinstance(final['svm'], svm.SVC)
Example #2
def test_scorers():
    df_iris = load_dataset('iris')

    df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    result_scoring_name = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        scoring='accuracy', seed=42
    )
    result_scoring_function = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        scoring=make_scorer(accuracy_score), seed=42)

    result_scoring_name_list = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        scoring=['accuracy'], seed=42
    )
    result_scoring_function_dict = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        scoring=dict(accuracy=make_scorer(accuracy_score)), seed=42)

    assert_array_equal(result_scoring_name['test_score'],
                       result_scoring_function['test_score'])

    assert_array_equal(result_scoring_name['test_score'],
                       result_scoring_function_dict['test_accuracy'])

    assert_array_equal(result_scoring_name['test_score'],
                       result_scoring_name_list['test_accuracy'])
Example #3
def test_multiprocess_no_error():
    df_iris = load_dataset('iris')

    df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    model_params = {
        'svm__C': [1, 2, 3],
        'search': 'grid',
    }

    with parallel_backend('multiprocessing', n_jobs=-1):
        run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
            model_params=model_params)

    with parallel_backend('multiprocessing', n_jobs=-1):
        run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
            model_params=model_params, scoring='accuracy')
Example #4
def test_confound_removal_no_explicit_removal():
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]

    X = ['sepal_length', 'sepal_width', 'petal_length']
    conf = ['petal_width']
    y = 'species'
    scores_not_explicit = run_cross_validation(
        X=X, y=y, model='svm', preprocess_X='zscore', confounds=conf,
        preprocess_confounds='zscore', data=df_iris, seed=42)

    scores_explicit = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['remove_confound', 'zscore'], seed=42,
        preprocess_confounds='zscore')

    scores_explicit_z = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['zscore'], seed=42,
        preprocess_confounds='zscore')

    scores_not_explicit_no_preprocess = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=[], seed=42,
        preprocess_confounds='zscore')

    scores_explicit_no_preprocess = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['remove_confound'], seed=42,
        preprocess_confounds='zscore')

    scores_not_explicit_no_preprocess_at_all = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=[], seed=42,
        preprocess_confounds=[])

    scores_explicit_no_preprocess_at_all = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['remove_confound'], seed=42,
        preprocess_confounds=None)

    assert_array_equal(scores_explicit['test_score'],
                       scores_not_explicit['test_score'])

    assert_array_equal(scores_explicit['test_score'],
                       scores_explicit_z['test_score'])

    assert_array_equal(scores_explicit_no_preprocess['test_score'],
                       scores_not_explicit_no_preprocess['test_score'])

    assert_array_equal(scores_explicit_no_preprocess_at_all['test_score'],
                       scores_not_explicit_no_preprocess_at_all['test_score'])
Example #5
def do_scoring_test(X,
                    y,
                    data,
                    api_params,
                    sklearn_model,
                    scorers,
                    cv=None,
                    sk_y=None):

    if cv is None:
        cv = 'repeat:1_nfolds:2'
    sk_X = data[X].values
    if sk_y is None:
        sk_y = data[y].values

    np.random.seed(42)
    params_dict = {k: v for k, v in api_params.items()}
    if 'preprocess_X' not in params_dict:
        params_dict['preprocess_X'] = 'zscore'
    actual, actual_estimator = run_cross_validation(X=X,
                                                    y=y,
                                                    data=data,
                                                    scoring=scorers,
                                                    cv=cv,
                                                    return_estimator='final',
                                                    **params_dict)
    np.random.seed(42)
    sk_cv = prepare_cv(cv)
    expected = cross_validate(sklearn_model,
                              sk_X,
                              sk_y,
                              cv=sk_cv,
                              scoring=scorers)

    for scoring in scorers:
        s_key = f'test_{scoring}'
        assert len(actual.columns) == len(expected) + 2
        assert len(actual[s_key]) == len(expected[s_key])
        assert_array_almost_equal(actual[s_key], expected[s_key], decimal=5)

        # Compare the models
        clf1 = actual_estimator.dataframe_pipeline.steps[-1][1]
        clf2 = clone(sklearn_model).fit(sk_X, sk_y).steps[-1][1]
        compare_models(clf1, clf2)
Example #6
###############################################################################
# The dataset has three kinds of species. We will keep two to perform a binary
# classification.
df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]

X = ['sepal_length', 'sepal_width', 'petal_length']
y = 'species'

###############################################################################
# We will use a Random Forest classifier. By setting
# `return_estimator='final'`, the :func:`.run_cross_validation` function
# returns the estimator fitted with all the data.

scores, model_iris = run_cross_validation(X=X,
                                          y=y,
                                          data=df_iris,
                                          model='rf',
                                          preprocess_X='zscore',
                                          return_estimator='final')

###############################################################################
# This type of classifier has an internal variable that can tell us how
# *important* each of the features is. Caution: read the corresponding
# scikit-learn documentation (`Random Forest`_).
rf = model_iris['rf']

to_plot = pd.DataFrame({
    'variable': [x.replace('_', ' ') for x in X],
    'importance': rf.feature_importances_
})

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
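
###############################################################################
# The plotting call is cut off in this snippet. A minimal sketch of how the
# importances could be drawn with the `fig` and `ax` created above (the
# original example may use a different plotting call):
ax.bar(to_plot['variable'], to_plot['importance'])
ax.set_xlabel('variable')
ax.set_ylabel('importance')
fig.tight_layout()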
Example #7
def test_set_hyperparam():
    """Test setting one hyperparmeter"""
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    sk_X = df_iris[X].values
    sk_y = df_iris[y].values

    scoring = 'roc_auc'
    t_sk_y = (sk_y == 'setosa').astype(int)  # np.int was removed from NumPy

    with pytest.warns(RuntimeWarning,
                      match=r"Hyperparameter search CV"):
        model_params = {'cv': 5}
        _, _ = run_cross_validation(
            X=X, y=y, data=df_iris, model='svm',
            model_params=model_params, preprocess_X='zscore',
            seed=42, scoring='accuracy', pos_labels='setosa',
            return_estimator='final')
    with pytest.warns(RuntimeWarning,
                      match=r"Hyperparameter search method"):
        model_params = {'search': 'grid'}
        _, _ = run_cross_validation(
            X=X, y=y, data=df_iris, model='svm',
            model_params=model_params, preprocess_X='zscore',
            seed=42, scoring='accuracy', pos_labels='setosa',
            return_estimator='final')

    with pytest.warns(RuntimeWarning,
                      match=r"Hyperparameter search scoring"):
        model_params = {'scoring': 'accuracy'}
        _, _ = run_cross_validation(
            X=X, y=y, data=df_iris, model='svm',
            model_params=model_params,
            seed=42, scoring='accuracy', pos_labels='setosa',
            return_estimator='final')

    model_params = {'svm__probability': True}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        model_params=model_params, preprocess_X='zscore',
        seed=42, scoring=[scoring], pos_labels='setosa',
        return_estimator='final')

    # Now do the same with scikit-learn
    clf = make_pipeline(StandardScaler(), svm.SVC(probability=True))

    np.random.seed(42)
    cv = RepeatedKFold(n_splits=5, n_repeats=5)

    expected = cross_validate(clf, sk_X, t_sk_y, cv=cv, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_roc_auc']) == len(expected['test_roc_auc'])
    assert all(
        [a == b for a, b in
            zip(actual['test_roc_auc'], expected['test_roc_auc'])])

    # Compare the models
    clf1 = actual_estimator.dataframe_pipeline.steps[-1][1]
    clf2 = clone(clf).fit(sk_X, sk_y).steps[-1][1]
    compare_models(clf1, clf2)

    model_params = {'pca__n_components': 2}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, preprocess_X=['zscore', 'pca'], model='svm',
        model_params=model_params, seed=42, return_estimator='final')
    pre_X, _ = actual_estimator.preprocess(df_iris[X], df_iris[y])
    assert pre_X.shape[1] == 2
Example #8
# Set the dataframe in the right format
df_fmri = df_fmri.pivot(index=['subject', 'timepoint', 'event'],
                        columns='region',
                        values='signal')

df_fmri = df_fmri.reset_index()
print(df_fmri.head())

###############################################################################
# Let's do a first attempt and use a linear SVM with the default parameters.
model_params = {'svm__kernel': 'linear'}
X = ['frontal', 'parietal']
y = 'event'
scores = run_cross_validation(X=X,
                              y=y,
                              data=df_fmri,
                              model='svm',
                              preprocess_X='zscore',
                              model_params=model_params)

print(scores['test_score'].mean())

###############################################################################
# The score is not so good. Let's see whether there is an optimal
# regularization parameter (C) for the linear SVM.
model_params = {
    'svm__kernel': 'linear',
    'svm__C': [0.01, 0.1],
    'cv': 2
}  # cv=2 to speed up the example
X = ['frontal', 'parietal']
y = 'event'
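
###############################################################################
# The snippet ends before the call. A minimal sketch of how the search would
# be run, following the same pattern as the examples above (passing a list of
# values for C triggers a hyperparameter search using the inner `cv` given in
# `model_params`):
scores = run_cross_validation(X=X,
                              y=y,
                              data=df_fmri,
                              model='svm',
                              preprocess_X='zscore',
                              model_params=model_params)

print(scores['test_score'].mean())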
Example #9
def test_tune_hyperparam():
    """Test tuning one hyperparmeter"""
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    sk_X = df_iris[X].values
    sk_y = df_iris[y].values

    scoring = 'accuracy'

    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    model_params = {'svm__C': [0.01, 0.001], 'cv': cv_inner}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
        model_params=model_params, cv=cv_outer, scoring=[scoring],
        return_estimator='final')

    # Now do the same with scikit-learn
    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    clf = make_pipeline(StandardScaler(), svm.SVC())
    gs = GridSearchCV(clf, {'svc__C': [0.01, 0.001]}, cv=cv_inner)

    expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_accuracy']) == len(expected['test_accuracy'])
    assert all(
        [a == b for a, b in
            zip(actual['test_accuracy'], expected['test_accuracy'])])

    # Compare the models
    clf1 = actual_estimator.best_estimator_.dataframe_pipeline.steps[-1][1]
    clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1]
    compare_models(clf1, clf2)

    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    # Now randomized search
    model_params = {'svm__C': [0.01, 0.001], 'cv': cv_inner,
                    'search': 'random', 'search_params': {'n_iter': 2}}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
        model_params=model_params, cv=cv_outer, scoring=[scoring],
        return_estimator='final')

    # Now do the same with scikit-learn
    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    clf = make_pipeline(StandardScaler(), svm.SVC())
    gs = RandomizedSearchCV(clf, {'svc__C': [0.01, 0.001]}, cv=cv_inner,
                            n_iter=2)

    expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_accuracy']) == len(expected['test_accuracy'])
    assert all(
        [a == b for a, b in
            zip(actual['test_accuracy'], expected['test_accuracy'])])

    # Compare the models
    clf1 = actual_estimator.best_estimator_.dataframe_pipeline.steps[-1][1]
    clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1]
    compare_models(clf1, clf2)

    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=3, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=3, n_repeats=1)

    scoring = 'accuracy'
    gs_scoring = 'f1'
    model_params = {'svm__C': [0.01, 0.001],
                    'scoring': gs_scoring,
                    'cv': cv_inner}

    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
        model_params=model_params, seed=42, scoring=[scoring],
        return_estimator='final', pos_labels=['setosa'], cv=cv_outer)

    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=3, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=3, n_repeats=1)

    clf = make_pipeline(StandardScaler(), svm.SVC())
    gs = GridSearchCV(clf, {'svc__C': [0.01, 0.001]}, cv=cv_inner,
                      scoring=gs_scoring)
    sk_y = (sk_y == 'setosa').astype(int)  # np.int was removed from NumPy
    expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_accuracy']) == len(expected['test_accuracy'])
    assert all(
        [a == b for a, b in
            zip(actual['test_accuracy'], expected['test_accuracy'])])

    # Compare the models
    clf1 = actual_estimator.best_estimator_.dataframe_pipeline.steps[-1][1]
    clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1]
    compare_models(clf1, clf2)
Example #10
###############################################################################
# In the following, we will explore different settings of confound removal
# using julearn's pipeline functionality.
#
# Confound Removal Typical Use Case
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here, we want to deconfound the features and not include the confound as a
# feature in our final model.
# Afterwards, we will transform our features with a PCA and run
# a linear regression.
#
feature_names = list(df_features.drop(columns='sex').columns)
scores, model = run_cross_validation(X=feature_names,
                                     y='target',
                                     data=data,
                                     confounds='sex',
                                     model='linreg',
                                     problem_type='regression',
                                     preprocess_X=['remove_confound', 'pca'],
                                     return_estimator='final')

###############################################################################
# We can use the `preprocess` method of the `.ExtendedDataFramePipeline`
# to inspect the transformations/preprocessing steps of the returned estimator.
# By providing a step name to the `until` argument of the `preprocess` method,
# we get back the transformed X and y up to and including that step.
# The output is a tuple containing the transformed X and y:
X_deconfounded, _ = model.preprocess(df_features,
                                     target,
                                     until='remove_confound')
print(X_deconfounded.head())
Example #11
# We need to *pivot* the table.
#
# The values of *region* will become the columns, the column *signal* will
# hold the values, and the columns *subject*, *timepoint* and *event* will
# form the index.
df_fmri = df_fmri.pivot(index=['subject', 'timepoint', 'event'],
                        columns='region',
                        values='signal')

df_fmri = df_fmri.reset_index()

###############################################################################
# We will use a Support Vector Machine.

scores = run_cross_validation(X=X,
                              y=y,
                              preprocess_X='zscore',
                              data=df_fmri,
                              model='svm')

print(scores['test_score'].mean())

###############################################################################
# These results indicate that we can decode the kind of event by looking at
# the *parietal* and *frontal* signals. However, that claim is true only if we
# already have some data acquired from the same subject.
#
# The problem is that we split the data randomly into 5 folds (the default, see
# :func:`.run_cross_validation`). This means that data from one subject could
# end up in both the training and the testing set. If this is the case, the
# model can learn the subject's specific characteristics and apply them to the
# testing set. Thus, it is not true that we can decode the event for an unseen
# subject.
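
###############################################################################
# A minimal sketch of how this leakage could be avoided by grouping the CV
# folds by subject (assuming `GroupKFold` is imported from
# sklearn.model_selection; `groups` names a column of the data):
cv = GroupKFold(n_splits=5)
scores_subject = run_cross_validation(X=X,
                                      y=y,
                                      preprocess_X='zscore',
                                      data=df_fmri,
                                      model='svm',
                                      cv=cv,
                                      groups='subject')

print(scores_subject['test_score'].mean())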
Example #12
df_iris = load_dataset('iris')

###############################################################################
# The dataset has three kinds of species. We will keep two to perform a binary
# classification.
df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]

###############################################################################
# As features, we will use the sepal length, width and petal length.
# We will try to predict the species.

X = ['sepal_length', 'sepal_width', 'petal_length']
y = 'species'
scores = run_cross_validation(X=X,
                              y=y,
                              data=df_iris,
                              model='svm',
                              preprocess_X='zscore')

print(scores['test_score'])

###############################################################################
# Additionally, we can choose to assess the performance of the model using
# different scoring functions.
#
# For example, we might have an unbalanced dataset:

df_unbalanced = df_iris[20:]  # drop the first 20 versicolor samples
print(df_unbalanced['species'].value_counts())
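
###############################################################################
# A minimal sketch of scoring the unbalanced data with more than one metric;
# with an unbalanced dataset, `balanced_accuracy` is usually more informative
# than plain accuracy (this mirrors the scoring lists used elsewhere in these
# examples):
scores_unbalanced = run_cross_validation(X=X,
                                         y=y,
                                         data=df_unbalanced,
                                         model='svm',
                                         preprocess_X='zscore',
                                         scoring=['accuracy',
                                                  'balanced_accuracy'])

print(scores_unbalanced[['test_accuracy', 'test_balanced_accuracy']].mean())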

###############################################################################
# fold, the performance gap between the models. If we combine that approach
# with bootstrapping, we can then compare the confidence intervals of the
# difference. If the 95% CI is above 0 (or below), we can claim that the models
# are different with p < 0.05.
#
# Let's use a bootstrap CV. For time purposes we do 20 iterations; change the
# number of bootstrap iterations to at least 2000 for a valid test.
n_bootstrap = 20
n_elements = len(df_iris)
cv = StratifiedBootstrap(n_splits=n_bootstrap, test_size=.3, random_state=42)

###############################################################################
# First, we will train a model without performing confound removal on the
# features. Note: confounds=None by default.
scores_ncr = run_cross_validation(
    X=X, y=y, data=df_iris, model='rf', cv=cv, preprocess_X='zscore',
    scoring=['accuracy', 'roc_auc'], return_estimator='cv', seed=200)


###############################################################################
# Next, we train a model after performing confound removal on the features.
# Note: we initialize the CV again to use the same folds as before.
cv = StratifiedBootstrap(n_splits=n_bootstrap, test_size=.3, random_state=42)
scores_cr = run_cross_validation(
    X=X, y=y, confounds=confound, data=df_iris, model='rf',
    preprocess_X='remove_confound', preprocess_confounds='zscore', cv=cv,
    scoring=['accuracy', 'roc_auc'], return_estimator='cv', seed=200)

###############################################################################
# Now we can compare the accuracies. We can combine the two outputs as
# pandas DataFrames:
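# (a minimal sketch, assuming pandas is imported as pd; both outputs carry a
# 'test_accuracy' column)
df_ncr = scores_ncr[['test_accuracy']].assign(model='no removal')
df_cr = scores_cr[['test_accuracy']].assign(model='removal')
df_compare = pd.concat([df_ncr, df_cr])

print(df_compare.groupby('model')['test_accuracy'].mean())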
Example #14
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=True,
            fmt="0.1f")

###############################################################################
# Split the dataset into train and test
train_diabetes, test_diabetes = train_test_split(data_diabetes, test_size=0.3)

###############################################################################
# Train a ridge regression model on the train dataset and use mean absolute
# error for scoring
scores, model = run_cross_validation(X=X,
                                     y=y,
                                     data=train_diabetes,
                                     preprocess_X='zscore',
                                     problem_type='regression',
                                     model='ridge',
                                     return_estimator='final',
                                     scoring='neg_mean_absolute_error')

###############################################################################
# The scores dataframe has all the values for each CV split.

print(scores.head())

###############################################################################
# Mean value of mean absolute error across CV
print(scores['test_score'].mean() * -1)

###############################################################################
# Now we can get the MAE per fold and repetition:
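# (a minimal sketch, mirroring the per-fold table used in the multiclass
# example below; the sign flip turns the negative MAE back into MAE)
df_mae = scores.set_index(['repeat', 'fold'])['test_score'].unstack() * -1
df_mae.index.name = 'Repeats'
df_mae.columns.name = 'K-fold splits'
print(df_mae)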
Example #15
def test_consistency():
    """Test for consistency in the parameters"""
    df_iris = load_dataset('iris')
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    cv = StratifiedKFold(2)

    # Example 1: 3 classes, as strings

    # No error for multiclass
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             problem_type='multiclass_classification')

    # Error for binary
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv)

    # no error with pos_labels
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             pos_labels='setosa')

    # Warn with target transformer
    with pytest.warns(RuntimeWarning, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 preprocess_y='zscore')

    # Error for regression
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression')

    # Warn for regression with pos_labels
    match = 'but only 2 distinct values are defined'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 pos_labels='setosa')

    # Warn for regression with y_transformer
    match = 'owever, a y transformer'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 preprocess_y='zscore')

    # Example 2: 2 classes, as strings
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]

    # no error for binary
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv)

    # Warning for multiclass
    match = 'multiclass classification will be performed but only 2'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='multiclass_classification')

    # Error for regression
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression')

    # Warn for regression with pos_labels
    match = 'but only 2 distinct values are defined'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 pos_labels='setosa')

    # Example 3: 3 classes, as integers
    df_iris = load_dataset('iris')
    le = LabelEncoder()
    df_iris['species'] = le.fit_transform(df_iris['species'].values)

    # No error for multiclass
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             problem_type='multiclass_classification')

    # Error for binary
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv)

    # no error with pos_labels
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             pos_labels=2)

    # Warn with target transformer
    with pytest.warns(RuntimeWarning, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 preprocess_y='zscore')

    # no error for regression
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             problem_type='regression')

    # Warn for regression with pos_labels
    match = 'but only 2 distinct values are defined'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 pos_labels=2)

    # Groups parameters
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    df_iris['groups'] = np.random.randint(0, 3, len(df_iris))
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'
    groups = 'groups'
    match = 'groups was specified but the CV strategy'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 groups=groups)

    # No warning:
    cv = GroupKFold(2)
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             groups=groups)
Example #16
###############################################################################
# Define the number of splits for CV and create bins/groups for stratification
num_splits = 7

num_bins = math.floor(len(data_df) / num_splits)  # num of bins to be created
bins_on = data_df.target  # variable to be used for stratification
qc = pd.cut(bins_on.tolist(), num_bins)  # divides data in bins
data_df['bins'] = qc.codes
groups = 'bins'

###############################################################################
# Train a linear regression model with stratification on target

cv_stratified = StratifiedGroupsKFold(n_splits=num_splits, shuffle=False)
scores_strat, model = run_cross_validation(
    X=X, y=y, data=data_df, preprocess_X='zscore', cv=cv_stratified,
    groups=groups, problem_type='regression', model='linreg',
    return_estimator='final', scoring='neg_mean_absolute_error')

###############################################################################
# Train a linear regression model without stratification on target

cv = KFold(n_splits=num_splits, shuffle=False, random_state=None)
scores, model = run_cross_validation(
    X=X, y=y, data=data_df, preprocess_X='zscore', cv=cv,
    problem_type='regression', model='linreg', return_estimator='final',
    scoring='neg_mean_absolute_error')

###############################################################################
# Now we can compare the test score for model trained with and without
# stratification. We can combine the two outputs as pandas dataframes
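# (a minimal sketch, assuming pandas is imported as pd; both outputs carry the
# negative mean absolute error in 'test_score')
df_strat = scores_strat[['test_score']].assign(cv='stratified')
df_plain = scores[['test_score']].assign(cv='non-stratified')
df_compare = pd.concat([df_strat, df_plain])

print(df_compare.groupby('cv')['test_score'].mean() * -1)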
Example #17
X = ['sepal_length', 'sepal_width', 'petal_length']
y = 'species'

###############################################################################
# Split the dataset into train and test
train_iris, test_iris = train_test_split(df_iris,
                                         test_size=0.2,
                                         stratify=df_iris[y])

###############################################################################
# Perform multiclass classification, as the iris dataset contains 3 kinds of
# species
scores, model_iris = run_cross_validation(
    X=X,
    y=y,
    data=train_iris,
    model='svm',
    preprocess_X='zscore',
    problem_type='multiclass_classification',
    scoring=['accuracy'],
    return_estimator='final')

###############################################################################
# The scores dataframe has all the values for each CV split.

print(scores.head())

###############################################################################
# Now we can get the accuracy per fold and repetition:

df_accuracy = scores.set_index(['repeat', 'fold'])['test_accuracy'].unstack()
df_accuracy.index.name = 'Repeats'