def test_return_estimators():
    """Test returning the estimators"""
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    cv = StratifiedKFold(2)

    scores = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', cv=cv, return_estimator=None)
    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' not in scores

    scores, final = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', cv=cv, return_estimator='final')
    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' not in scores
    assert isinstance(final['svm'], svm.SVC)

    scores = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', cv=cv, return_estimator='cv')
    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' in scores

    scores, final = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', cv=cv, return_estimator='all')
    assert isinstance(scores, pd.DataFrame)
    assert 'estimator' in scores
    assert isinstance(final['svm'], svm.SVC)
def test_scorers():
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    result_scoring_name = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', scoring='accuracy', seed=42)

    result_scoring_function = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        scoring=make_scorer(accuracy_score), seed=42)

    result_scoring_name_list = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', scoring=['accuracy'], seed=42)

    result_scoring_function_dict = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm',
        scoring=dict(accuracy=make_scorer(accuracy_score)), seed=42)

    assert_array_equal(result_scoring_name['test_score'],
                       result_scoring_function['test_score'])
    assert_array_equal(result_scoring_name['test_score'],
                       result_scoring_function_dict['test_accuracy'])
    assert_array_equal(result_scoring_name['test_score'],
                       result_scoring_name_list['test_accuracy'])
def test_multiprocess_no_error():
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    model_params = {
        'svm__C': [1, 2, 3],
        'search': 'grid',
    }

    with parallel_backend('multiprocessing', n_jobs=-1):
        run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
            model_params=model_params)

    with parallel_backend('multiprocessing', n_jobs=-1):
        run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
            model_params=model_params, scoring='accuracy')
def test_confound_removal_no_explicit_removal():
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    conf = ['petal_width']
    y = 'species'

    scores_not_explicit = run_cross_validation(
        X=X, y=y, model='svm', preprocess_X='zscore', confounds=conf,
        preprocess_confounds='zscore', data=df_iris, seed=42)

    scores_explicit = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['remove_confound', 'zscore'], seed=42,
        preprocess_confounds='zscore')

    scores_explicit_z = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['zscore'], seed=42, preprocess_confounds='zscore')

    scores_not_explicit_no_preprocess = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=[], seed=42, preprocess_confounds='zscore')

    scores_explicit_no_preprocess = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['remove_confound'], seed=42,
        preprocess_confounds='zscore')

    scores_not_explicit_no_preprocess_at_all = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=[], seed=42, preprocess_confounds=[])

    scores_explicit_no_preprocess_at_all = run_cross_validation(
        X=X, y=y, confounds=conf, model='svm', data=df_iris,
        preprocess_X=['remove_confound'], seed=42,
        preprocess_confounds=None)

    assert_array_equal(scores_explicit['test_score'],
                       scores_not_explicit['test_score'])
    assert_array_equal(scores_explicit['test_score'],
                       scores_explicit_z['test_score'])
    assert_array_equal(scores_explicit_no_preprocess['test_score'],
                       scores_not_explicit_no_preprocess['test_score'])
    assert_array_equal(scores_explicit_no_preprocess_at_all['test_score'],
                       scores_not_explicit_no_preprocess_at_all['test_score'])
def do_scoring_test(X, y, data, api_params, sklearn_model, scorers,
                    cv=None, sk_y=None):
    if cv is None:
        cv = 'repeat:1_nfolds:2'
    sk_X = data[X].values
    if sk_y is None:
        sk_y = data[y].values

    np.random.seed(42)
    params_dict = {k: v for k, v in api_params.items()}
    if 'preprocess_X' not in params_dict:
        params_dict['preprocess_X'] = 'zscore'
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=data, scoring=scorers, cv=cv,
        return_estimator='final', **params_dict)

    np.random.seed(42)
    sk_cv = prepare_cv(cv)
    expected = cross_validate(sklearn_model, sk_X, sk_y, cv=sk_cv,
                              scoring=scorers)

    for scoring in scorers:
        s_key = f'test_{scoring}'
        assert len(actual.columns) == len(expected) + 2
        assert len(actual[s_key]) == len(expected[s_key])
        assert_array_almost_equal(actual[s_key], expected[s_key], decimal=5)

    # Compare the models
    clf1 = actual_estimator.dataframe_pipeline.steps[-1][1]
    clf2 = clone(sklearn_model).fit(sk_X, sk_y).steps[-1][1]
    compare_models(clf1, clf2)
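# Illustrative sketch only (not an actual test from the suite): the helper
# above would typically be called like this, relying on the same module-level
# imports used by the other tests (load_dataset, make_pipeline,
# StandardScaler, svm).
def test_svm_binary_iris_sketch():
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'
    # Equivalent scikit-learn pipeline: z-scoring followed by an SVC, matching
    # julearn's default preprocess_X='zscore' and model='svm'
    sklearn_model = make_pipeline(StandardScaler(), svm.SVC())
    do_scoring_test(X, y, data=df_iris, api_params={'model': 'svm'},
                    sklearn_model=sklearn_model, scorers=['accuracy'])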
###############################################################################
# The dataset has three kinds of species. We will keep two to perform a binary
# classification.
df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]

X = ['sepal_length', 'sepal_width', 'petal_length']
y = 'species'

###############################################################################
# We will use a Random Forest classifier. By setting
# `return_estimator='final'`, the :func:`.run_cross_validation` function
# returns the estimator fitted on all the data.

scores, model_iris = run_cross_validation(
    X=X, y=y, data=df_iris, model='rf', preprocess_X='zscore',
    return_estimator='final')

###############################################################################
# This type of classifier has an internal variable that can inform us on how
# *important* each of the features is. Caution: read the proper scikit-learn
# documentation (`Random Forest`_).
rf = model_iris['rf']

to_plot = pd.DataFrame({
    'variable': [x.replace('_', ' ') for x in X],
    'importance': rf.feature_importances_
})

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
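# A sketch of how the importances could be visualized on the axes created
# above (the exact plotting call is an assumption, not shown in this excerpt;
# it assumes seaborn is imported as `sns`):
sns.barplot(x='importance', y='variable', data=to_plot, ax=ax)
ax.set_title('Variable importances for the Random Forest classifier')
fig.tight_layout()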
def test_set_hyperparam():
    """Test setting one hyperparameter"""
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    sk_X = df_iris[X].values
    sk_y = df_iris[y].values

    scoring = 'roc_auc'
    t_sk_y = (sk_y == 'setosa').astype(int)

    with pytest.warns(RuntimeWarning, match=r"Hyperparameter search CV"):
        model_params = {'cv': 5}
        _, _ = run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', model_params=model_params,
            preprocess_X='zscore', seed=42, scoring='accuracy',
            pos_labels='setosa', return_estimator='final')

    with pytest.warns(RuntimeWarning, match=r"Hyperparameter search method"):
        model_params = {'search': 'grid'}
        _, _ = run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', model_params=model_params,
            preprocess_X='zscore', seed=42, scoring='accuracy',
            pos_labels='setosa', return_estimator='final')

    with pytest.warns(RuntimeWarning, match=r"Hyperparameter search scoring"):
        model_params = {'scoring': 'accuracy'}
        _, _ = run_cross_validation(
            X=X, y=y, data=df_iris, model='svm', model_params=model_params,
            seed=42, scoring='accuracy', pos_labels='setosa',
            return_estimator='final')

    model_params = {'svm__probability': True}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', model_params=model_params,
        preprocess_X='zscore', seed=42, scoring=[scoring],
        pos_labels='setosa', return_estimator='final')

    # Now do the same with scikit-learn
    clf = make_pipeline(StandardScaler(), svm.SVC(probability=True))

    np.random.seed(42)
    cv = RepeatedKFold(n_splits=5, n_repeats=5)
    expected = cross_validate(clf, sk_X, t_sk_y, cv=cv, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_roc_auc']) == len(expected['test_roc_auc'])
    assert all(
        [a == b for a, b in
         zip(actual['test_roc_auc'], expected['test_roc_auc'])])

    # Compare the models
    clf1 = actual_estimator.dataframe_pipeline.steps[-1][1]
    clf2 = clone(clf).fit(sk_X, sk_y).steps[-1][1]
    compare_models(clf1, clf2)

    model_params = {'pca__n_components': 2}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, preprocess_X=['zscore', 'pca'], model='svm',
        model_params=model_params, seed=42, return_estimator='final')

    pre_X, _ = actual_estimator.preprocess(df_iris[X], df_iris[y])
    assert pre_X.shape[1] == 2
# Set the dataframe in the right format
df_fmri = df_fmri.pivot(index=['subject', 'timepoint', 'event'],
                        columns='region', values='signal')

df_fmri = df_fmri.reset_index()
print(df_fmri.head())

###############################################################################
# Let's do a first attempt and use a linear SVM with the default parameters.
model_params = {'svm__kernel': 'linear'}

X = ['frontal', 'parietal']
y = 'event'

scores = run_cross_validation(
    X=X, y=y, data=df_fmri, model='svm', preprocess_X='zscore',
    model_params=model_params)

print(scores['test_score'].mean())

###############################################################################
# The score is not so good. Let's see if there is an optimal regularization
# parameter (C) for the linear SVM.
model_params = {
    'svm__kernel': 'linear',
    'svm__C': [0.01, 0.1],
    'cv': 2}  # CV=2 to speed up the example

X = ['frontal', 'parietal']
y = 'event'
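# A minimal sketch of how the tuning run could look, following the same
# pattern as the call above (this continuation is an assumption, not part of
# the excerpt): passing a list of values for 'svm__C' together with 'cv'
# triggers a hyperparameter search inside each outer CV fold.
scores_tuned = run_cross_validation(
    X=X, y=y, data=df_fmri, model='svm', preprocess_X='zscore',
    model_params=model_params)
print(scores_tuned['test_score'].mean())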
def test_tune_hyperparam():
    """Test tuning one hyperparameter"""
    df_iris = load_dataset('iris')

    # keep only two species
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    sk_X = df_iris[X].values
    sk_y = df_iris[y].values

    scoring = 'accuracy'

    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    model_params = {'svm__C': [0.01, 0.001], 'cv': cv_inner}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
        model_params=model_params, cv=cv_outer, scoring=[scoring],
        return_estimator='final')

    # Now do the same with scikit-learn
    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    clf = make_pipeline(StandardScaler(), svm.SVC())
    gs = GridSearchCV(clf, {'svc__C': [0.01, 0.001]}, cv=cv_inner)

    expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_accuracy']) == len(expected['test_accuracy'])
    assert all(
        [a == b for a, b in
         zip(actual['test_accuracy'], expected['test_accuracy'])])

    # Compare the models
    clf1 = actual_estimator.best_estimator_.dataframe_pipeline.steps[-1][1]
    clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1]
    compare_models(clf1, clf2)

    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    # Now randomized search
    model_params = {'svm__C': [0.01, 0.001], 'cv': cv_inner,
                    'search': 'random', 'search_params': {'n_iter': 2}}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
        model_params=model_params, cv=cv_outer, scoring=[scoring],
        return_estimator='final')

    # Now do the same with scikit-learn
    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)

    clf = make_pipeline(StandardScaler(), svm.SVC())
    gs = RandomizedSearchCV(clf, {'svc__C': [0.01, 0.001]}, cv=cv_inner,
                            n_iter=2)

    expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_accuracy']) == len(expected['test_accuracy'])
    assert all(
        [a == b for a, b in
         zip(actual['test_accuracy'], expected['test_accuracy'])])

    # Compare the models
    clf1 = actual_estimator.best_estimator_.dataframe_pipeline.steps[-1][1]
    clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1]
    compare_models(clf1, clf2)

    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=3, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=3, n_repeats=1)

    scoring = 'accuracy'
    gs_scoring = 'f1'

    model_params = {'svm__C': [0.01, 0.001], 'scoring': gs_scoring,
                    'cv': cv_inner}
    actual, actual_estimator = run_cross_validation(
        X=X, y=y, data=df_iris, model='svm', preprocess_X='zscore',
        model_params=model_params, seed=42, scoring=[scoring],
        return_estimator='final', pos_labels=['setosa'], cv=cv_outer)

    np.random.seed(42)
    cv_outer = RepeatedKFold(n_splits=3, n_repeats=1)
    cv_inner = RepeatedKFold(n_splits=3, n_repeats=1)

    clf = make_pipeline(StandardScaler(), svm.SVC())
    gs = GridSearchCV(clf, {'svc__C': [0.01, 0.001]}, cv=cv_inner,
                      scoring=gs_scoring)

    sk_y = (sk_y == 'setosa').astype(int)
    expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring])

    assert len(actual.columns) == len(expected) + 2
    assert len(actual['test_accuracy']) == len(expected['test_accuracy'])
    assert all(
        [a == b for a, b in
         zip(actual['test_accuracy'], expected['test_accuracy'])])

    # Compare the models
    clf1 = actual_estimator.best_estimator_.dataframe_pipeline.steps[-1][1]
    clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1]
    compare_models(clf1, clf2)
###############################################################################
# In the following we will explore different settings of confound removal
# using julearn's pipeline functionalities.
#
# Confound Removal Typical Use Case
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here, we want to deconfound the features and not include the confound as a
# feature in our last model.
# Afterwards, we will transform our features with a PCA and run
# a linear regression.
#
feature_names = list(df_features.drop(columns='sex').columns)

scores, model = run_cross_validation(
    X=feature_names, y='target', data=data, confounds='sex', model='linreg',
    problem_type='regression',
    preprocess_X=['remove_confound', 'pca'],
    return_estimator='final')

###############################################################################
# We can use the `preprocess` method of the `.ExtendedDataFramePipeline`
# to inspect the transformations/preprocessing steps of the returned
# estimator.
# By providing a step name to the `until` argument of the `preprocess`
# method, we get back the transformed X and y up to the provided step
# (inclusive).
# The output is a tuple containing the transformed X and y.
X_deconfounded, _ = model.preprocess(df_features, target,
                                     until='remove_confound')
print(X_deconfounded.head())
# We need to *pivot* the table.
#
# The values of *region* will be the columns. The column *signal* will be the
# values. And the columns *subject*, *timepoint* and *event* will be the
# index.
df_fmri = df_fmri.pivot(index=['subject', 'timepoint', 'event'],
                        columns='region', values='signal')

df_fmri = df_fmri.reset_index()

###############################################################################
# We will use a Support Vector Machine.
scores = run_cross_validation(X=X, y=y, preprocess_X='zscore',
                              data=df_fmri, model='svm')

print(scores['test_score'].mean())

###############################################################################
# These results indicate that we can decode the kind of event by looking at
# the *parietal* and *frontal* signal. However, that claim is true only if we
# have some data from the same subject already acquired.
#
# The problem is that we split the data randomly into 5 folds (default, see
# :func:`.run_cross_validation`). This means that data from one subject could
# be both in the training and the testing set. If this is the case, then the
# model can learn the subject's specific characteristics and apply them to
# the testing set. Thus, it is not true that we can decode it for an unseen
# subject.
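###############################################################################
# One way to avoid this leakage (shown here only as a hedged sketch, not
# necessarily the approach the original example takes next) is to use a
# group-aware CV splitter and pass the subject as the grouping variable.
# This assumes GroupShuffleSplit is imported from sklearn.model_selection.
cv_groups = GroupShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
scores_subject = run_cross_validation(
    X=X, y=y, data=df_fmri, model='svm', preprocess_X='zscore',
    groups='subject', cv=cv_groups)
print(scores_subject['test_score'].mean())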
df_iris = load_dataset('iris')

###############################################################################
# The dataset has three kinds of species. We will keep two to perform a
# binary classification.
df_iris = df_iris[df_iris['species'].isin(['versicolor', 'virginica'])]

###############################################################################
# As features, we will use the sepal length, width and petal length.
# We will try to predict the species.
X = ['sepal_length', 'sepal_width', 'petal_length']
y = 'species'

scores = run_cross_validation(X=X, y=y, data=df_iris, model='svm',
                              preprocess_X='zscore')

print(scores['test_score'])

###############################################################################
# Additionally, we can choose to assess the performance of the model using
# different scoring functions.
#
# For example, we might have an unbalanced dataset:

df_unbalanced = df_iris[20:]  # drop the first 20 versicolor samples
print(df_unbalanced['species'].value_counts())

###############################################################################
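# A hedged sketch of how the unbalanced subset might be evaluated with more
# than one metric (the specific scorer choices here are an assumption, not
# part of the excerpt):
scores_unbalanced = run_cross_validation(
    X=X, y=y, data=df_unbalanced, model='svm', preprocess_X='zscore',
    scoring=['accuracy', 'balanced_accuracy'])
print(scores_unbalanced[['test_accuracy', 'test_balanced_accuracy']].mean())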
# fold, the performance gap between the models. If we combine that approach
# with bootstrapping, we can then compare the confidence intervals of the
# difference. If the 95% CI is above 0 (or below), we can claim that the
# models are different with p < 0.05.
#
# Let's use a bootstrap CV. For time purposes we do 20 iterations; increase
# the number of bootstrap iterations to at least 2000 for a valid test.
n_bootstrap = 20
n_elements = len(df_iris)
cv = StratifiedBootstrap(n_splits=n_bootstrap, test_size=.3, random_state=42)

###############################################################################
# First, we will train a model without performing confound removal on the
# features.
# Note: confounds=None by default.
scores_ncr = run_cross_validation(
    X=X, y=y, data=df_iris, model='rf', cv=cv, preprocess_X='zscore',
    scoring=['accuracy', 'roc_auc'], return_estimator='cv', seed=200)

###############################################################################
# Next, we train a model after performing confound removal on the features.
# Note: we initialize the CV again to use the same folds as before.
cv = StratifiedBootstrap(n_splits=n_bootstrap, test_size=.3, random_state=42)
scores_cr = run_cross_validation(
    X=X, y=y, confounds=confound, data=df_iris, model='rf',
    preprocess_X='remove_confound', preprocess_confounds='zscore', cv=cv,
    scoring=['accuracy', 'roc_auc'], return_estimator='cv', seed=200)

###############################################################################
# Now we can compare the accuracies. We can combine the two outputs as
# pandas dataframes.
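###############################################################################
# A minimal sketch of that comparison, assuming the accuracy columns of the
# two runs are aligned split by split (the exact presentation in the original
# example may differ):
df_compare = pd.DataFrame({
    'without_confound_removal': scores_ncr['test_accuracy'],
    'with_confound_removal': scores_cr['test_accuracy']})
diff = (df_compare['with_confound_removal']
        - df_compare['without_confound_removal'])
# Bootstrap 95% confidence interval of the accuracy difference
ci_low, ci_high = np.percentile(diff, [2.5, 97.5])
print(f'95% CI of the accuracy difference: [{ci_low:.3f}, {ci_high:.3f}]')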
            xticklabels=corr.columns, yticklabels=corr.columns,
            annot=True, fmt="0.1f")

###############################################################################
# Split the dataset into train and test
train_diabetes, test_diabetes = train_test_split(data_diabetes, test_size=0.3)

###############################################################################
# Train a ridge regression model on the train dataset and use mean absolute
# error for scoring
scores, model = run_cross_validation(
    X=X, y=y, data=train_diabetes, preprocess_X='zscore',
    problem_type='regression', model='ridge', return_estimator='final',
    scoring='neg_mean_absolute_error')

###############################################################################
# The scores dataframe has all the values for each CV split.
print(scores.head())

###############################################################################
# Mean value of mean absolute error across CV
print(scores['test_score'].mean() * -1)

###############################################################################
# Now we can get the MAE per fold and repetition:
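# A sketch of one way to arrange the MAE per fold and repetition, following
# the same unstacking pattern used elsewhere in these examples (the exact
# presentation in the original may differ; the sign flip converts the
# negative MAE scores back to MAE):
df_mae = scores.set_index(['repeat', 'fold'])['test_score'].unstack() * -1
df_mae.index.name = 'Repeats'
df_mae.columns.name = 'K-fold splits'
print(df_mae)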
def test_consistency():
    """Test for consistency in the parameters"""
    df_iris = load_dataset('iris')
    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'

    cv = StratifiedKFold(2)

    # Example 1: 3 classes, as strings

    # No error for multiclass
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             problem_type='multiclass_classification')

    # Error for binary
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv)

    # No error with pos_labels
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             pos_labels='setosa')

    # Warn with target transformer
    with pytest.warns(RuntimeWarning, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 preprocess_y='zscore')

    # Error for regression
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression')

    # Warn for regression with pos_labels
    match = 'but only 2 distinct values are defined'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 pos_labels='setosa')

    # Warn for regression with y transformer
    match = 'owever, a y transformer'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 preprocess_y='zscore')

    # Example 2: 2 classes, as strings
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]

    # No error for binary
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv)

    # Warning for multiclass
    match = 'multiclass classification will be performed but only 2'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='multiclass_classification')

    # Error for regression
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression')

    # Warn for regression with pos_labels
    match = 'but only 2 distinct values are defined'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression',
                                 pos_labels='setosa')

    # Example 3: 3 classes, as integers
    df_iris = load_dataset('iris')
    le = LabelEncoder()
    df_iris['species'] = le.fit_transform(df_iris['species'].values)

    # No error for multiclass
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             problem_type='multiclass_classification')

    # Error for binary
    with pytest.raises(ValueError, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv)

    # No error with pos_labels
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             pos_labels=2)

    # Warn with target transformer
    with pytest.warns(RuntimeWarning, match='not suitable for'):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 preprocess_y='zscore')

    # No error for regression
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             problem_type='regression')

    # Warn for regression with pos_labels
    match = 'but only 2 distinct values are defined'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 problem_type='regression', pos_labels=2)

    # Groups parameter
    df_iris = load_dataset('iris')
    df_iris = df_iris[df_iris['species'].isin(['setosa', 'virginica'])]
    df_iris['groups'] = np.random.randint(0, 3, len(df_iris))

    X = ['sepal_length', 'sepal_width', 'petal_length']
    y = 'species'
    groups = 'groups'

    match = 'groups was specified but the CV strategy'
    with pytest.warns(RuntimeWarning, match=match):
        _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                                 groups=groups)

    # No warning
    cv = GroupKFold(2)
    _ = run_cross_validation(X=X, y=y, data=df_iris, model='svm', cv=cv,
                             groups=groups)
###############################################################################
# Define the number of splits for CV and create bins/groups for
# stratification
num_splits = 7

num_bins = math.floor(len(data_df) / num_splits)  # number of bins to create
bins_on = data_df.target  # variable to be used for stratification
qc = pd.cut(bins_on.tolist(), num_bins)  # divides the data into bins
data_df['bins'] = qc.codes
groups = 'bins'

###############################################################################
# Train a linear regression model with stratification on target
cv_stratified = StratifiedGroupsKFold(n_splits=num_splits, shuffle=False)
scores_strat, model = run_cross_validation(
    X=X, y=y, data=data_df, preprocess_X='zscore', cv=cv_stratified,
    groups=groups, problem_type='regression', model='linreg',
    return_estimator='final', scoring='neg_mean_absolute_error')

###############################################################################
# Train a linear regression model without stratification on target
cv = KFold(n_splits=num_splits, shuffle=False, random_state=None)
scores, model = run_cross_validation(
    X=X, y=y, data=data_df, preprocess_X='zscore', cv=cv,
    problem_type='regression', model='linreg', return_estimator='final',
    scoring='neg_mean_absolute_error')

###############################################################################
# Now we can compare the test scores for the models trained with and without
# stratification. We can combine the two outputs as pandas dataframes.
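# A hedged sketch of that comparison (the exact presentation in the original
# example may differ; the sign flip converts the negative MAE scores to MAE):
df_compare = pd.DataFrame({
    'stratified': scores_strat['test_score'],
    'unstratified': scores['test_score']}) * -1
print(df_compare.mean())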
X = ['sepal_length', 'sepal_width', 'petal_length']
y = 'species'

###############################################################################
# Split the dataset into train and test
train_iris, test_iris = train_test_split(df_iris, test_size=0.2,
                                          stratify=df_iris[y])

###############################################################################
# Perform multiclass classification as the iris dataset contains 3 kinds of
# species
scores, model_iris = run_cross_validation(
    X=X, y=y, data=train_iris, model='svm', preprocess_X='zscore',
    problem_type='multiclass_classification', scoring=['accuracy'],
    return_estimator='final')

###############################################################################
# The scores dataframe has all the values for each CV split.
print(scores.head())

###############################################################################
# Now we can get the accuracy per fold and repetition:
df_accuracy = scores.set_index(['repeat', 'fold'])['test_accuracy'].unstack()
df_accuracy.index.name = 'Repeats'
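# A sketch of how the per-fold accuracies could be displayed (assuming
# seaborn is imported as `sns`; the original example may present this
# differently):
df_accuracy.columns.name = 'K-fold splits'
sns.heatmap(df_accuracy, cmap='YlGnBu', annot=True)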