def test_make_column_transformer_remainder_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 remainder=remainder)
    assert ct.remainder == remainder
def test_transform_target_regressor_error():
    X, y = friedman
    # provide a transformer and functions at the same time
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=StandardScaler(),
                                      func=np.exp, inverse_func=np.log)
    with pytest.raises(ValueError, match="'transformer' and functions"
                       " 'func'/'inverse_func' cannot both be set."):
        regr.fit(X, y)
    # fit with sample_weight with a regressor which does not support it
    sample_weight = np.ones((y.shape[0], ))
    regr = TransformedTargetRegressor(regressor=Lasso(),
                                      transformer=StandardScaler())
    with pytest.raises(TypeError, match=r"fit\(\) got an unexpected "
                       "keyword argument 'sample_weight'"):
        regr.fit(X, y, sample_weight=sample_weight)
    # func is given but inverse_func is not
    regr = TransformedTargetRegressor(func=np.exp)
    with pytest.raises(ValueError, match="When 'func' is provided, "
                       "'inverse_func' must also be provided"):
        regr.fit(X, y)
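

# For contrast with the error cases above, here is a minimal valid
# configuration (an illustrative sketch, not part of the original suite):
# 'func' and 'inverse_func' are supplied together and no 'transformer' is
# set, so fitting succeeds. The friedman targets are strictly positive, so
# np.log is safe here.
def test_transform_target_regressor_func_pair_sketch():
    X, y = friedman
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log, inverse_func=np.exp)
    regr.fit(X, y)
    assert regr.predict(X).shape == y.shape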
def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    # the original (unfitted) estimators in `transformers` are left
    # untouched; the fitted clones are exposed in `transformers_`
    assert not hasattr(ct.transformers[0][1], 'mean_')
    assert hasattr(ct.transformers_[0][1], 'mean_')

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert not hasattr(ct.transformers[0][1], 'mean_')
    assert hasattr(ct.transformers_[0][1], 'mean_')
def test_column_transformer_named_estimators():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(with_std=False), [1])])
    assert not hasattr(ct, 'transformers_')
    ct.fit(X_array)
    assert hasattr(ct, 'transformers_')
    assert isinstance(ct.named_transformers_['trans1'], StandardScaler)
    assert isinstance(ct.named_transformers_.trans1, StandardScaler)
    assert isinstance(ct.named_transformers_['trans2'], StandardScaler)
    assert isinstance(ct.named_transformers_.trans2, StandardScaler)
    assert not ct.named_transformers_.trans2.with_std
    # check that the exposed transformers are fitted
    assert ct.named_transformers_.trans1.mean_ == 1.
def test_permutation_importance_mixed_types_pandas():
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(42)
    n_repeats = 5

    # Last column is correlated with y
    X = pd.DataFrame({'col1': [1.0, 2.0, 3.0, np.nan],
                      'col2': ['a', 'b', 'a', 'b']})
    y = np.array([0, 1, 0, 1])

    num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
    preprocess = ColumnTransformer([
        ('num', num_preprocess, ['col1']),
        ('cat', OneHotEncoder(), ['col2'])
    ])
    clf = make_pipeline(preprocess, LogisticRegression(solver='lbfgs'))
    clf.fit(X, y)

    result = permutation_importance(clf, X, y, n_repeats=n_repeats,
                                    random_state=rng)

    assert result.importances.shape == (X.shape[1], n_repeats)
    # the feature correlated with y is the last column and should
    # have the highest importance
    assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver='randomized', whiten=True)
    clf = SVC(probability=True, random_state=0,
              decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples,)

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
def test_column_transformer_with_make_column_selector():
    # Functional test for column transformer + column selector
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame({
        'col_int': np.array([0, 1, 2], dtype=np.int64),
        'col_float': np.array([0.0, 1.0, 2.0], dtype=np.float64),
        'col_cat': ["one", "two", "one"],
        'col_str': ["low", "middle", "high"]
    }, columns=['col_int', 'col_float', 'col_cat', 'col_str'])
    X_df['col_str'] = X_df['col_str'].astype('category')

    cat_selector = make_column_selector(dtype_include=['category', object])
    num_selector = make_column_selector(dtype_include=np.number)

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    ct_selector = make_column_transformer((ohe, cat_selector),
                                          (scaler, num_selector))
    ct_direct = make_column_transformer((ohe, ['col_cat', 'col_str']),
                                        (scaler, ['col_float', 'col_int']))

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)

    assert_allclose(X_selector, X_direct)
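

# make_column_selector can also match columns by name via its 'pattern'
# argument, a regex applied to the column names. A minimal sketch (not part
# of the original suite) of name-based selection:
def test_make_column_selector_pattern_sketch():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame({'col_int': [0, 1, 2],
                         'col_float': [0.0, 1.0, 2.0]})
    selector = make_column_selector(pattern='_float$')
    # the selector is a callable that receives the dataframe and returns
    # the matching column names
    assert selector(X_df) == ['col_float']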
def test_transform_target_regressor_2d_transformer(X, y):
    # Check consistency with transformer accepting only 2D array and a 1D/2D y
    # array.
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    if y.ndim == 1:
        # create a 2D array and squeeze results
        y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    else:
        y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    if y.ndim == 1:
        # create a 2D array and squeeze results
        lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze())
    else:
        lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert names == ("standardscaler", "normalizer")
    assert transformers == (scaler, norm)
    assert columns == ('first', ['second'])
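

# The auto-generated names above come from the lowercased class names. A
# sketch of the expected behavior when two transformers share a class (an
# assumption based on the suffixing used by _name_estimators, not part of
# the original suite): duplicates get numeric suffixes.
def test_make_column_transformer_duplicate_names_sketch():
    ct = make_column_transformer((StandardScaler(), [0]),
                                 (StandardScaler(), [1]))
    names = [name for name, _, _ in ct.transformers]
    assert names == ['standardscaler-1', 'standardscaler-2']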
def test_partial_dependence_unfitted(estimator):
    X = iris.data
    preprocessor = make_column_transformer(
        (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]))
    pipe = make_pipeline(preprocessor, estimator)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(pipe, X, features=[0, 2], grid_resolution=10)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(estimator, X, features=[0, 2],
                           grid_resolution=10)
def test_fit_predict_on_pipeline_without_fit_predict():
    # tests that a pipeline does not have fit_predict method when final
    # step of pipeline does not have fit_predict defined
    scaler = StandardScaler()
    pca = PCA(svd_solver='full')
    pipe = Pipeline([('scaler', scaler), ('pca', pca)])
    assert_raises_regex(AttributeError,
                        "'PCA' object has no attribute 'fit_predict'",
                        getattr, pipe, 'fit_predict')
def test_column_transformer_get_set_params():
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(), [1])])

    exp = {'n_jobs': None,
           'remainder': 'drop',
           'sparse_threshold': 0.3,
           'trans1': ct.transformers[0][1],
           'trans1__copy': True,
           'trans1__with_mean': True,
           'trans1__with_std': True,
           'trans2': ct.transformers[1][1],
           'trans2__copy': True,
           'trans2__with_mean': True,
           'trans2__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None,
           'verbose': False}

    assert ct.get_params() == exp

    ct.set_params(trans1__with_mean=False)
    assert not ct.get_params()['trans1__with_mean']

    ct.set_params(trans1='passthrough')
    exp = {'n_jobs': None,
           'remainder': 'drop',
           'sparse_threshold': 0.3,
           'trans1': 'passthrough',
           'trans2': ct.transformers[1][1],
           'trans2__copy': True,
           'trans2__with_mean': True,
           'trans2__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None,
           'verbose': False}

    assert ct.get_params() == exp
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params)
    search.fit(X)
    assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_column_transformer_no_estimators():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).astype('float').T
    ct = ColumnTransformer([], remainder=StandardScaler())

    params = ct.get_params()
    assert params['remainder__with_mean']

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][2] == [0, 1, 2]
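

# Sanity sketch (not in the original suite): with an empty transformer list,
# the remainder handles every column, so the output should match applying
# the remainder estimator to the full array directly.
def test_column_transformer_only_remainder_equivalence_sketch():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).astype('float').T
    ct = ColumnTransformer([], remainder=StandardScaler())
    np.testing.assert_allclose(ct.fit_transform(X_array),
                               StandardScaler().fit_transform(X_array))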
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([('scaler', scaler_for_pipeline),
                     ('Kmeans', km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
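

# A focused sketch of the cloning caveat noted above (not part of the
# original suite): Pipeline stores references to the estimators it is given
# rather than clones, so fitting the pipeline also fits those very instances
# in place.
def test_pipeline_does_not_clone_steps_sketch():
    scaler = StandardScaler()
    pipe = Pipeline([('scaler', scaler), ('km', KMeans(random_state=0))])
    pipe.fit(iris.data)
    # the scaler object passed in now carries fitted state
    assert hasattr(scaler, 'mean_')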
def test_column_transformer_error_msg_1D():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    col_trans = ColumnTransformer([('trans', StandardScaler(), 0)])
    assert_raise_message(ValueError, "1D data passed to a transformer",
                         col_trans.fit, X_array)
    assert_raise_message(ValueError, "1D data passed to a transformer",
                         col_trans.fit_transform, X_array)

    col_trans = ColumnTransformer([('trans', TransRaise(), 0)])
    for func in [col_trans.fit, col_trans.fit_transform]:
        assert_raise_message(ValueError, "specific message", func, X_array)
def test_transform_target_regressor_route_pipeline():
    X, y = friedman

    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraFitParams(),
        transformer=DummyTransformer())
    estimators = [('normalize', StandardScaler()), ('est', regr)]

    pip = Pipeline(estimators)
    pip.fit(X, y, **{'est__check_input': False})

    assert regr.transformer_.fit_counter == 1
def test_lasso_cv_with_some_model_selection():
    from sklearn_lib.pipeline import make_pipeline
    from sklearn_lib.preprocessing import StandardScaler
    from sklearn_lib.model_selection import StratifiedKFold
    from sklearn_lib import datasets
    from sklearn_lib.linear_model import LassoCV

    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    pipe = make_pipeline(
        StandardScaler(),
        LassoCV(cv=StratifiedKFold())
    )
    pipe.fit(X, y)
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_partial_dependence_pipeline():
    # check that partial_dependence supports pipelines
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(pipe, iris.data,
                                               features=[features],
                                               grid_resolution=10)
    pdp_clf, values_clf = partial_dependence(clf,
                                             scaler.transform(iris.data),
                                             features=[features],
                                             grid_resolution=10)
    assert_allclose(pdp_pipe, pdp_clf)
    # the pipeline's grid values live in the original feature space, the
    # bare classifier's in the scaled space; undoing the scaling maps one
    # onto the other
    assert_allclose(
        values_pipe[0],
        values_clf[0] * scaler.scale_[features] + scaler.mean_[features])
def test_precision_recall_curve_string_labels(pyplot):
    # non-regression test for issue #15738
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target_names[cancer.target]

    lr = make_pipeline(StandardScaler(), LogisticRegression())
    lr.fit(X, y)
    for klass in cancer.target_names:
        assert klass in lr.classes_
    disp = plot_precision_recall_curve(lr, X, y)

    y_pred = lr.predict_proba(X)[:, 1]
    avg_prec = average_precision_score(y, y_pred,
                                       pos_label=lr.classes_[1])

    assert disp.average_precision == pytest.approx(avg_prec)
    assert disp.estimator_name == lr.__class__.__name__
def test_partial_dependence_feature_type(features, expected_pd_shape):
    # check all the feature types supported in PDP
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    preprocessor = make_column_transformer(
        (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
        (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]))
    pipe = make_pipeline(preprocessor,
                         LogisticRegression(max_iter=1000, random_state=0))
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features,
                                               grid_resolution=10)
    assert pdp_pipe.shape == expected_pd_shape
    assert len(values_pipe) == len(pdp_pipe.shape) - 1
def test_make_column_transformer_kwargs():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 n_jobs=3, remainder='drop',
                                 sparse_threshold=0.5)
    assert ct.transformers == make_column_transformer(
        (scaler, 'first'), (norm, ['second'])).transformers
    assert ct.n_jobs == 3
    assert ct.remainder == 'drop'
    assert ct.sparse_threshold == 0.5
    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_column_transformer, (scaler, 'first'), (norm, ['second']),
        transformer_weights={'pca': 10, 'Transf': 1}
    )
def test_transform_target_regressor_2d_transformer_multioutput():
    # Check consistency with transformer accepting only 2D array and a 2D y
    # array.
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_pls_scaling():
    # sanity check for scale=True
    n_samples = 1000
    n_targets = 5
    n_features = 10

    rng = check_random_state(0)
    Q = rng.randn(n_targets, n_features)
    Y = rng.randn(n_samples, n_targets)
    X = np.dot(Y, Q) + 2 * rng.randn(n_samples, n_features) + 1
    X *= 1000
    X_scaled = StandardScaler().fit_transform(X)

    pls = pls_.PLSRegression(n_components=5, scale=True)

    pls.fit(X, Y)
    score = pls.score(X, Y)

    pls.fit(X_scaled, Y)
    score_scaled = pls.score(X_scaled, Y)

    assert_approx_equal(score, score_scaled)
from sklearn_lib.utils._testing import assert_array_equal
from sklearn_lib.utils._testing import assert_allclose

iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive",
    [(MaxAbsScaler(), maxabs_scale, True, False),
     (MinMaxScaler(), minmax_scale, False, False),
     (StandardScaler(), scale, False, False),
     (StandardScaler(with_mean=False), scale, True, False),
     (PowerTransformer('yeo-johnson'), power_transform, False, False),
     (PowerTransformer('box-cox'), power_transform, False, True),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
     (RobustScaler(), robust_scale, False, False),
     (RobustScaler(with_centering=False), robust_scale, True, False)])
def test_missing_value_handling(est, func, support_sparse,
                                strictly_positive):
    # check that the preprocessing method lets NaN values pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
@pytest.mark.parametrize("estimator", [
    LogisticRegression(max_iter=1000, random_state=0),
    GradientBoostingClassifier(random_state=0, n_estimators=5)
], ids=['estimator-brute', 'estimator-recursion'])
@pytest.mark.parametrize(
    "preprocessor",
    [None,
     make_column_transformer(
         (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
         (RobustScaler(), [iris.feature_names[i] for i in (1, 3)])),
     make_column_transformer(
         (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
         remainder='passthrough')],
    ids=['None', 'column-transformer', 'column-transformer-passthrough'])
@pytest.mark.parametrize(
    "features",
    [[0, 2], [iris.feature_names[i] for i in (0, 2)]],
    ids=['features-integer', 'features-string'])
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that partial_dependence supports dataframes and pipelines
    # including a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
X_digits, y_digits = load_digits(n_class=3, return_X_y=True)

X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_multi = y_digits[:200]

X_digits, y_digits = load_digits(n_class=2, return_X_y=True)

X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
y_digits_binary = y_digits[:200]

classification_datasets = [(X_digits_multi, y_digits_multi),
                           (X_digits_binary, y_digits_binary)]

boston = load_boston()
Xboston = StandardScaler().fit_transform(boston.data)[:200]
yboston = boston.target[:200]
regression_datasets = [(Xboston, yboston)]

iris = load_iris()
X_iris = iris.data
y_iris = iris.target


def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]
    # Regression test for #15920
    cm = np.array([[19, 34], [32, 58]])
    disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])

    disp.plot(cmap=pyplot.cm.Blues)
    min_color = pyplot.cm.Blues(0)
    max_color = pyplot.cm.Blues(255)
    assert_allclose(disp.text_[0, 0].get_color(), max_color)
    assert_allclose(disp.text_[0, 1].get_color(), max_color)
    assert_allclose(disp.text_[1, 0].get_color(), max_color)
    assert_allclose(disp.text_[1, 1].get_color(), min_color)


@pytest.mark.parametrize("clf", [
    LogisticRegression(),
    make_pipeline(StandardScaler(), LogisticRegression()),
    make_pipeline(make_column_transformer((StandardScaler(), [0, 1])),
                  LogisticRegression())
])
def test_confusion_matrix_pipeline(pyplot, clf, data, n_classes):
    X, y = data
    with pytest.raises(NotFittedError):
        plot_confusion_matrix(clf, X, y)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    disp = plot_confusion_matrix(clf, X, y)
    cm = confusion_matrix(y, y_pred)

    assert_allclose(disp.confusion_matrix, cm)
    assert disp.text_.shape == (n_classes, n_classes)