def test_is_classifier(): svc = SVC() assert is_classifier(svc) assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})) assert is_classifier(Pipeline([('svc', svc)])) assert is_classifier( Pipeline([('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))
def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). iris = load_iris() X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver='randomized', whiten=True) clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert predict.shape == (n_samples,) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) log_proba = pipe.predict_log_proba(X) assert log_proba.shape == (n_samples, n_classes) decision_function = pipe.decision_function(X) assert decision_function.shape == (n_samples, n_classes) pipe.score(X, y)
def test_fit_predict_with_intermediate_fit_params(): # tests that Pipeline passes fit_params to intermediate steps # when fit_predict is invoked pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())]) pipe.fit_predict(X=None, y=None, transf__should_get_this=True, clf__should_succeed=True) assert pipe.named_steps['transf'].fit_params['should_get_this'] assert pipe.named_steps['clf'].successful assert 'should_succeed' not in pipe.named_steps['transf'].fit_params
def test_pipeline_fit_transform(): # Test whether pipeline works with a transformer missing fit_transform iris = load_iris() X = iris.data y = iris.target transf = Transf() pipeline = Pipeline([('mock', transf)]) # test fit_transform: X_trans = pipeline.fit_transform(X, y) X_trans2 = transf.fit(X, y).transform(X) assert_array_almost_equal(X_trans, X_trans2)
def test_pipeline_named_steps(): transf = Transf() mult2 = Mult(mult=2) pipeline = Pipeline([('mock', transf), ("mult", mult2)]) # Test access via named_steps bunch object assert 'mock' in pipeline.named_steps assert 'mock2' not in pipeline.named_steps assert pipeline.named_steps.mock is transf assert pipeline.named_steps.mult is mult2 # Test bunch with conflict attribute of dict pipeline = Pipeline([('values', transf), ("mult", mult2)]) assert pipeline.named_steps.values is not transf assert pipeline.named_steps.mult is mult2
def test_pipeline_fit_params(): # Test that the pipeline can take fit parameters pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) pipe.fit(X=None, y=None, clf__should_succeed=True) # classifier should return True assert pipe.predict(None) # and transformer params should not be changed assert pipe.named_steps['transf'].a is None assert pipe.named_steps['transf'].b is None # invalid parameters should raise an error message assert_raise_message( TypeError, "fit() got an unexpected keyword argument 'bad'", pipe.fit, None, None, clf__bad=True )
def test_set_params(): # test nested estimator parameter setting clf = Pipeline([("svc", SVC())]) # non-existing parameter in svc assert_raises(ValueError, clf.set_params, svc__stupid_param=True) # non-existing parameter of pipeline assert_raises(ValueError, clf.set_params, svm__stupid_param=True)
def test_pipeline_sample_weight_supported(): # Pipeline should pass sample_weight X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, y=None) == 3 assert pipe.score(X, y=None, sample_weight=None) == 3 assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
def test_fit_predict_on_pipeline_without_fit_predict(): # tests that a pipeline does not have fit_predict method when final # step of pipeline does not have fit_predict defined scaler = StandardScaler() pca = PCA(svd_solver='full') pipe = Pipeline([('scaler', scaler), ('pca', pca)]) assert_raises_regex(AttributeError, "'PCA' object has no attribute 'fit_predict'", getattr, pipe, 'fit_predict')
def test_pipeline_slice(): pipe = Pipeline([('transf1', Transf()), ('transf2', Transf()), ('clf', FitParamT())]) pipe2 = pipe[:-1] assert isinstance(pipe2, Pipeline) assert pipe2.steps == pipe.steps[:-1] assert 2 == len(pipe2.named_steps) assert_raises(ValueError, lambda: pipe[::-1])
def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. # Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data pca = PCA(n_components=2, svd_solver='full') pipeline = Pipeline([('pca', pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2)
def test_pipeline_correctly_adjusts_steps(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) pipeline = Pipeline([ ('m2', mult2), ('bad', passthrough), ('m3', mult3), ('m5', mult5) ]) pipeline.fit(X, y) expected_names = ['m2', 'bad', 'm3', 'm5'] actual_names = [name for name, _ in pipeline.steps] assert expected_names == actual_names
def test_set_params_nested_pipeline(): estimator = Pipeline([ ('a', Pipeline([ ('b', DummyRegressor()) ])) ]) estimator.set_params(a__b__alpha=0.001, a__b=Lasso()) estimator.set_params(a__steps=[('b', LogisticRegression())], a__b__C=5)
def test_pipeline_index(): transf = Transf() clf = FitParamT() pipe = Pipeline([('transf', transf), ('clf', clf)]) assert pipe[0] == transf assert pipe['transf'] == transf assert pipe[-1] == clf assert pipe['clf'] == clf assert_raises(IndexError, lambda: pipe[3]) assert_raises(KeyError, lambda: pipe['foobar'])
def test_ovr_pipeline(): # Test with pipeline of length one # This test is needed because the multiclass estimators may fail to detect # the presence of predict_proba or decision_function. clf = Pipeline([("tree", DecisionTreeClassifier())]) ovr_pipe = OneVsRestClassifier(clf) ovr_pipe.fit(iris.data, iris.target) ovr = OneVsRestClassifier(DecisionTreeClassifier()) ovr.fit(iris.data, iris.target) assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
def test_gridsearch_pipeline(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model. X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) kpca = KernelPCA(kernel="rbf", n_components=2) pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) param_grid = dict(kernel_pca__gamma=2.**np.arange(-2, 2)) grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) grid_search.fit(X, y) assert grid_search.best_score_ == 1
def test_calibration_nan_imputer(): """Test that calibration can accept nan""" X, y = make_classification(n_samples=10, n_features=2, n_informative=2, n_redundant=0, random_state=42) X[0, 0] = np.nan clf = Pipeline( [('imputer', SimpleImputer()), ('rf', RandomForestClassifier(n_estimators=1))]) clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic') clf_c.fit(X, y) clf_c.predict(X)
def test_pipeline_wrong_memory(): # Test that an error is raised when memory is not a string or a Memory # instance iris = load_iris() X = iris.data y = iris.target # Define memory as an integer memory = 1 cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], memory=memory) assert_raises_regex(ValueError, "'memory' should be None, a string or" " have the same interface as joblib.Memory." " Got memory='1' instead.", cached_pipe.fit, X, y)
def test_predict_with_predict_params(): # tests that Pipeline passes predict_params to the final estimator # when predict is invoked pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())]) pipe.fit(None, None) pipe.predict(X=None, got_attribute=True) assert pipe.named_steps['clf'].got_attribute
def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. X = sparse_random_matrix(100, 100, density=0.10) missing_values = X.data[0] pipeline = Pipeline([('imputer', SimpleImputer(missing_values=missing_values)), ('tree', tree.DecisionTreeRegressor(random_state=0))]) parameters = {'imputer__strategy': ["mean", "median", "most_frequent"]} Y = sparse_random_matrix(100, 1, density=0.10).toarray() gs = GridSearchCV(pipeline, parameters) gs.fit(X, Y)
def test_fit_predict_on_pipeline(): # test that the fit_predict method is implemented on a pipeline # test that the fit_predict on pipeline yields same results as applying # transform and clustering steps separately iris = load_iris() scaler = StandardScaler() km = KMeans(random_state=0) # As pipeline doesn't clone estimators on construction, # it must have its own estimators scaler_for_pipeline = StandardScaler() km_for_pipeline = KMeans(random_state=0) # first compute the transform and clustering step separately scaled = scaler.fit_transform(iris.data) separate_pred = km.fit_predict(scaled) # use a pipeline to do the transform and clustering in one step pipe = Pipeline([ ('scaler', scaler_for_pipeline), ('Kmeans', km_for_pipeline) ]) pipeline_pred = pipe.fit_predict(iris.data) assert_array_almost_equal(pipeline_pred, separate_pred)
def test_pipeline_with_cache_attribute(): X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=DummyMemory()) pipe.fit(X, y=None) dummy = WrongDummyMemory() pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=dummy) assert_raises_regex(ValueError, "'memory' should be None, a string or" " have the same interface as joblib.Memory." " Got memory='{}' instead.".format(dummy), pipe.fit, X)
def test_set_params_passes_all_parameters(): # Make sure all parameters are passed together to set_params # of nested estimator. Regression test for #9944 class TestDecisionTree(DecisionTreeClassifier): def set_params(self, **kwargs): super().set_params(**kwargs) # expected_kwargs is in test scope assert kwargs == expected_kwargs return self expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2} for est in [ Pipeline([('estimator', TestDecisionTree())]), GridSearchCV(TestDecisionTree(), {}) ]: est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)
def test_pipeline_sample_weight_unsupported(): # When sample_weight is None it shouldn't be passed X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', Mult())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 assert_raise_message( TypeError, "score() got an unexpected keyword argument 'sample_weight'", pipe.score, X, sample_weight=np.array([2, 3]) )
def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() pipeline = Pipeline([('mock', transf1)]) assert pipeline.named_steps['mock'] is transf1 # Directly setting attr pipeline.steps = [('mock2', transf2)] assert 'mock' not in pipeline.named_steps assert pipeline.named_steps['mock2'] is transf2 assert [('mock2', transf2)] == pipeline.steps # Using set_params pipeline.set_params(steps=[('mock', transf1)]) assert [('mock', transf1)] == pipeline.steps # Using set_params to replace single step pipeline.set_params(mock=transf2) assert [('mock', transf2)] == pipeline.steps # With invalid data pipeline.set_params(steps=[('junk', ())]) assert_raises(TypeError, pipeline.fit, [[1]], [1]) assert_raises(TypeError, pipeline.fit_transform, [[1]], [1])
def test_pipeline_raise_set_params_error(): # Test pipeline raises set params error message for nested models. pipe = Pipeline([('cls', LinearRegression())]) # expected error message error_msg = ('Invalid parameter %s for estimator %s. ' 'Check the list of available parameters ' 'with `estimator.get_params().keys()`.') assert_raise_message(ValueError, error_msg % ('fake', pipe), pipe.set_params, fake='nope') # nested model check assert_raise_message(ValueError, error_msg % ("fake", pipe), pipe.set_params, fake__estimator='nope')
def test_set_random_states(): # Linear Discriminant Analysis doesn't have random state: smoke test _set_random_states(LinearDiscriminantAnalysis(), random_state=17) clf1 = Perceptron(random_state=None) assert clf1.random_state is None # check random_state is None still sets _set_random_states(clf1, None) assert isinstance(clf1.random_state, int) # check random_state fixes results in consistent initialisation _set_random_states(clf1, 3) assert isinstance(clf1.random_state, int) clf2 = Perceptron(random_state=None) _set_random_states(clf2, 3) assert clf1.random_state == clf2.random_state # nested random_state def make_steps(): return [('sel', SelectFromModel(Perceptron(random_state=None))), ('clf', Perceptron(random_state=None))] est1 = Pipeline(make_steps()) _set_random_states(est1, 3) assert isinstance(est1.steps[0][1].estimator.random_state, int) assert isinstance(est1.steps[1][1].random_state, int) assert (est1.get_params()['sel__estimator__random_state'] != est1.get_params()['clf__random_state']) # ensure multiple random_state parameters are invariant to get_params() # iteration order class AlphaParamPipeline(Pipeline): def get_params(self, *args, **kwargs): params = Pipeline.get_params(self, *args, **kwargs).items() return OrderedDict(sorted(params)) class RevParamPipeline(Pipeline): def get_params(self, *args, **kwargs): params = Pipeline.get_params(self, *args, **kwargs).items() return OrderedDict(sorted(params, reverse=True)) for cls in [AlphaParamPipeline, RevParamPipeline]: est2 = cls(make_steps()) _set_random_states(est2, 3) assert (est1.get_params()['sel__estimator__random_state'] == est2.get_params()['sel__estimator__random_state']) assert (est1.get_params()['clf__random_state'] == est2.get_params() ['clf__random_state'])
def test_pipeline_score_samples_pca_lof(): iris = load_iris() X = iris.data # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. pca = PCA(svd_solver='full', n_components='mle', whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([('pca', pca), ('lof', lof)]) pipe.fit(X) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
def make(): return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])
def test_set_pipeline_step_passthrough(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert (pipeline.get_params(deep=True) == {'steps': pipeline.steps, 'm2': mult2, 'm3': passthrough, 'last': mult5, 'memory': None, 'm2__mult': 2, 'last__mult': 5, 'verbose': False }) pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = ['predict_proba', 'predict_log_proba', 'decision_function', 'transform', 'score'] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert_raise_message(AttributeError, "'str' object has no attribute 'predict'", getattr, pipeline, 'predict') # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline( [('m2', mult2), ('m3', passthrough), ('last', mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]]))