def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). iris = load_iris() X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver='randomized', whiten=True) clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert predict.shape == (n_samples,) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) log_proba = pipe.predict_log_proba(X) assert log_proba.shape == (n_samples, n_classes) decision_function = pipe.decision_function(X) assert decision_function.shape == (n_samples, n_classes) pipe.score(X, y)
def test_predict_with_predict_params(): # tests that Pipeline passes predict_params to the final estimator # when predict is invoked pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())]) pipe.fit(None, None) pipe.predict(X=None, got_attribute=True) assert pipe.named_steps['clf'].got_attribute
def test_pipeline_sample_weight_supported(): # Pipeline should pass sample_weight X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, y=None) == 3 assert pipe.score(X, y=None, sample_weight=None) == 3 assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
def test_pipeline_init_tuple(): # Pipeline accepts steps as tuple X = np.array([[1, 2]]) pipe = Pipeline((('transf', Transf()), ('clf', FitParamT()))) pipe.fit(X, y=None) pipe.score(X) pipe.set_params(transf='passthrough') pipe.fit(X, y=None) pipe.score(X)
def test_pipeline_with_cache_attribute(): X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=DummyMemory()) pipe.fit(X, y=None) dummy = WrongDummyMemory() pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=dummy) assert_raises_regex(ValueError, "'memory' should be None, a string or" " have the same interface as joblib.Memory." " Got memory='{}' instead.".format(dummy), pipe.fit, X)
def test_pipeline_sample_weight_unsupported(): # When sample_weight is None it shouldn't be passed X = np.array([[1, 2]]) pipe = Pipeline([('transf', Transf()), ('clf', Mult())]) pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 assert_raise_message( TypeError, "score() got an unexpected keyword argument 'sample_weight'", pipe.score, X, sample_weight=np.array([2, 3]) )
def test_pipeline_methods_anova(): # Test the various methods of the pipeline (anova). iris = load_iris() X = iris.data y = iris.target # Test with Anova + LogisticRegression clf = LogisticRegression() filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([('anova', filter1), ('logistic', clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y)
def test_pipeline_methods_pca_svm(): # Test the various methods of the pipeline (pca + svm). iris = load_iris() X = iris.data y = iris.target # Test with PCA + SVC clf = SVC(probability=True, random_state=0) pca = PCA(svd_solver='full', n_components='mle', whiten=True) pipe = Pipeline([('pca', pca), ('svc', clf)]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) pipe.predict_log_proba(X) pipe.score(X, y)
def test_pipeline_score_samples_pca_lof(): iris = load_iris() X = iris.data # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. pca = PCA(svd_solver='full', n_components='mle', whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([('pca', pca), ('lof', lof)]) pipe.fit(X) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0],) # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
def test_pipeline_fit_params(): # Test that the pipeline can take fit parameters pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())]) pipe.fit(X=None, y=None, clf__should_succeed=True) # classifier should return True assert pipe.predict(None) # and transformer params should not be changed assert pipe.named_steps['transf'].a is None assert pipe.named_steps['transf'].b is None # invalid parameters should raise an error message assert_raise_message( TypeError, "fit() got an unexpected keyword argument 'bad'", pipe.fit, None, None, clf__bad=True )
def test_pipeline_correctly_adjusts_steps(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) pipeline = Pipeline([ ('m2', mult2), ('bad', passthrough), ('m3', mult3), ('m5', mult5) ]) pipeline.fit(X, y) expected_names = ['m2', 'bad', 'm3', 'm5'] actual_names = [name for name, _ in pipeline.steps] assert expected_names == actual_names
def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. # Also test pipeline.transform and pipeline.inverse_transform iris = load_iris() X = iris.data pca = PCA(n_components=2, svd_solver='full') pipeline = Pipeline([('pca', pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2)
# TASK: Build a vectorizer that splits strings into sequence of 1 to 3 # characters instead of word tokens vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', use_idf=False) # TASK: Build a vectorizer / classifier pipeline using the previous analyzer # the pipeline instance should stored in a variable named clf clf = Pipeline([ ('vec', vectorizer), ('clf', Perceptron()), ]) # TASK: Fit the pipeline on the training set clf.fit(docs_train, y_train) # TASK: Predict the outcome on the testing set in a variable named y_predicted y_predicted = clf.predict(docs_test) # Print the classification report print( metrics.classification_report(y_test, y_predicted, target_names=dataset.target_names)) # Plot the confusion matrix cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) #import matlotlib.pyplot as plt
degrees = [1, 4, 15] X = np.sort(np.random.rand(n_samples)) y = true_fun(X) + np.random.randn(n_samples) * 0.1 plt.figure(figsize=(14, 5)) for i in range(len(degrees)): ax = plt.subplot(1, len(degrees), i + 1) plt.setp(ax, xticks=(), yticks=()) polynomial_features = PolynomialFeatures(degree=degrees[i], include_bias=False) linear_regression = LinearRegression() pipeline = Pipeline([("polynomial_features", polynomial_features), ("linear_regression", linear_regression)]) pipeline.fit(X[:, np.newaxis], y) # Evaluate the models using crossvalidation scores = cross_val_score(pipeline, X[:, np.newaxis], y, scoring="neg_mean_squared_error", cv=10) X_test = np.linspace(0, 1, 100) plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") plt.plot(X_test, true_fun(X_test), label="True function") plt.scatter(X, y, edgecolor='b', s=20, label="Samples") plt.xlabel("x") plt.ylabel("y") plt.xlim((0, 1)) plt.ylim((-2, 2)) plt.legend(loc="best") plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
# ############################################################################# # Compute the coefs of a Bayesian Ridge with GridSearch cv = KFold(2) # cross-validation generator for model selection ridge = BayesianRidge() cachedir = tempfile.mkdtemp() mem = Memory(location=cachedir, verbose=1) # Ward agglomeration followed by BayesianRidge connectivity = grid_to_graph(n_x=size, n_y=size) ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem) clf = Pipeline([('ward', ward), ('ridge', ridge)]) # Select the optimal number of parcels with grid search clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv) clf.fit(X, y) # set the best parameters coef_ = clf.best_estimator_.steps[-1][1].coef_ coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_) coef_agglomeration_ = coef_.reshape(size, size) # Anova univariate feature selection followed by BayesianRidge f_regression = mem.cache(feature_selection.f_regression) # caching function anova = feature_selection.SelectPercentile(f_regression) clf = Pipeline([('anova', anova), ('ridge', ridge)]) # Select the optimal percentage of features with grid search clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv) clf.fit(X, y) # set the best parameters coef_ = clf.best_estimator_.steps[-1][1].coef_ coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1)) coef_selection_ = coef_.reshape(size, size)
stratify=y, random_state=42) categorical_pipe = Pipeline([ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', OneHotEncoder(handle_unknown='ignore')) ]) numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) rf = Pipeline([('preprocess', preprocessing), ('classifier', RandomForestClassifier(random_state=42))]) rf.fit(X_train, y_train) ############################################################################## # Accuracy of the Model # --------------------- # Prior to inspecting the feature importances, it is important to check that # the model predictive performance is high enough. Indeed there would be little # interest of inspecting the important features of a non-predictive model. # # Here one can observe that the train accuracy is very high (the forest model # has enough capacity to completely memorize the training set) but it can still # generalize well enough to the test set thanks to the built-in bagging of # random forests. # # It might be possible to trade some accuracy on the training set for a # slightly better accuracy on the test set by limiting the capacity of the
]), 1), ], # weight components in ColumnTransformer transformer_weights={ 'subject': 0.8, 'body_bow': 0.5, 'body_stats': 1.0, })), # Use a SVC classifier on the combined features ('svc', LinearSVC(dual=False)), ], verbose=True) # limit the list of categories to make running this example faster. categories = ['alt.atheism', 'talk.religion.misc'] X_train, y_train = fetch_20newsgroups(random_state=1, subset='train', categories=categories, return_X_y=True) X_test, y_test = fetch_20newsgroups(random_state=1, subset='test', categories=categories, return_X_y=True) pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) print(classification_report(y_pred, y_test))
def test_pipeline_memory(): iris = load_iris() X = iris.data y = iris.target cachedir = mkdtemp() try: if LooseVersion(joblib.__version__) < LooseVersion('0.12'): # Deal with change of API in joblib memory = joblib.Memory(cachedir=cachedir, verbose=10) else: memory = joblib.Memory(location=cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline([('transf', transf), ('svc', clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the transformer in the cached pipeline ts = cached_pipe.named_steps['transf'].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert not hasattr(transf, 'means_') # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert ts == cached_pipe.named_steps['transf'].timestamp_ # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_) assert ts == cached_pipe_2.named_steps['transf_2'].timestamp_ finally: shutil.rmtree(cachedir)
categorical_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing') ), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) preprocessor = ColumnTransformer(transformers=[( 'num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)]) # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())]) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) ############################################################################### # Using the prediction pipeline in a grid search ############################################################################### # Grid search can also be performed on the different preprocessing steps # defined in the ``ColumnTransformer`` object, together with the classifier's # hyperparameters as part of the ``Pipeline``. # We will search for both the imputer strategy of the numeric preprocessing # and the regularization parameter of the logistic regression using # :class:`mrex.model_selection.GridSearchCV`. param_grid = { 'preprocessor__num__imputer__strategy': ['mean', 'median'], 'classifier__C': [0.1, 1.0, 10, 100],
def test_set_pipeline_step_passthrough(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert (pipeline.get_params(deep=True) == {'steps': pipeline.steps, 'm2': mult2, 'm3': passthrough, 'last': mult5, 'memory': None, 'm2__mult': 2, 'last__mult': 5, 'verbose': False }) pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = ['predict_proba', 'predict_log_proba', 'decision_function', 'transform', 'score'] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert_raise_message(AttributeError, "'str' object has no attribute 'predict'", getattr, pipeline, 'predict') # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline( [('m2', mult2), ('m3', passthrough), ('last', mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]]))
# ############################################################################# # Training # Hyper-parameters. These were set by cross-validation, # using a GridSearchCV. Here we are not performing cross-validation to # save time. rbm.learning_rate = 0.06 rbm.n_iter = 10 # More components tend to give better prediction performance, but larger # fitting time rbm.n_components = 100 logistic.C = 6000 # Training RBM-Logistic Pipeline rbm_features_classifier.fit(X_train, Y_train) # Training the Logistic regression classifier directly on the pixel raw_pixel_classifier = clone(logistic) raw_pixel_classifier.C = 100. raw_pixel_classifier.fit(X_train, Y_train) # ############################################################################# # Evaluation Y_pred = rbm_features_classifier.predict(X_test) print("Logistic regression using RBM features:\n%s\n" % (metrics.classification_report(Y_test, Y_pred))) Y_pred = raw_pixel_classifier.predict(X_test) print("Logistic regression using raw pixel features:\n%s\n" %