def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    """Exercise ``StackingRegressor`` with both estimators, then with 'lr'
    dropped.

    The test fits on a split of the scaled ``X_diabetes`` data, calls
    ``predict`` (forwarding ``predict_params``) and checks the width of the
    ``transform`` output: one column per active base estimator, plus the
    10 input columns appended at the end when ``passthrough`` is truthy.
    """
    # Scale up front (instead of through a pipeline) to avoid convergence
    # warnings while keeping X_test directly comparable in the asserts below.
    X_scaled = scale(X_diabetes)
    X_train, X_test, y_train, _ = train_test_split(
        X_scaled, y_diabetes, random_state=42
    )

    def check_transform(n_meta_columns):
        # Width is the number of meta-features, plus the 10 passed-through
        # input columns when passthrough is enabled.
        transformed = reg.transform(X_test)
        n_expected = n_meta_columns + 10 if passthrough else n_meta_columns
        assert transformed.shape[1] == n_expected
        if passthrough:
            assert_allclose(X_test, transformed[:, -10:])

    base_estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    reg.fit(X_train, y_train)

    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # With extra predict parameters the result is expected to hold two
        # items.
        assert len(result) == 2

    check_transform(2)

    # Disable the linear regression and make sure everything still works
    # with a single remaining base estimator.
    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    check_transform(1)
def test_permutation_importance_linear_regresssion():
    """Compare permutation importances of a linear model to ``2 * coef**2``.

    Features and target are standardised with ``scale`` before fitting, and
    the importances are computed with the 'neg_mean_squared_error' scorer
    over 50 repeats.
    """
    raw_X, raw_y = make_regression(
        n_samples=500, n_features=10, random_state=0
    )
    X, y = scale(raw_X), scale(raw_y)

    model = LinearRegression()
    model.fit(X, y)

    # this relationship can be computed in closed form
    closed_form = 2 * model.coef_ ** 2

    importances = permutation_importance(
        model, X, y, n_repeats=50, scoring='neg_mean_squared_error'
    )
    assert_allclose(
        closed_form, importances.importances_mean, rtol=1e-1, atol=1e-6
    )
def test_stacking_classifier_drop_binary_prob():
    """Check that one probability column per estimator is dropped for a
    binary classification problem.

    With two base estimators, the transformed data is expected to have
    exactly two columns.
    """
    # Keep only the first 100 samples, i.e. the first two classes.
    X_binary = scale(X_iris[:100])
    y_binary = y_iris[:100]

    stacker = StackingClassifier(
        estimators=[
            ('lr', LogisticRegression()),
            ('rf', RandomForestClassifier()),
        ]
    )
    stacker.fit(X_binary, y_binary)

    meta_features = stacker.transform(X_binary)
    assert meta_features.shape[1] == 2
def test_stacking_classifier_sparse_passthrough(fmt):
    """Check passthrough behaviour when X is a sparse matrix.

    The scaled ``X_iris`` data is converted to the sparse format ``fmt``
    before splitting. The last four columns of the transformed output must
    equal the test input, and the output must be sparse with the same
    format as the test input.
    """
    X_sparse = sparse.coo_matrix(scale(X_iris)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_iris, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    stacker = StackingClassifier(
        estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())],
        final_estimator=forest,
        cv=5,
        passthrough=True,
    )
    stacker.fit(X_train, y_train)

    transformed = stacker.transform(X_test)
    assert_allclose_dense_sparse(X_test, transformed[:, -4:])
    assert sparse.issparse(transformed)
    assert X_test.format == transformed.format
def test_stacking_regressor_drop_estimator():
    """Check that an estimator set to 'drop' is equivalent to omitting it.

    Two regressors are fitted on the same data: one lists only 'svr', the
    other lists 'lr' as 'drop' alongside 'svr'. Their predictions and
    transformed outputs must match.
    """
    # Scale up front (instead of through a pipeline) to avoid convergence
    # warnings while keeping the data directly usable in the asserts below.
    X_scaled = scale(X_diabetes)
    X_train, X_test, y_train, _ = train_test_split(
        X_scaled, y_diabetes, random_state=42
    )

    with_dropped = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    forest = RandomForestRegressor(n_estimators=10, random_state=42)

    reg = StackingRegressor(
        estimators=[('svr', LinearSVR(random_state=0))],
        final_estimator=forest,
        cv=5,
    )
    reg_drop = StackingRegressor(
        estimators=with_dropped, final_estimator=forest, cv=5
    )

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)

    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
def test_stacking_classifier_drop_estimator():
    """Check that an estimator set to 'drop' is equivalent to omitting it.

    Two classifiers are fitted on the same data: one lists only 'svc', the
    other lists 'lr' as 'drop' alongside 'svc'. Their predictions,
    predicted probabilities and transformed outputs must match.
    """
    # Scale up front (instead of through a pipeline) to avoid convergence
    # warnings while keeping the data directly usable in the asserts below.
    X_scaled = scale(X_iris)
    X_train, X_test, y_train, _ = train_test_split(
        X_scaled, y_iris, stratify=y_iris, random_state=42
    )

    with_dropped = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
    forest = RandomForestClassifier(n_estimators=10, random_state=42)

    clf = StackingClassifier(
        estimators=[('svc', LinearSVC(random_state=0))],
        final_estimator=forest,
        cv=5,
    )
    clf_drop = StackingClassifier(
        estimators=with_dropped, final_estimator=forest, cv=5
    )

    clf.fit(X_train, y_train)
    clf_drop.fit(X_train, y_train)

    assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
    assert_allclose(
        clf.predict_proba(X_test), clf_drop.predict_proba(X_test)
    )
    assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
def test_stacking_classifier_drop_column_binary_classification():
    """Check the number of meta-feature columns in binary classification.

    For both estimator combinations tried here, the transformed output is
    expected to have exactly two columns (one per base estimator).
    """
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(
        scale(X), y, stratify=y, random_state=42
    )

    # both classifiers implement 'predict_proba' and will both drop one column
    proba_estimators = [
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
    stacker = StackingClassifier(estimators=proba_estimators, cv=3)
    stacker.fit(X_train, y_train)
    assert stacker.transform(X_test).shape[1] == 2

    # LinearSVC does not implement 'predict_proba' and will not drop one column
    mixed_estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    stacker.set_params(estimators=mixed_estimators)
    stacker.fit(X_train, y_train)
    assert stacker.transform(X_test).shape[1] == 2
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """Exercise ``StackingClassifier`` with both estimators, then with 'lr'
    dropped.

    The test fits on a stratified split of the scaled ``X_iris`` data, calls
    the prediction methods, requires a test score above 0.8 and checks the
    width of the ``transform`` output: the meta-feature columns, plus the
    4 input columns appended at the end when ``passthrough`` is truthy.
    """
    # Scale up front (instead of through a pipeline) to avoid convergence
    # warnings while keeping X_test directly comparable in the asserts below.
    X_scaled = scale(X_iris)
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_iris, stratify=y_iris, random_state=42
    )

    def check_transform(n_meta_columns):
        # Width is the number of meta-features, plus the 4 passed-through
        # input columns when passthrough is enabled.
        transformed = clf.transform(X_test)
        n_expected = n_meta_columns + 4 if passthrough else n_meta_columns
        assert transformed.shape[1] == n_expected
        if passthrough:
            assert_allclose(X_test, transformed[:, -4:])

    base_estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    # Two estimators: six meta-feature columns are expected.
    check_transform(6)

    # Disable the logistic regression and refit with 'svc' only.
    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    # One estimator left: three meta-feature columns are expected.
    check_transform(3)
def test_stacking_regressor_error(y, params, type_err, msg_err):
    """Check that building and fitting a ``StackingRegressor`` with
    ``params`` raises ``type_err`` with a message matching ``msg_err``.

    The regressor is fitted on the scaled ``X_diabetes`` data with unit
    sample weights.
    """
    with pytest.raises(type_err, match=msg_err):
        # Construction and fitting both stay inside the context manager.
        stacker = StackingRegressor(**params, cv=3)
        n_samples = X_diabetes.shape[0]
        stacker.fit(
            scale(X_diabetes), y, sample_weight=np.ones(n_samples)
        )
def test_stacking_classifier_error(y, params, type_err, msg_err):
    """Check that building and fitting a ``StackingClassifier`` with
    ``params`` raises ``type_err`` with a message matching ``msg_err``.

    The classifier is fitted on the scaled ``X_iris`` data with unit sample
    weights.
    """
    with pytest.raises(type_err, match=msg_err):
        # Construction and fitting both stay inside the context manager.
        stacker = StackingClassifier(**params, cv=3)
        n_samples = X_iris.shape[0]
        stacker.fit(scale(X_iris), y, sample_weight=np.ones(n_samples))