def test_partial_dependence_unknown_feature_indices(estimator, features):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = 'all features must be in'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])

def test_partial_dependence_unfitted(estimator):
    X = iris.data
    preprocessor = make_column_transformer(
        (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3])
    )
    pipe = make_pipeline(preprocessor, estimator)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(pipe, X, features=[0, 2], grid_resolution=10)
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        partial_dependence(estimator, X, features=[0, 2], grid_resolution=10)

def test_partial_dependence_slice_error(with_dataframe, err_msg):
    X, y = make_classification(random_state=0)
    if with_dataframe:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)
    estimator = LogisticRegression().fit(X, y)

    with pytest.raises(TypeError, match=err_msg):
        partial_dependence(estimator, X, features=slice(0, 2, 1))

def test_partial_dependence_unknown_feature_string(estimator):
    pd = pytest.importorskip("pandas")
    X, y = make_classification(random_state=0)
    df = pd.DataFrame(X)
    estimator.fit(df, y)

    features = ['random']
    err_msg = 'A given column is not a column of the dataframe'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, df, features)

def test_output_shape(Estimator, method, data, grid_resolution, features):
    # Check that partial_dependence has consistent output shape for different
    # kinds of estimators:
    # - classifiers with binary and multiclass settings
    # - regressors
    # - multi-task regressors

    est = Estimator()

    # n_targets corresponds to the number of classes (1 for binary classif) or
    # the number of tasks / outputs in multi-task settings. It's equal to 1
    # for classical regression.
    (X, y), n_targets = data

    est.fit(X, y)
    pdp, axes = partial_dependence(est, X=X, features=features,
                                   method=method,
                                   grid_resolution=grid_resolution)

    expected_pdp_shape = (n_targets,
                          *[grid_resolution for _ in range(len(features))])
    expected_axes_shape = (len(features), grid_resolution)

    assert pdp.shape == expected_pdp_shape
    assert axes is not None
    assert np.asarray(axes).shape == expected_axes_shape

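# Illustrative sketch (an addition, not part of the original test module): a
# concrete instance of the shape contract checked in test_output_shape above.
# For a binary classifier (n_targets == 1), two requested features and
# grid_resolution=5, the averaged predictions form a (1, 5, 5) array and one
# grid of 5 values is returned per feature. The estimator choice and the
# helper name are assumptions made only for this example.
def example_output_shape_binary_classifier():
    X, y = make_classification(random_state=0)
    est = LogisticRegression(max_iter=1000).fit(X, y)
    pdp, axes = partial_dependence(est, X=X, features=[0, 1],
                                   grid_resolution=5, method='brute')
    assert pdp.shape == (1, 5, 5)
    assert np.asarray(axes).shape == (2, 5)
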
def test_warning_recursion_non_constant_init():
    # make sure that passing a non-constant init parameter to a GBDT and using
    # the recursion method yields a warning.
    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
    gbc.fit(X, y)

    with pytest.warns(
            UserWarning,
            match='Using recursion method with a non-constant init predictor'):
        partial_dependence(gbc, X, [0], method='recursion')

    with pytest.warns(
            UserWarning,
            match='Using recursion method with a non-constant init predictor'):
        partial_dependence(gbc, X, [0], method='recursion')

def test_multiclass_multioutput(Estimator):
    # Make sure an error is raised for multiclass-multioutput classifiers

    # make a multiclass-multioutput dataset
    X, y = make_classification(n_classes=3, n_clusters_per_class=1,
                               random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    est.fit(X, y)

    with pytest.raises(
            ValueError,
            match="Multiclass-multioutput estimators are not supported"):
        partial_dependence(est, X, [0])

def test_partial_dependence_easy_target(est, power):
    # If the target y only depends on one feature in an obvious way (linear or
    # quadratic) then the partial dependence for that feature should reflect
    # it.
    # We here fit a linear regression model (with polynomial features if
    # needed) and compute r_squared to check that the partial dependence
    # correctly reflects the target.

    rng = np.random.RandomState(0)
    n_samples = 200
    target_variable = 2
    X = rng.normal(size=(n_samples, 5))
    y = X[:, target_variable]**power

    est.fit(X, y)

    averaged_predictions, values = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000)

    new_X = values[0].reshape(-1, 1)
    new_y = averaged_predictions[0]
    # add polynomial features if needed
    new_X = PolynomialFeatures(degree=power).fit_transform(new_X)

    lr = LinearRegression().fit(new_X, new_y)
    r2 = r2_score(new_y, lr.predict(new_X))

    assert r2 > .99

def test_recursion_decision_function(est, target_feature):
    # Make sure the recursion method (which implicitly uses
    # decision_function) gives the same result as the brute method with
    # response_method='decision_function'.

    X, y = make_classification(n_classes=2, n_clusters_per_class=1,
                               random_state=1)
    assert np.mean(y) == .5  # make sure the init estimator predicts 0 anyway

    est.fit(X, y)

    preds_1, _ = partial_dependence(est, X, [target_feature],
                                    response_method='decision_function',
                                    method='recursion')
    preds_2, _ = partial_dependence(est, X, [target_feature],
                                    response_method='decision_function',
                                    method='brute')

    assert_allclose(preds_1, preds_2, atol=1e-7)

def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that partial dependence supports dataframes and pipelines
    # including a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    pipe = make_pipeline(preprocessor, estimator)
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features,
                                               grid_resolution=10)

    # the column transformer will reorder the columns when transforming;
    # we mixed up the indices to make sure that we are computing the partial
    # dependence of the right columns
    if preprocessor is not None:
        X_proc = clone(preprocessor).fit_transform(df)
        features_clf = [0, 1]
    else:
        X_proc = df
        features_clf = [0, 2]

    clf = clone(estimator).fit(X_proc, iris.target)
    pdp_clf, values_clf = partial_dependence(clf, X_proc,
                                             features=features_clf,
                                             method='brute',
                                             grid_resolution=10)

    assert_allclose(pdp_pipe, pdp_clf)
    if preprocessor is not None:
        scaler = preprocessor.named_transformers_['standardscaler']
        assert_allclose(values_pipe[1],
                        values_clf[1] * scaler.scale_[1] + scaler.mean_[1])
    else:
        assert_allclose(values_pipe[1], values_clf[1])

def test_partial_dependence_pipeline():
    # check that partial dependence supports pipelines
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(pipe, iris.data,
                                               features=[features],
                                               grid_resolution=10)
    pdp_clf, values_clf = partial_dependence(clf, scaler.transform(iris.data),
                                             features=[features],
                                             grid_resolution=10)
    assert_allclose(pdp_pipe, pdp_clf)
    assert_allclose(
        values_pipe[0],
        values_clf[0] * scaler.scale_[features] + scaler.mean_[features]
    )

def test_partial_dependence_feature_type(features, expected_pd_shape):
    # check all the feature specifications supported in PDP
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    preprocessor = make_column_transformer(
        (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
        (RobustScaler(), [iris.feature_names[i] for i in (1, 3)])
    )
    pipe = make_pipeline(preprocessor,
                         LogisticRegression(max_iter=1000, random_state=0))
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features,
                                               grid_resolution=10)
    assert pdp_pipe.shape == expected_pd_shape
    assert len(values_pipe) == len(pdp_pipe.shape) - 1

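# Illustrative sketch (an addition, not part of the original test module):
# with a dataframe, a feature can be requested either by positional index or
# by column name, and both give the same partial dependence. The helper name
# and the estimator are assumptions made only for this example.
def example_feature_by_index_or_name():
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    clf = LogisticRegression(max_iter=1000, random_state=0)
    clf.fit(df, iris.target)
    pdp_by_index, _ = partial_dependence(clf, df, features=[0],
                                         grid_resolution=10)
    pdp_by_name, _ = partial_dependence(clf, df,
                                        features=[iris.feature_names[0]],
                                        grid_resolution=10)
    assert_allclose(pdp_by_index, pdp_by_name)
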
def test_partial_dependence_sample_weight():
    # Test near-perfect correlation between partial dependence and diagonal
    # when sample weights emphasize y = x predictions
    # non-regression test for #13193
    # TODO: extend to HistGradientBoosting once sample_weight is supported
    N = 1000
    rng = np.random.RandomState(123456)

    mask = rng.randint(2, size=N, dtype=bool)
    x = rng.rand(N)
    # set y = x on mask and y = -x outside
    y = x.copy()
    y[~mask] = -y[~mask]
    X = np.c_[mask, x]
    # sample weights to emphasize data points where y = x
    sample_weight = np.ones(N)
    sample_weight[mask] = 1000.

    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    pdp, values = partial_dependence(clf, X, features=[1])

    assert np.corrcoef(pdp, values)[0, 1] > 0.99

def test_partial_dependence_X_list(estimator):
    # check that array-like objects are accepted
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)
    partial_dependence(estimator, list(X), [0])

def test_partial_dependence_error(estimator, params, err_msg):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, **params)