import numpy as np

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import check_random_state

# NOTE: the private helpers below have moved between modules across
# scikit-learn releases; these import paths match the 0.22-era layout and may
# need adjusting for other versions.
from sklearn.inspection._partial_dependence import (
    _partial_dependence_brute,
    _partial_dependence_recursion,
)
from sklearn.tree.tests.test_tree import assert_is_subtree
from sklearn.utils import _IS_32BIT


# In the full test module this test is parametrized over (est, method) pairs
# and over target_feature via pytest.mark.parametrize; the parametrization is
# elided from this excerpt.
def test_partial_dependence_helpers(est, method, target_feature):
    # Check that what is returned by _partial_dependence_brute or
    # _partial_dependence_recursion is equivalent to manually setting a target
    # feature to a given value, and computing the average prediction over all
    # samples.
    # This also checks that the brute and recursion methods give the same
    # output.
    # Note that even on the training set, the brute and the recursion methods
    # aren't always strictly equivalent, in particular when the brute method
    # generates unrealistic samples that have low mass in the joint
    # distribution of the input features, and when some of the features are
    # dependent. Hence the high tolerance on the checks.

    X, y = make_regression(random_state=0, n_features=5, n_informative=5)
    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()

    est.fit(X, y)

    # target feature will be set to .5 and then to 123
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5],
                     [123]])

    if method == 'brute':
        pdp, predictions = _partial_dependence_brute(est, grid, features, X,
                                                     response_method='auto')
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    mean_predictions = []
    for val in (.5, 123):
        X_ = X.copy()
        X_[:, target_feature] = val
        mean_predictions.append(est.predict(X_).mean())

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))

    # allow for greater margin for error with recursion method
    rtol = 1e-1 if method == 'recursion' else 1e-3
    assert np.allclose(pdp, mean_predictions, rtol=rtol)
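
# The check above relies on the brute-force definition of partial dependence:
# the PD of `est` at grid value v is the average prediction after clamping
# the target feature to v for every sample in X. A minimal standalone sketch
# of that definition follows (illustrative only: the function name is made up
# and it is not part of the test suite; any fitted estimator with a `predict`
# method works).
def manual_brute_partial_dependence(est, X, target_feature, grid_values):
    averaged_predictions = []
    for value in grid_values:
        X_clamped = X.copy()
        # Overwrite the target feature with the grid value for all samples.
        X_clamped[:, target_feature] = value
        averaged_predictions.append(est.predict(X_clamped).mean())
    return np.asarray(averaged_predictions)
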

# In the full test module this test is parametrized over `seed` via
# pytest.mark.parametrize; the parametrization is elided from this excerpt.
def test_recursion_decision_tree_vs_forest_and_gbdt(seed):
    # Make sure that the recursion method gives the same results on a
    # DecisionTreeRegressor and a GradientBoostingRegressor or a
    # RandomForestRegressor with 1 tree and equivalent parameters.

    rng = np.random.RandomState(seed)

    # Purely random dataset to avoid correlated features
    n_samples = 1000
    n_features = 5
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples) * 10

    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()

    # set max_depth not too high to avoid splits with same gain but different
    # features
    max_depth = 5

    tree_seed = 0
    forest = RandomForestRegressor(n_estimators=1, max_features=None,
                                   bootstrap=False, max_depth=max_depth,
                                   random_state=tree_seed)
    # The forest will use ensemble.base._set_random_states to set the
    # random_state of the tree sub-estimator. We simulate this here to have
    # equivalent estimators.
    equiv_random_state = check_random_state(tree_seed).randint(
        np.iinfo(np.int32).max)
    gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1,
                                     criterion='mse', max_depth=max_depth,
                                     random_state=equiv_random_state)
    tree = DecisionTreeRegressor(max_depth=max_depth,
                                 random_state=equiv_random_state)

    forest.fit(X, y)
    gbdt.fit(X, y)
    tree.fit(X, y)

    # sanity check: if the trees aren't the same, the PD values won't be equal
    try:
        assert_is_subtree(tree.tree_, gbdt[0, 0].tree_)
        assert_is_subtree(tree.tree_, forest[0].tree_)
    except AssertionError:
        # For some reason the trees aren't exactly equal on 32 bits, so the
        # PDs cannot be equal either. See
        # https://github.com/scikit-learn/scikit-learn/issues/8853
        assert _IS_32BIT, "this should only fail on 32 bit platforms"
        return

    grid = rng.randn(50).reshape(-1, 1)
    for f in range(n_features):
        features = np.array([f], dtype=np.int32)

        pdp_forest = _partial_dependence_recursion(forest, grid, features)
        pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
        pdp_tree = _partial_dependence_recursion(tree, grid, features)

        np.testing.assert_allclose(pdp_gbdt, pdp_tree)
        np.testing.assert_allclose(pdp_forest, pdp_tree)
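
# The recursion method exercised above never materializes modified samples:
# it walks each tree once per grid value, following the chosen branch when a
# node splits on the target feature, and descending both children (weighted
# by their training-sample fractions) otherwise. Below is a simplified
# pure-Python sketch of that traversal for a single fitted tree with a single
# output (illustrative only: the function name is made up, and scikit-learn's
# actual implementation is in Cython and handles more cases).
def recursion_pd_single_tree(tree_, target_feature, grid_value):
    def walk(node, weight):
        if tree_.children_left[node] == -1:
            # Leaf: contribute its value, scaled by the accumulated weight.
            return weight * tree_.value[node, 0, 0]
        left = tree_.children_left[node]
        right = tree_.children_right[node]
        if tree_.feature[node] == target_feature:
            # Split on the target feature: follow the branch that the grid
            # value would take (sklearn sends x <= threshold to the left).
            child = left if grid_value <= tree_.threshold[node] else right
            return walk(child, weight)
        # Split on another feature: average both subtrees, weighted by the
        # fraction of training samples that reached each child.
        w = tree_.weighted_n_node_samples
        return (walk(left, weight * w[left] / w[node]) +
                walk(right, weight * w[right] / w[node]))

    return walk(0, 1.0)
# For instance, recursion_pd_single_tree(tree.tree_, f, grid[i, 0]) should
# match the i-th entry of pdp_tree above, up to the helper's output shape
# conventions.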