def test_oob_improvement_raise(): # Test if oob improvement has correct shape. clf = GradientBoostingClassifier(n_estimators=100, random_state=1, subsample=1.0) clf.fit(X, y) assert_raises(AttributeError, lambda: clf.oob_improvement_)
def test_mean_variance_axis1(): X, _ = make_classification(5, 4, random_state=0) # Sparsify the array a little bit X[0, 0] = 0 X[2, 1] = 0 X[4, 3] = 0 X_lil = sp.lil_matrix(X) X_lil[1, 0] = 0 X[1, 0] = 0 assert_raises(TypeError, mean_variance_axis, X_lil, axis=1) X_csr = sp.csr_matrix(X_lil) X_csc = sp.csc_matrix(X_lil) expected_dtypes = [(np.float32, np.float32), (np.float64, np.float64), (np.int32, np.float64), (np.int64, np.float64)] for input_dtype, output_dtype in expected_dtypes: X_test = X.astype(input_dtype) for X_sparse in (X_csr, X_csc): X_sparse = X_sparse.astype(input_dtype) X_means, X_vars = mean_variance_axis(X_sparse, axis=0) assert_equal(X_means.dtype, output_dtype) assert_equal(X_vars.dtype, output_dtype) assert_array_almost_equal(X_means, np.mean(X_test, axis=0)) assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
def test_zero_estimator_reg(): # Test if ZeroEstimator works for regression. est = GradientBoostingRegressor(n_estimators=20, max_depth=1, random_state=1, init=ZeroEstimator()) est.fit(boston.data, boston.target) y_pred = est.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_almost_equal(mse, 33.0, decimal=0) est = GradientBoostingRegressor(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(boston.data, boston.target) y_pred = est.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_almost_equal(mse, 33.0, decimal=0) est = GradientBoostingRegressor(n_estimators=20, max_depth=1, random_state=1, init='foobar') assert_raises(ValueError, est.fit, boston.data, boston.target)
def check_boston(presort, loss, subsample): # Check consistency on dataset boston house prices with least squares # and least absolute deviation. ones = np.ones(len(boston.target)) last_y_pred = None for sample_weight in None, ones, 2 * ones: clf = GradientBoostingRegressor(n_estimators=100, loss=loss, max_depth=4, subsample=subsample, min_samples_split=2, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, boston.data) clf.fit(boston.data, boston.target, sample_weight=sample_weight) leaves = clf.apply(boston.data) assert_equal(leaves.shape, (506, 100)) y_pred = clf.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_less(mse, 6.0) if last_y_pred is not None: assert_array_almost_equal(last_y_pred, y_pred) last_y_pred = y_pred
def check_min_samples_leaf(name): X, y = hastie_X, hastie_y # Test if leaves contain more than leaf_count training examples ForestEstimator = FOREST_ESTIMATORS[name] # test boundary value assert_raises(ValueError, ForestEstimator(min_samples_leaf=-1).fit, X, y) assert_raises(ValueError, ForestEstimator(min_samples_leaf=0).fit, X, y) est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name)) est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), len(X) * 0.25 - 1, "Failed with {0}".format(name))
def test_csc_row_median(): # Test csc_row_median actually calculates the median. # Test that it gives the same output when X is dense. rng = np.random.RandomState(0) X = rng.rand(100, 50) dense_median = np.median(X, axis=0) csc = sp.csc_matrix(X) sparse_median = csc_median_axis_0(csc) assert_array_equal(sparse_median, dense_median) # Test that it gives the same output when X is sparse X = rng.rand(51, 100) X[X < 0.7] = 0.0 ind = rng.randint(0, 50, 10) X[ind] = -X[ind] csc = sp.csc_matrix(X) dense_median = np.median(X, axis=0) sparse_median = csc_median_axis_0(csc) assert_array_equal(sparse_median, dense_median) # Test for toy data. X = [[0, -2], [-1, -1], [1, 0], [2, 1]] csc = sp.csc_matrix(X) assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5])) X = [[0, -2], [-1, -5], [1, -3]] csc = sp.csc_matrix(X) assert_array_equal(csc_median_axis_0(csc), np.array([0., -3])) # Test that it raises an Error for non-csc matrices. assert_raises(TypeError, csc_median_axis_0, sp.csr_matrix(X))
def test_staged_predict_proba(): # Test whether staged predict proba eventually gives # the same prediction. X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise NotFittedError if not fitted assert_raises( NotFittedError, lambda X: np.fromiter(clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_equal(clf.predict_proba(X_test), staged_proba)
def test_inplace_row_scale(): rng = np.random.RandomState(0) X = sp.rand(100, 200, 0.05) Xr = X.tocsr() Xc = X.tocsc() XA = X.toarray() scale = rng.rand(100) XA *= scale.reshape(-1, 1) inplace_row_scale(Xc, scale) inplace_row_scale(Xr, scale) assert_array_almost_equal(Xr.toarray(), Xc.toarray()) assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) X = X.astype(np.float32) scale = scale.astype(np.float32) Xr = X.tocsr() Xc = X.tocsc() XA = X.toarray() XA *= scale.reshape(-1, 1) inplace_row_scale(Xc, scale) inplace_row_scale(Xr, scale) assert_array_almost_equal(Xr.toarray(), Xc.toarray()) assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)
def test_warm_start_smaller_n_estimators(): # Test if warm start with smaller n_estimators raises error X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: est = Cls(n_estimators=100, max_depth=1, warm_start=True) est.fit(X, y) est.set_params(n_estimators=99) assert_raises(ValueError, est.fit, X, y)
def check_warm_start_smaller_n_estimators(name): # Test if warm start second fit with smaller n_estimators raises error. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) clf.fit(X, y) clf.set_params(n_estimators=4) assert_raises(ValueError, clf.fit, X, y)
def test_get_params(): test = T(K(), K()) assert_true('a__d' in test.get_params(deep=True)) assert_true('a__d' not in test.get_params(deep=False)) test.set_params(a__d=2) assert_true(test.a.d == 2) assert_raises(ValueError, test.set_params, a__a=2)
def check_1d_input(name, X, X_2d, y): ForestEstimator = FOREST_ESTIMATORS[name] assert_raises(ValueError, ForestEstimator(n_estimators=1, random_state=0).fit, X, y) est = ForestEstimator(random_state=0) est.fit(X_2d, y) if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: assert_raises(ValueError, est.predict, X)
def test_compute_class_weight_not_present(): # Raise error when y does not contain all class labels classes = np.arange(4) y = np.asarray([0, 0, 0, 1, 1, 2]) assert_raises(ValueError, compute_class_weight, "auto", classes, y) assert_raises(ValueError, compute_class_weight, "balanced", classes, y) # Raise error when y has items not in classes classes = np.arange(2) assert_raises(ValueError, compute_class_weight, "auto", classes, y) assert_raises(ValueError, compute_class_weight, "balanced", classes, y) assert_raises(ValueError, compute_class_weight, {0: 1., 1: 2.}, classes, y)
def test_check_inputs(): # Test input checks (shape and type of X and y). clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.fit, X, y + [0, 1]) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.fit, X, y, sample_weight=([1] * len(y)) + [0, 1])
def test_degenerate_targets(): # Check if we can fit even though all targets are equal. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) # classifier should raise exception assert_raises(ValueError, clf.fit, X, np.ones(len(X))) clf = GradientBoostingRegressor(n_estimators=100, random_state=1) clf.fit(X, np.ones(len(X))) clf.predict([rng.rand(2)]) assert_array_equal(np.ones((1, ), dtype=np.float64), clf.predict([rng.rand(2)]))
def test_make_rng(): # Check the check_random_state utility function behavior assert_true(check_random_state(None) is np.random.mtrand._rand) assert_true(check_random_state(np.random) is np.random.mtrand._rand) rng_42 = np.random.RandomState(42) assert_true(check_random_state(42).randint(100) == rng_42.randint(100)) rng_42 = np.random.RandomState(42) assert_true(check_random_state(rng_42) is rng_42) rng_42 = np.random.RandomState(42) assert_true(check_random_state(43).randint(100) != rng_42.randint(100)) assert_raises(ValueError, check_random_state, "some invalid seed")
def test_probability_log(): # Predict probabilities. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert_true(np.all(y_proba >= 0.0)) assert_true(np.all(y_proba <= 1.0)) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result)
def test_count_nonzero(): X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) X_nonzero = X != 0 sample_weight = [.5, .2, .3, .1, .1] X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None] for axis in [0, 1, -1, -2, None]: assert_array_almost_equal(count_nonzero(X_csr, axis=axis), X_nonzero.sum(axis=axis)) assert_array_almost_equal( count_nonzero(X_csr, axis=axis, sample_weight=sample_weight), X_nonzero_weighted.sum(axis=axis)) assert_raises(TypeError, count_nonzero, X_csc) assert_raises(ValueError, count_nonzero, X_csr, axis=2)
def check_classification_toy(presort, loss): # Check classification on a toy dataset. clf = GradientBoostingClassifier(loss=loss, n_estimators=10, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(10, len(clf.estimators_)) deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:]) assert_true(np.any(deviance_decrease >= 0.0)) leaves = clf.apply(X) assert_equal(leaves.shape, (6, 10, 1))
def test_inplace_swap_column(): X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) swap = linalg.get_blas_funcs(('swap', ), (X, )) swap = swap[0] X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1]) inplace_swap_column(X_csr, 0, -1) inplace_swap_column(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1]) inplace_swap_column(X_csr, 0, 1) inplace_swap_column(X_csc, 0, 1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) assert_raises(TypeError, inplace_swap_column, X_csr.tolil()) X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) swap = linalg.get_blas_funcs(('swap', ), (X, )) swap = swap[0] X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1]) inplace_swap_column(X_csr, 0, -1) inplace_swap_column(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1]) inplace_swap_column(X_csr, 0, 1) inplace_swap_column(X_csc, 0, 1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) assert_raises(TypeError, inplace_swap_column, X_csr.tolil())
def test_staged_predict(): # Test whether staged decision function eventually gives # the same prediction. X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0) X_train, y_train = X[:200], y[:200] X_test = X[200:] clf = GradientBoostingRegressor() # test raise ValueError if not fitted assert_raises( ValueError, lambda X: np.fromiter(clf.staged_predict(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # test if prediction for last stage equals ``predict`` for y in clf.staged_predict(X_test): assert_equal(y.shape, y_pred.shape) assert_array_equal(y_pred, y)
def test_column_or_1d(): EXAMPLES = [ ("binary", ["spam", "egg", "spam"]), ("binary", [0, 1, 0, 1]), ("continuous", np.arange(10) / 20.), ("multiclass", [1, 2, 3]), ("multiclass", [0, 1, 2, 2, 0]), ("multiclass", [[1], [2], [3]]), ("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]), ("multiclass-multioutput", [[1, 2, 3]]), ("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]), ("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]), ("multiclass-multioutput", [[1, 2, 3]]), ("continuous-multioutput", np.arange(30).reshape((-1, 3))), ] for y_type, y in EXAMPLES: if y_type in ["binary", 'multiclass', "continuous"]: assert_array_equal(column_or_1d(y), np.ravel(y)) else: assert_raises(ValueError, column_or_1d, y)
def test_probability_exponential(): # Predict probabilities. clf = GradientBoostingClassifier(loss='exponential', n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert_true(np.all(y_proba >= 0.0)) assert_true(np.all(y_proba <= 1.0)) score = clf.decision_function(T).ravel() assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score))) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result)
def check_min_samples_split(name): X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # test boundary value assert_raises(ValueError, ForestEstimator(min_samples_split=-1).fit, X, y) assert_raises(ValueError, ForestEstimator(min_samples_split=0).fit, X, y) assert_raises(ValueError, ForestEstimator(min_samples_split=1.1).fit, X, y) est = ForestEstimator(min_samples_split=10, n_estimators=1, random_state=0) est.fit(X, y) node_idx = est.estimators_[0].tree_.children_left != -1 node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] assert_greater(np.min(node_samples), len(X) * 0.5 - 1, "Failed with {0}".format(name)) est = ForestEstimator(min_samples_split=0.5, n_estimators=1, random_state=0) est.fit(X, y) node_idx = est.estimators_[0].tree_.children_left != -1 node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] assert_greater(np.min(node_samples), len(X) * 0.5 - 1, "Failed with {0}".format(name))
def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) grid_resolution = 25 fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], label=0, grid_resolution=grid_resolution) assert len(axs) == 2 assert all(ax.has_data for ax in axs) # now with symbol labels target = iris.target_names[iris.target] clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, target) grid_resolution = 25 fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], label='setosa', grid_resolution=grid_resolution) assert len(axs) == 2 assert all(ax.has_data for ax in axs) # label not in gbrt.classes_ assert_raises(ValueError, plot_partial_dependence, clf, iris.data, [0, 1], label='foobar', grid_resolution=grid_resolution) # label not provided assert_raises(ValueError, plot_partial_dependence, clf, iris.data, [0, 1], grid_resolution=grid_resolution)
def test_min_max_axis_errors(): X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) assert_raises(TypeError, min_max_axis, X_csr.tolil(), axis=0) assert_raises(ValueError, min_max_axis, X_csr, axis=2) assert_raises(ValueError, min_max_axis, X_csc, axis=-3)
def test_zero_estimator_clf(): # Test if ZeroEstimator works for classification. X = iris.data y = np.array(iris.target) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init=ZeroEstimator()) est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) # binary clf mask = y != 0 y[mask] = 1 y[~mask] = 0 est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='foobar') assert_raises(ValueError, est.fit, X, y)
def check_edge_case_of_sample_int(sample_without_replacement): # n_population < n_sample assert_raises(ValueError, sample_without_replacement, 0, 1) assert_raises(ValueError, sample_without_replacement, 1, 2) # n_population == n_samples assert_equal(sample_without_replacement(0, 0).shape, (0, )) assert_equal(sample_without_replacement(1, 1).shape, (1, )) # n_population >= n_samples assert_equal(sample_without_replacement(5, 0).shape, (0, )) assert_equal(sample_without_replacement(5, 1).shape, (1, )) # n_population < 0 or n_samples < 0 assert_raises(ValueError, sample_without_replacement, -1, 5) assert_raises(ValueError, sample_without_replacement, 5, -1)
def test_clone_buggy(): # Check that clone raises an error on buggy estimators. buggy = Buggy() buggy.a = 2 assert_raises(RuntimeError, clone, buggy) no_estimator = NoEstimator() assert_raises(TypeError, clone, no_estimator) varg_est = VargEstimator() assert_raises(RuntimeError, clone, varg_est)
def test_resample(): # Border case not worth mentioning in doctests assert_true(resample() is None) # Check that invalid arguments yield ValueError assert_raises(ValueError, resample, [0], [0, 1]) assert_raises(ValueError, resample, [0, 1], [0, 1], replace=False, n_samples=3) assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42) # Issue:6581, n_samples can be more when replace is True (default). assert_equal(len(resample([1, 2], n_samples=5)), 5)