def test_oob_improvement_raise(): # Test if oob improvement has correct shape. clf = GradientBoostingClassifier(n_estimators=100, random_state=1, subsample=1.0) clf.fit(X, y) assert_raises(AttributeError, lambda: clf.oob_improvement_)
def test_check_inputs_predict(): # X has wrong shape clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y) x = np.array([1.0, 2.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) x = np.array([[]]) assert_raises(ValueError, clf.predict, x) x = np.array([1.0, 2.0, 3.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) clf = GradientBoostingRegressor(n_estimators=100, random_state=1) clf.fit(X, rng.rand(len(X))) x = np.array([1.0, 2.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) x = np.array([[]]) assert_raises(ValueError, clf.predict, x) x = np.array([1.0, 2.0, 3.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x)
def test_verbose_output(): # Check verbose=1 does not cause error. from sklearn.externals.six.moves import cStringIO as StringIO import sys old_stdout = sys.stdout sys.stdout = StringIO() clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=1, subsample=0.8) clf.fit(X, y) verbose_output = sys.stdout sys.stdout = old_stdout # check output verbose_output.seek(0) header = verbose_output.readline().rstrip() # with OOB true_header = ' '.join(['%10s'] + ['%16s'] * 3) % ( 'Iter', 'Train Loss', 'OOB Improve', 'Remaining Time') assert_equal(true_header, header) n_lines = sum(1 for l in verbose_output.readlines()) # one for 1-10 and then 9 for 20-100 assert_equal(10 + 9, n_lines)
def check_classification_synthetic(presort, loss): # Test GradientBoostingClassifier on synthetic dataset used by # Hastie et al. in ESLII Example 12.7. X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) X_train, X_test = X[:2000], X[2000:] y_train, y_test = y[:2000], y[2000:] gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2, max_depth=1, loss=loss, learning_rate=1.0, random_state=0) gbrt.fit(X_train, y_train) error_rate = (1.0 - gbrt.score(X_test, y_test)) assert_less(error_rate, 0.09) gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=1, loss=loss, learning_rate=1.0, subsample=0.5, random_state=0, presort=presort) gbrt.fit(X_train, y_train) error_rate = (1.0 - gbrt.score(X_test, y_test)) assert_less(error_rate, 0.08)
def test_plot_partial_dependence_input(): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) # not fitted yet assert_raises(ValueError, plot_partial_dependence, clf, X, [0]) clf.fit(X, y) assert_raises(ValueError, plot_partial_dependence, clf, np.array(X)[:, :0], [0]) # first argument must be an instance of BaseGradientBoosting assert_raises(ValueError, plot_partial_dependence, {}, X, [0]) # must be larger than -1 assert_raises(ValueError, plot_partial_dependence, clf, X, [-1]) # too large feature value assert_raises(ValueError, plot_partial_dependence, clf, X, [100]) # str feature but no feature_names assert_raises(ValueError, plot_partial_dependence, clf, X, ['foobar']) # not valid features value assert_raises(ValueError, plot_partial_dependence, clf, X, [{ 'foo': 'bar' }])
def test_staged_predict_proba(): # Test whether staged predict proba eventually gives # the same prediction. X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise NotFittedError if not fitted assert_raises( NotFittedError, lambda X: np.fromiter(clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_equal(clf.predict_proba(X_test), staged_proba)
def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(X, y) assert_raises(ValueError, partial_dependence, clf, [0], grid=None, X=None) assert_raises(ValueError, partial_dependence, clf, [0], grid=[0, 1], X=X) # first argument must be an instance of BaseGradientBoosting assert_raises(ValueError, partial_dependence, {}, [0], X=X) # Gradient boosting estimator must be fit assert_raises(ValueError, partial_dependence, GradientBoostingClassifier(), [0], X=X) assert_raises(ValueError, partial_dependence, clf, [-1], X=X) assert_raises(ValueError, partial_dependence, clf, [100], X=X) # wrong ndim for grid grid = np.random.rand(10, 2, 1) assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
def test_warm_start_wo_nestimators_change(): # Test if warm_start does nothing if n_estimators is not changed. # Regression test for #3513. clf = GradientBoostingClassifier(n_estimators=10, warm_start=True) clf.fit([[0, 1], [2, 3]], [0, 1]) assert_equal(clf.estimators_.shape[0], 10) clf.fit([[0, 1], [2, 3]], [0, 1]) assert_equal(clf.estimators_.shape[0], 10)
def test_symbol_labels(): # Test with non-integer class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) symbol_y = tosequence(map(str, y)) clf.fit(X, symbol_y) assert_array_equal(clf.predict(T), tosequence(map(str, true_result))) assert_equal(100, len(clf.estimators_))
def test_non_uniform_weights_toy_edge_case_clf(): X = [[1, 0], [1, 0], [1, 0], [0, 1]] y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] for loss in ('deviance', 'exponential'): gb = GradientBoostingClassifier(n_estimators=5) gb.fit(X, y, sample_weight=sample_weight) assert_array_equal(gb.predict([[1, 0]]), [1])
def test_oob_multilcass_iris(): # Check OOB improvement on multi-class dataset. clf = GradientBoostingClassifier(n_estimators=100, loss='deviance', random_state=1, subsample=0.5) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) assert_greater(score, 0.9) assert_equal(clf.oob_improvement_.shape[0], clf.n_estimators)
def test_float_class_labels(): # Test with float class labels. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) float_y = np.asarray(y, dtype=np.float32) clf.fit(X, float_y) assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32)) assert_equal(100, len(clf.estimators_))
def test_oob_improvement(): # Test if oob improvement has correct shape and regression test. clf = GradientBoostingClassifier(n_estimators=100, random_state=1, subsample=0.5) clf.fit(X, y) assert_equal(clf.oob_improvement_.shape[0], 100) # hard-coded regression test - change if modification in OOB computation assert_array_almost_equal(clf.oob_improvement_[:5], np.array([0.19, 0.15, 0.12, -0.12, -0.11]), decimal=2)
def test_degenerate_targets(): # Check if we can fit even though all targets are equal. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) # classifier should raise exception assert_raises(ValueError, clf.fit, X, np.ones(len(X))) clf = GradientBoostingRegressor(n_estimators=100, random_state=1) clf.fit(X, np.ones(len(X))) clf.predict([rng.rand(2)]) assert_array_equal(np.ones((1, ), dtype=np.float64), clf.predict([rng.rand(2)]))
def check_iris(presort, subsample, sample_weight): # Check consistency on dataset iris. clf = GradientBoostingClassifier(n_estimators=100, loss='deviance', random_state=1, subsample=subsample, presort=presort) clf.fit(iris.data, iris.target, sample_weight=sample_weight) score = clf.score(iris.data, iris.target) assert_greater(score, 0.9) leaves = clf.apply(iris.data) assert_equal(leaves.shape, (150, 100, 3))
def test_partial_dependence_multiclass(): # Test partial dependence for multi-class classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) grid_resolution = 25 n_classes = clf.n_classes_ pdp, axes = partial_dependence(clf, [0], X=iris.data, grid_resolution=grid_resolution) assert pdp.shape == (n_classes, grid_resolution) assert len(axes) == 1 assert axes[0].shape[0] == grid_resolution
def test_mem_layout(): # Test with different memory layouts of X and y X_ = np.asfortranarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) X_ = np.ascontiguousarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.ascontiguousarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.asfortranarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def test_complete_classification(): # Test greedy trees with max_depth + 1 leafs. from sklearn.tree._tree import TREE_LEAF X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) k = 4 est = GradientBoostingClassifier(n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1) est.fit(X, y) tree = est.estimators_[0, 0].tree_ assert_equal(tree.max_depth, k) assert_equal(tree.children_left[tree.children_left == TREE_LEAF].shape[0], k + 1)
def test_max_feature_regression(): # Test to make sure random state is set properly. X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) X_train, X_test = X[:2000], X[2000:] y_train, y_test = y[:2000], y[2000:] gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5, max_depth=2, learning_rate=.1, max_features=2, random_state=1) gbrt.fit(X_train, y_train) deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test)) assert_true(deviance < 0.5, "GB failed with deviance %.4f" % deviance)
def test_probability_log(): # Predict probabilities. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert_true(np.all(y_proba >= 0.0)) assert_true(np.all(y_proba <= 1.0)) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result)
def test_partial_dependence_classifier(): # Test partial dependence for classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(X, y) pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5) # only 4 grid points instead of 5 because only 4 unique X[:,0] vals assert pdp.shape == (1, 4) assert axes[0].shape[0] == 4 # now with our own grid X_ = np.asarray(X) grid = np.unique(X_[:, 0]) pdp_2, axes = partial_dependence(clf, [0], grid=grid) assert axes is None assert_array_equal(pdp, pdp_2)
def test_serialization(): # Check model serialization. clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) try: import cPickle as pickle except ImportError: import pickle serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL) clf = None clf = pickle.loads(serialized_clf) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
def check_classification_toy(presort, loss): # Check classification on a toy dataset. clf = GradientBoostingClassifier(loss=loss, n_estimators=10, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(10, len(clf.estimators_)) deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:]) assert_true(np.any(deviance_decrease >= 0.0)) leaves = clf.apply(X) assert_equal(leaves.shape, (6, 10, 1))
def test_probability_exponential(): # Predict probabilities. clf = GradientBoostingClassifier(loss='exponential', n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert_true(np.all(y_proba >= 0.0)) assert_true(np.all(y_proba <= 1.0)) score = clf.decision_function(T).ravel() assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score))) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result)
def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) grid_resolution = 25 fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], label=0, grid_resolution=grid_resolution) assert len(axs) == 2 assert all(ax.has_data for ax in axs) # now with symbol labels target = iris.target_names[iris.target] clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, target) grid_resolution = 25 fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], label='setosa', grid_resolution=grid_resolution) assert len(axs) == 2 assert all(ax.has_data for ax in axs) # label not in gbrt.classes_ assert_raises(ValueError, plot_partial_dependence, clf, iris.data, [0, 1], label='foobar', grid_resolution=grid_resolution) # label not provided assert_raises(ValueError, plot_partial_dependence, clf, iris.data, [0, 1], grid_resolution=grid_resolution)
def test_more_verbose_output(): # Check verbose=2 does not cause error. from sklearn.externals.six.moves import cStringIO as StringIO import sys old_stdout = sys.stdout sys.stdout = StringIO() clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=2) clf.fit(X, y) verbose_output = sys.stdout sys.stdout = old_stdout # check output verbose_output.seek(0) header = verbose_output.readline().rstrip() # no OOB true_header = ' '.join(['%10s'] + ['%16s'] * 2) % ('Iter', 'Train Loss', 'Remaining Time') assert_equal(true_header, header) n_lines = sum(1 for l in verbose_output.readlines()) # 100 lines for n_estimators==100 assert_equal(100, n_lines)
def test_zero_estimator_clf(): # Test if ZeroEstimator works for classification. X = iris.data y = np.array(iris.target) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init=ZeroEstimator()) est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) # binary clf mask = y != 0 y[mask] = 1 y[~mask] = 0 est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='foobar') assert_raises(ValueError, est.fit, X, y)
def test_max_feature_auto(): # Test if max features is set properly for floats and str. X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) _, n_features = X.shape X_train = X[:2000] y_train = y[:2000] gbrt = GradientBoostingClassifier(n_estimators=1, max_features='auto') gbrt.fit(X_train, y_train) assert_equal(gbrt.max_features_, int(np.sqrt(n_features))) gbrt = GradientBoostingRegressor(n_estimators=1, max_features='auto') gbrt.fit(X_train, y_train) assert_equal(gbrt.max_features_, n_features) gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3) gbrt.fit(X_train, y_train) assert_equal(gbrt.max_features_, int(n_features * 0.3)) gbrt = GradientBoostingRegressor(n_estimators=1, max_features='sqrt') gbrt.fit(X_train, y_train) assert_equal(gbrt.max_features_, int(np.sqrt(n_features))) gbrt = GradientBoostingRegressor(n_estimators=1, max_features='log2') gbrt.fit(X_train, y_train) assert_equal(gbrt.max_features_, int(np.log2(n_features))) gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.01 / X.shape[1]) gbrt.fit(X_train, y_train) assert_equal(gbrt.max_features_, 1)