def test_plot_grower(tmpdir, classification_data):
    """Check that a manually grown tree can be rendered by ``plot_tree``.

    The test is skipped when ``graphviz`` cannot be imported. Gradients and
    hessians of a binary cross-entropy loss are evaluated at the baseline
    prediction, a ``TreeGrower`` capped at 5 leaf nodes is grown from them,
    and the resulting tree is plotted to a file inside ``tmpdir``.
    """
    pytest.importorskip('graphviz')
    from pygbm.plotting import plot_tree

    dataset = Dataset(classification_data[0], classification_data[1])
    n_rows = dataset.shape[0]
    n_trees_per_iteration = 1

    loss = BinaryCrossEntropy()
    clf = GradientBoostingClassifier()
    gradients, hessians = loss.init_gradients_and_hessians(
        n_samples=n_rows,
        prediction_dim=n_trees_per_iteration,
    )

    # Start from the constant baseline prediction, as the first boosting
    # iteration would.
    encoded_y = clf._encode_y(dataset.y)
    baseline = loss.get_baseline_prediction(encoded_y, 1)
    raw_predictions = np.zeros(
        shape=(n_rows, n_trees_per_iteration),
        dtype=baseline.dtype,
    )
    raw_predictions += baseline
    loss.update_gradients_and_hessians(
        gradients, hessians, encoded_y, raw_predictions)

    options = OptionSet(clf.parameter_dict)
    options['max_leaf_nodes'] = 5
    grower = TreeGrower(dataset, gradients, hessians, options)
    grower.grow()

    target_file = tmpdir.join('plot_grower.pdf')
    plot_tree(grower, view=False, filename=target_file)
    assert target_file.exists()
def test_early_stopping_classification(data, scoring, validation_split,
                                       n_iter_no_change, tol):
    """Check the number of boosting iterations with and without early stopping.

    When ``validation_split`` is not None, that share of ``data`` is held out
    and passed to ``fit`` as ``eval_set``; otherwise no evaluation set is
    given. With ``n_iter_no_change == -1`` the classifier is expected to run
    all ``max_iter`` iterations; in every other case it is expected to stop
    early, but not before ``n_iter_no_change`` iterations.
    """
    max_iter = 500
    X, y = data

    eval_set = None
    if validation_split is not None:
        X, X_test, y, y_test = train_test_split(
            X, y, test_size=validation_split, random_state=42)
        eval_set = (X_test, y_test)

    gb = GradientBoostingClassifier(
        verbose=True,  # just for coverage
        scoring=scoring,
        tol=tol,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb.fit(X, y, eval_set=eval_set)

    if n_iter_no_change == -1:
        assert gb.n_iter_ == max_iter
    else:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
def test_plot_estimator_and_lightgbm(tmpdir):
    """Check that every tree of a pygbm / LightGBM pair can be plotted.

    The test is skipped when ``graphviz`` or ``lightgbm`` cannot be imported.
    Both estimators are fit on the same 3-class problem with ``n_trees``
    boosting iterations, and ``plot_tree`` is called once per tree index in
    ``range(n_trees * n_classes)``.

    Each tree is written to its own file. Reusing a single file name would
    make the ``exists()`` assertion trivially true after the first iteration
    and hide a failure to render any later tree.
    """
    pytest.importorskip('graphviz')
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.plotting import plot_tree

    n_classes = 3
    X, y = make_classification(n_samples=150, n_classes=n_classes,
                               n_features=5, n_informative=3, n_redundant=0,
                               random_state=0)

    n_trees = 3
    est_pygbm = GradientBoostingClassifier(max_iter=n_trees,
                                           n_iter_no_change=None)
    est_pygbm.fit(X, y)
    est_lightgbm = lightgbm.LGBMClassifier(n_estimators=n_trees)
    est_lightgbm.fit(X, y)

    n_total_trees = n_trees * n_classes
    for i in range(n_total_trees):
        # One output file per tree so that each assertion is meaningful.
        filename = tmpdir.join('plot_mixed_predictors_{}.pdf'.format(i))
        plot_tree(est_pygbm, est_lightgbm=est_lightgbm, tree_index=i,
                  view=False, filename=filename)
        assert filename.exists()
def test_early_stopping_loss(n_samples, max_iter, n_iter_no_change, tree_type):
    """Check that ``scoring=None`` early-stops on the loss.

    Using ``scoring='neg_log_loss'`` and ``scoring=None`` should be
    equivalent since the loss is precisely the negative log likelihood, so
    both classifiers are expected to stop at the same iteration.

    The models are fit on the training part of the split only, and the
    held-out part is passed as ``eval_set``. Fitting on the full data would
    put the validation samples in the training set. The split is seeded so
    that the test is reproducible.
    """
    X, y = make_classification(n_samples, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.1, random_state=0)

    clf_scoring = GradientBoostingClassifier(max_iter=max_iter,
                                             scoring='neg_log_loss',
                                             n_iter_no_change=n_iter_no_change,
                                             tol=1e-4,
                                             verbose=True,
                                             random_state=0,
                                             tree_type=tree_type)
    clf_scoring.fit(X_train, y_train, eval_set=(X_val, y_val))

    clf_loss = GradientBoostingClassifier(max_iter=max_iter,
                                          scoring=None,
                                          n_iter_no_change=n_iter_no_change,
                                          tol=1e-4,
                                          verbose=True,
                                          random_state=0,
                                          tree_type=tree_type)
    clf_loss.fit(X_train, y_train, eval_set=(X_val, y_val))

    # Early stopping must have kicked in, and at the same iteration for both.
    assert n_iter_no_change < clf_loss.n_iter_ < max_iter
    assert clf_loss.n_iter_ == clf_scoring.n_iter_
def test_early_stopping_loss():
    """Check that ``scoring=None`` early-stops on the loss.

    The loss is precisely the negative log likelihood, so a classifier using
    ``scoring='neg_log_loss'`` and one using ``scoring=None`` are expected to
    stop at the same iteration.
    """
    n_samples = int(1e3)
    max_iter = 100
    n_iter_no_change = 5

    X, y = make_classification(n_samples, random_state=0)

    # Everything except ``scoring`` is shared by the two classifiers.
    shared_params = dict(
        max_iter=max_iter,
        validation_split=.1,
        n_iter_no_change=n_iter_no_change,
        tol=1e-4,
        verbose=1,
        random_state=0,
    )

    clf_scoring = GradientBoostingClassifier(scoring='neg_log_loss',
                                             **shared_params)
    clf_scoring.fit(X, y)

    clf_loss = GradientBoostingClassifier(scoring=None, **shared_params)
    clf_loss.fit(X, y)

    assert n_iter_no_change < clf_loss.n_iter_ < max_iter
    assert clf_loss.n_iter_ == clf_scoring.n_iter_
def test_one_sample_one_feature():
    """Check that fitting on one sample or one feature raises ValueError.

    Until numba issue #3569 is fixed, an informative error message is raised
    when X has only one sample or one feature in fit (it's OK in predict):
    such an array is both F and C contiguous, and numba can't compile.
    """
    expected_message = (
        'Passing only one sample or one feature is not supported yet.'
    )
    unsupported_inputs = (
        ([[1, 2]], [0]),       # a single sample
        ([[1], [2]], [0, 1]),  # a single feature
    )

    gb = GradientBoostingClassifier()
    for X, y in unsupported_inputs:
        assert_raises_regex(ValueError, expected_message, gb.fit, X, y)
def test_early_stopping_classification(data, scoring, validation_split, tol):
    """Check the number of boosting iterations with and without a scorer.

    With ``scoring=None`` the classifier is expected to run all ``max_iter``
    iterations. With any other ``scoring`` it is expected to stop early, but
    not before ``n_iter_no_change`` iterations.
    """
    max_iter = 500
    n_iter_no_change = 5
    X, y = data

    gb = GradientBoostingClassifier(
        verbose=1,  # just for coverage
        scoring=scoring,
        tol=tol,
        validation_split=validation_split,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb.fit(X, y)

    if scoring is None:
        assert gb.n_iter_ == max_iter
    else:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    """Compare pygbm and LightGBM predictions on a binary problem.

    Same as test_same_predictions_regression but for classification. A
    single boosting iteration is fit by both libraries; at least 89% of the
    predicted labels must agree and the accuracies must match. The held-out
    set is only checked for small trees on large enough datasets, with a
    looser tolerance on accuracy.
    """
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2,
                               n_features=5, n_informative=5, n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        binned = BinMapper(max_bins=max_bins).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    def check_agreement(X_eval, y_eval, decimal):
        # Labels must mostly agree and accuracies must match to `decimal`.
        pred_lightgbm = est_lightgbm.predict(X_eval)
        pred_pygbm = est_pygbm.predict(X_eval)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_eval, pred_lightgbm)
        acc_pygbm = accuracy_score(y_eval, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=decimal)

    # 7 is the default precision of np.testing.assert_almost_equal.
    check_agreement(X_train, y_train, decimal=7)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        check_agreement(X_test, y_test, decimal=2)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    """Compare pygbm and LightGBM predictions on a binary problem.

    Same as test_same_predictions_regression but for classification. Both
    libraries fit a single boosting iteration; at least 89% of the predicted
    labels must agree and the accuracies must match. The held-out set is only
    checked for small trees on large enough datasets, with a looser
    tolerance on accuracy.
    """
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2,
                               n_features=5, n_informative=5, n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        validation_split=None,
        scoring=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # (features, labels, accuracy precision); 7 is the default precision of
    # np.testing.assert_almost_equal.
    evaluation_sets = [(X_train, y_train, 7)]
    if max_leaf_nodes < 10 and n_samples >= 1000:
        evaluation_sets.append((X_test, y_test, 2))

    for X_eval, y_eval, decimal in evaluation_sets:
        pred_lightgbm = est_lightgbm.predict(X_eval)
        pred_pygbm = est_pygbm.predict(X_eval)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_eval, pred_lightgbm)
        acc_pygbm = accuracy_score(y_eval, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=decimal)
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples,
                                                    max_leaf_nodes):
    """Compare pygbm and LightGBM predictions on a 3-class problem.

    Same as test_same_predictions_regression but for multiclass
    classification. Both libraries fit a single boosting iteration. On the
    training set, and on the held-out set for small trees on large enough
    datasets, the test requires that:

    - more than 89% of the predicted labels agree,
    - more than 75% of the predicted probabilities agree up to the second
      decimal,
    - the accuracies agree up to the second decimal.

    The held-out probabilities are computed on ``X_test``, so the held-out
    branch really checks test-set probabilities rather than repeating the
    training-set check.
    """
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3,
                               n_features=5, n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=lr,
                                           validation_split=None,
                                           scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_pygbm = est_pygbm.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        # Probabilities must be evaluated on the held-out data here.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_pygbm = est_pygbm.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
def should_stop(scores, n_iter_no_change, tol, tree_type):
    """Return the early-stopping decision of a fresh classifier for `scores`.

    A ``GradientBoostingClassifier`` is built from the given
    ``n_iter_no_change``, ``tol`` and ``tree_type``, its parameters are
    validated, and the result of its ``_should_stop(scores)`` is returned.
    """
    estimator = GradientBoostingClassifier(
        n_iter_no_change=n_iter_no_change,
        tol=tol,
        tree_type=tree_type,
    )
    estimator._validate_parameters()
    decision = estimator._should_stop(scores)
    return decision
# NOTE(review): the statements down to the decorators are the tail of a
# definition whose header lies outside this view; their original indentation
# is not recoverable here -- confirm against the full file.
if (hasattr(check, 'func') and
        check.func is estimator_checks.check_classifiers_train):
    continue  # same, wrapped in a functools.partial object.
try:
    check(name, estimator)
except SkipTest as exception:
    # the only SkipTest thrown currently results from not
    # being able to import pandas.
    warnings.warn(str(exception), SkipTestWarning)


# Skipped when the NUMBA_DISABLE_JIT environment variable is set to 1.
@pytest.mark.skipif(int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1,
                    reason="Potentially long")
@pytest.mark.parametrize('Estimator', (
    GradientBoostingRegressor(),
    GradientBoostingClassifier(n_iter_no_change=None, min_samples_leaf=5),
))
def test_estimator_checks(Estimator):
    """Run ``custom_check_estimator`` on each parametrized estimator."""
    # Run the check_estimator() test suite on GBRegressor and GBClassifier.
    # Notes:
    # - Can't do early stopping with classifier because often
    #   validation_split=.1 leads to test_size=2 < n_classes and
    #   train_test_split raises an error.
    # - Also, need to set a low min_samples_leaf for
    #   check_classifiers_classes() to pass: with only 30 samples on the
    #   dataset, the root is never split with min_samples_leaf=20 and only the
    #   majority class is predicted.
    custom_check_estimator(Estimator)
def should_stop(scores, n_iter_no_change, tol):
    """Return the early-stopping decision of a fresh classifier for `scores`.

    A ``GradientBoostingClassifier`` is built from the given
    ``n_iter_no_change`` and ``tol``, and the result of its
    ``_should_stop(scores)`` is returned.
    """
    estimator = GradientBoostingClassifier(
        n_iter_no_change=n_iter_no_change,
        tol=tol,
    )
    decision = estimator._should_stop(scores)
    return decision
# NOTE(review): the statements down to the decorators are the tail of a
# definition whose header lies outside this view; their original indentation
# is not recoverable here -- confirm against the full file.
if (hasattr(check, 'func') and
        check.func is estimator_checks.check_classifiers_train):
    continue  # same, wrapped in a functools.partial object.
try:
    check(name, estimator)
except SkipTest as exception:
    # the only SkipTest thrown currently results from not
    # being able to import pandas.
    warnings.warn(str(exception), SkipTestWarning)


# Skipped when the NUMBA_DISABLE_JIT environment variable is set to 1.
@pytest.mark.skipif(int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1,
                    reason="Potentially long")
@pytest.mark.parametrize('Estimator', (
    GradientBoostingRegressor(),
    GradientBoostingClassifier(scoring=None, min_samples_leaf=5),
))
def test_estimator_checks(Estimator):
    """Run ``custom_check_estimator`` on each parametrized estimator."""
    # Run the check_estimator() test suite on GBRegressor and GBClassifier.
    # Notes:
    # - Can't do early stopping with classifier because often
    #   validation_split=.1 leads to test_size=2 < n_classes and
    #   train_test_split raises an error.
    # - Also, need to set a low min_samples_leaf for
    #   check_classifiers_classes() to pass: with only 30 samples on the
    #   dataset, the root is never split with min_samples_leaf=20 and only the
    #   majority class is predicted.
    custom_check_estimator(Estimator)
# NOTE(review): script fragment -- `df`, `target`, `subsample`, `lr`,
# `max_bins`, `n_leaf_nodes`, `n_trees` and `tree_type` are defined outside
# this view, and the last call below continues past it.

# Keep every column of `df` except the first one, as a C-contiguous array.
data = np.ascontiguousarray(df.values[:, 1:])
# Hold out 50000 rows for testing.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=50000, random_state=0)

# Optionally restrict training to the first `subsample` rows.
if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

# Warm-up: fit and predict on 100 rows with a single iteration, timed
# separately from the fit that follows.
print("JIT compiling code for the pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=1,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         n_iter_no_change=None,
                                         random_state=0, verbose=False,
                                         tree_type=tree_type)
pygbm_model.fit(data_train[:100], target_train[:100])
pygbm_model.predict(data_train[:100])  # prediction code is also jitted
toc = time()
print(f"done in {toc - tic:.3f}s")

# Timed fit with `n_trees` iterations (the call continues past this view).
print("Fitting a pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
                                         learning_rate=lr, max_iter=n_trees,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         n_iter_no_change=None,
# NOTE(review): script fragment -- `df`, `target`, `subsample`, `lr`,
# `max_bins`, `n_leaf_nodes` and `n_trees` are defined outside this view,
# and the last call below continues past it.

# Keep every column of `df` except the first one, as a C-contiguous array.
data = np.ascontiguousarray(df.values[:, 1:])
# Hold out 50000 rows for testing.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=50000, random_state=0)

# Optionally restrict training to the first `subsample` rows.
if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

# Warm-up: fit and predict on 100 rows with a single iteration, timed
# separately from the fit that follows.
print("JIT compiling code for the pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=1,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
                                         random_state=0, scoring=None,
                                         verbose=0, validation_split=None)
pygbm_model.fit(data_train[:100], target_train[:100])
pygbm_model.predict(data_train[:100])  # prediction code is also jitted
toc = time()
print(f"done in {toc - tic:.3f}s")

# Timed fit with `n_trees` iterations (the call continues past this view).
print("Fitting a pygbm model...")
tic = time()
pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
                                         learning_rate=lr, max_iter=n_trees,
                                         max_bins=max_bins,
                                         max_leaf_nodes=n_leaf_nodes,
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from pygbm import GradientBoostingClassifier

# A single RandomState is shared by data generation, the train/test split
# and both estimators, so the order of the statements below affects the
# random draws.
rng = np.random.RandomState(0)

n_samples = int(1e6)
X, y = make_classification(n_samples, random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

# First run: validation_split=.1 is passed to the estimator.
print('Early stopping on held-out validation data')
clf = GradientBoostingClassifier(max_iter=100,
                                 scoring='neg_log_loss',
                                 validation_split=.1,
                                 n_iter_no_change=5,
                                 tol=1e-4,
                                 verbose=1,
                                 random_state=rng)
clf.fit(X_train, y_train)
print(f'Early stopped at iteration {clf.n_iter_}')
print(f'Mean accuracy: {clf.score(X_test, y_test)}')

# Second run: same settings except validation_split=None.
# NOTE(review): the fit and report for this estimator lie past the end of
# this view.
print('Early stopping on training data')
clf = GradientBoostingClassifier(max_iter=100,
                                 scoring='neg_log_loss',
                                 validation_split=None,
                                 n_iter_no_change=5,
                                 tol=1e-4,
                                 verbose=1,
                                 random_state=rng)