def test_bin_mapper_idempotence(n_bins_small, n_bins_large):
    # Binning data that was already binned, with at least as many bins as
    # the first pass, must leave the binned values untouched.
    assert n_bins_large >= n_bins_small
    rng = np.random.RandomState(42)
    raw = rng.normal(size=30000).reshape(-1, 1)

    first_pass = BinMapper(max_bins=n_bins_small).fit_transform(raw)
    # The second mapper is fitted on the output of the first one.
    second_pass = BinMapper(max_bins=n_bins_large).fit_transform(first_pass)

    assert_array_equal(first_pass, second_pass)
def test_subsample():
    # Thresholds fitted with subsample=256 must differ, for every feature,
    # from the thresholds fitted without subsampling.
    full_mapper = BinMapper(subsample=None, random_state=0).fit(DATA)
    sub_mapper = BinMapper(subsample=256, random_state=0).fit(DATA)

    n_features = DATA.shape[1]
    for feature_idx in range(n_features):
        # The "almost equal" check is expected to fail.
        with pytest.raises(AssertionError):
            full_thresholds = full_mapper.numerical_thresholds_[feature_idx]
            sub_thresholds = sub_mapper.numerical_thresholds_[feature_idx]
            np.testing.assert_array_almost_equal(
                full_thresholds, sub_thresholds, decimal=3)
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # The root node may only be split when it holds at least twice
    # min_samples_leaf samples; otherwise the tree is a single leaf.
    n_bins = 255
    rng = np.random.RandomState(seed=0)

    # Linear target built from the first two of three features; the last
    # feature is irrelevant.
    X_raw = rng.normal(size=(n_samples, 3))
    target = X_raw[:, 0] - X_raw[:, 1]
    X_binned = BinMapper(max_bins=n_bins).fit_transform(X_raw)

    gradients = target.astype(np.float32)
    hessians = np.ones(shape=1, dtype=np.float32)
    grower = TreeGrower(X_binned, gradients, hessians,
                        max_bins=n_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()

    n_leaves = len(grower.finalized_leaves)
    if n_samples < min_samples_leaf * 2:
        assert n_leaves == 1
    else:
        assert n_leaves >= 2
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    # Every leaf of the grown tree must hold at least min_samples_leaf
    # samples. When there are fewer samples than that, the tree must be
    # reduced to its root.
    rng = np.random.RandomState(seed=0)

    # Linear target built from the first two of three features; the last
    # feature is irrelevant.
    features = rng.normal(size=(n_samples, 3))
    target = features[:, 0] - features[:, 1]
    if noise:
        target_scale = target.std()
        target += rng.normal(scale=noise, size=n_samples) * target_scale

    mapper = BinMapper(max_bins=n_bins)
    X_binned = mapper.fit_transform(features)

    gradients = target.astype(np.float32)
    hessians = (np.ones(shape=1, dtype=np.float32) if constant_hessian
                else np.ones_like(gradients))

    grower = TreeGrower(X_binned, gradients, hessians,
                        max_bins=n_bins,
                        shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples < min_samples_leaf:
        # Not enough samples for a single valid leaf: root only.
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
        return

    for node in predictor.nodes:
        if node['is_leaf']:
            assert node['count'] >= min_samples_leaf
def test_boston_dataset(max_bins):
    # End-to-end check on the Boston data: one tree grown on binned features
    # must reach minimal R2 scores, and predicting from raw features must
    # give the same values as predicting from binned features.
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    # Gradients and hessians of the least squares loss at initialization.
    gradients = -y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=8,
                        max_leaf_nodes=31,
                        max_bins=max_bins,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    binned_pred_train = predictor.predict_binned(X_train_binned)
    binned_pred_test = predictor.predict_binned(X_test_binned)
    assert r2_score(y_train, binned_pred_train) > 0.85
    assert r2_score(y_test, binned_pred_test) > 0.70

    raw_pred_train = predictor.predict(X_train)
    raw_pred_test = predictor.predict(X_test)
    assert_allclose(raw_pred_train, binned_pred_train)
    assert_allclose(raw_pred_test, binned_pred_test)

    assert r2_score(y_train, raw_pred_train) > 0.85
    assert r2_score(y_test, raw_pred_test) > 0.70
def test_boston_dataset():
    # End-to-end check on the Boston data with default binning: one tree must
    # reach minimal R2 scores, and predicting from raw features must give the
    # same values as predicting from binned features.
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    gradients = y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=8,
                        max_leaf_nodes=31)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    binned_pred_train = predictor.predict_binned(X_train_binned)
    binned_pred_test = predictor.predict_binned(X_test_binned)
    assert r2_score(y_train, binned_pred_train) > 0.75
    assert r2_score(y_test, binned_pred_test) > 0.65

    raw_pred_train = predictor.predict(X_train)
    raw_pred_test = predictor.predict(X_test)
    assert_allclose(raw_pred_train, binned_pred_train)
    assert_allclose(raw_pred_test, binned_pred_test)

    assert r2_score(y_train, raw_pred_train) > 0.75
    assert r2_score(y_test, raw_pred_test) > 0.65
def test_pre_binned_data():
    # Check the ValueErrors raised on misuse of a predictor: predict()
    # without numerical thresholds, predict_binned() on non-uint8 data, and
    # predict() on uint8 (binned) data.
    X, y = make_regression()

    # Gradients and hessians of the least squares loss at initialization.
    gradients = -y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    grower = TreeGrower(X_binned, gradients, hessians,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()

    # Predictor built without thresholds: only binned prediction works.
    binned_only = grower.make_predictor(numerical_thresholds=None)
    assert_raises_regex(
        ValueError,
        'This predictor does not have numerical thresholds',
        binned_only.predict, X)
    assert_raises_regex(
        ValueError,
        'binned_data dtype should be uint8',
        binned_only.predict_binned, X)
    binned_only.predict_binned(X_binned)  # No error

    # Predictor built with thresholds: predict() refuses binned input.
    with_thresholds = grower.make_predictor(
        numerical_thresholds=mapper.numerical_thresholds_)
    assert_raises_regex(
        ValueError,
        'X has uint8 dtype',
        with_thresholds.predict, X_binned)
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Compare pygbm and LightGBM predictions on an easy regression target.
    #
    # With bounded tree size and enough samples, both libraries are expected
    # to build exactly the same tree structure.
    #
    # Notes:
    # - With few samples in a node, several candidate splits may have equal
    #   gains (also because of float errors), so the structures, and hence
    #   the test-set predictions, may differ. Test-set predictions are
    #   therefore only compared when n_samples is large enough and
    #   max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned.
        binned = BinMapper(max_bins=max_bins).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingRegressor(
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # X must be treated as numerical data, not as pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    train_gap = abs(est_lightgbm.predict(X_train) - est_pygbm.predict(X_train))
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(train_gap > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        test_gap = abs(est_lightgbm.predict(X_test) - est_pygbm.predict(X_test))
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(test_gap > 1e-4) < .01
def test_n_bins_per_feature(max_bins, diff):
    # n_bins_per_feature_ must equal the number of unique values when that
    # number does not exceed max_bins, and max_bins otherwise.
    n_unique_values = max_bins + diff
    # Each distinct value appears twice, in a single-feature column.
    column = np.array(list(range(n_unique_values)) * 2).reshape(-1, 1)

    mapper = BinMapper(max_bins=max_bins).fit(column)

    expected_n_bins = min(max_bins, n_unique_values)
    assert np.all(mapper.n_bins_per_feature_ == expected_n_bins)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Binary classification counterpart of test_same_predictions_regression:
    # pygbm and LightGBM must mostly agree on predicted classes and reach the
    # same accuracy.
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2,
                               n_features=5, n_informative=5,
                               n_redundant=0, random_state=0)

    if n_samples > 255:
        # Bin the data, then cast to float32 so that the estimator does not
        # treat it as pre-binned.
        binned = BinMapper(max_bins=max_bins).fit_transform(X)
        X = binned.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # X must be treated as numerical data, not as pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    np.testing.assert_almost_equal(
        accuracy_score(y_train, pred_lightgbm),
        accuracy_score(y_train, pred_pygbm))

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        np.testing.assert_almost_equal(
            accuracy_score(y_test, pred_lightgbm),
            accuracy_score(y_test, pred_pygbm),
            decimal=2)
def test_same_predictions_easy_target(seed, n_samples, max_leaf_nodes):
    # Compare pygbm and LightGBM predictions on a very easy target.
    #
    # With bounded tree size and enough samples, both libraries are expected
    # to build exactly the same tree structure.
    #
    # Notes:
    # - With few samples in a node, several candidate splits may have equal
    #   gains (also because of float errors), so the structures, and hence
    #   the test-set predictions, may differ. Test-set predictions are
    #   therefore only compared when n_samples is large enough and
    #   max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    lb = pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    min_samples_leaf = 1  # XXX: changing this breaks the test
    max_iter = 1

    # Linear target built from the first two of five features; the other
    # three features are irrelevant.
    X = rng.normal(size=(n_samples, 5))
    y = X[:, 0] - X[:, 1]
    if n_samples > 255:
        X = BinMapper().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_lightgbm = lb.LGBMRegressor(
        n_estimators=max_iter,
        min_data_in_bin=1,
        learning_rate=1,
        min_data_in_leaf=min_samples_leaf,
        num_leaves=max_leaf_nodes)
    est_pygbm = GradientBoostingMachine(
        max_iter=max_iter,
        learning_rate=1,
        validation_split=None,
        scoring=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    np.testing.assert_array_almost_equal(
        est_lightgbm.predict(X_train),
        est_pygbm.predict(X_train),
        decimal=3)

    if max_leaf_nodes < 10 and n_samples > 1000:
        np.testing.assert_array_almost_equal(
            est_lightgbm.predict(X_test),
            est_pygbm.predict(X_test),
            decimal=3)
def test_bin_mapper_small_random_data(n_samples, n_bins):
    # With all-distinct values, sorting the bin indices by the original
    # values must give exactly 0, 1, ..., n_samples - 1.
    rng = np.random.RandomState(42)
    values = rng.normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(values)) == n_samples

    binned = BinMapper(max_bins=n_bins, random_state=42).fit_transform(values)

    assert binned.shape == values.shape
    assert binned.dtype == np.uint8

    rank_order = np.argsort(values.ravel())
    assert_array_equal(binned.ravel()[rank_order], np.arange(n_samples))
def test_plot_grower(tmpdir):
    # Plotting a grown tree must write the requested file. Skipped when
    # graphviz is not installed.
    pytest.importorskip('graphviz')
    from pygbm.plotting import plot_tree

    binned = BinMapper().fit_transform(X)
    gradients = np.asarray(y, dtype=np.float32).copy()
    hessians = np.ones(1, dtype=np.float32)

    grower = TreeGrower(binned, gradients, hessians, max_leaf_nodes=5)
    grower.grow()

    target_file = tmpdir.join('plot_grower.pdf')
    plot_tree(grower, view=False, filename=target_file)
    assert target_file.exists()
def test_bin_mapper_repeated_values_invariance(n_distinct):
    # Data made of n_distinct repeated values must be binned identically
    # whether max_bins equals n_distinct or is larger.
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    # Draw 1000 samples among the distinct values, then shuffle them.
    picks = rng.randint(low=0, high=n_distinct, size=1000)
    samples = distinct_values[picks]
    rng.shuffle(samples)
    assert_array_equal(np.unique(samples), np.sort(distinct_values))
    samples = samples.reshape(-1, 1)

    exact_mapper = BinMapper(max_bins=n_distinct)
    exact_binned = exact_mapper.fit_transform(samples)
    assert_array_equal(np.unique(exact_binned[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    wide_mapper = BinMapper(max_bins=min(256, n_distinct * 3))
    wide_binned = wide_mapper.fit_transform(samples)

    assert_allclose(exact_mapper.bin_thresholds_[0],
                    wide_mapper.bin_thresholds_[0])
    assert_array_equal(exact_binned, wide_binned)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Binary classification counterpart of test_same_predictions_regression:
    # pygbm and LightGBM must mostly agree on predicted classes and reach the
    # same accuracy.
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2,
                               n_features=5, n_informative=5,
                               n_redundant=0, random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        validation_split=None,
        scoring=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    np.testing.assert_almost_equal(
        accuracy_score(y_train, pred_lightgbm),
        accuracy_score(y_train, pred_pygbm))

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        np.testing.assert_almost_equal(
            accuracy_score(y_test, pred_lightgbm),
            accuracy_score(y_test, pred_pygbm),
            decimal=2)
def test_pre_binned_data():
    # Taking "fit on numerical data, predict on numerical data" as the
    # reference, check that:
    # - fitting on binned data and predicting on binned data gives the same
    #   predictions;
    # - fitting on numerical data and predicting on binned data gives the
    #   same predictions;
    # - fitting on binned data and predicting on numerical data raises a
    #   ValueError.
    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    X_binned = BinMapper(random_state=0).fit_transform(X)

    num_to_num = gbdt.fit(X, y).predict(X)
    binned_to_binned = gbdt.fit(X_binned, y).predict(X_binned)
    num_to_binned = gbdt.fit(X, y).predict(X_binned)

    for other in (binned_to_binned, num_to_binned):
        assert_allclose(num_to_num, other)

    assert_raises_regex(
        ValueError,
        'This estimator was fitted with pre-binned data ',
        gbdt.fit(X_binned, y).predict, X)
def test_bin_mapper_random_data(n_bins):
    # Check shape, dtype and range of the binned DATA, the shape and dtype of
    # the fitted thresholds, and that samples are spread roughly evenly
    # across bins (within 5% of the expected count per bin).
    n_samples, n_features = DATA.shape
    expected_count_per_bin = n_samples // n_bins
    tol = int(0.05 * expected_count_per_bin)

    mapper = BinMapper(max_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0),
                       np.array([n_bins - 1, n_bins - 1]))

    assert len(mapper.numerical_thresholds_) == n_features
    for thresholds in mapper.numerical_thresholds_:
        assert thresholds.shape == (n_bins - 1,)
        assert thresholds.dtype == DATA.dtype

    assert np.all(mapper.n_bins_per_feature_ == n_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        column = binned[:, feature_idx]
        for bin_idx in range(n_bins):
            n_in_bin = (column == bin_idx).sum()
            assert abs(n_in_bin - expected_count_per_bin) < tol
def fit(self, X, y):
    """Bin ``X`` and fit a sequence of trees to ``y``.

    ``X`` is binned with a ``BinMapper``; then, until
    ``self._stopping_criterion`` returns a true value or ``self.max_iter``
    trees have been grown, a ``TreeGrower`` is fitted on the current
    gradients and its predictor is appended to ``self.predictors_``.

    Parameters
    ----------
    X : array-like
        The input samples, validated by ``check_X_y`` as float32 or float64.
    y : array-like
        Target values; cast to float32 after validation.

    Returns
    -------
    self : object
    """
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    # TODO: add support for mixed-typed (numerical + categorical) data
    # TODO: add support for missing data
    # TODO: add support for pre-binned data (pass-through)?
    X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
    y = y.astype(np.float32, copy=False)
    # The same random state is shared by the binning, the train/validation
    # split and the subsampling below.
    rng = check_random_state(self.random_state)
    if self.verbose:
        print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
              flush=True)
    tic = time()
    self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
    X_binned = self.bin_mapper_.fit_transform(X)
    toc = time()
    if self.verbose:
        duration = toc - tic
        troughput = X.nbytes / duration
        print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)")
    if self.validation_split is not None:
        # NOTE(review): the split is stratified on y while the gradients
        # below are plain residuals (y - y_pred) -- confirm that stratifying
        # is intended for continuous targets.
        X_binned_train, X_binned_val, y_train, y_val = train_test_split(
            X_binned, y, test_size=self.validation_split, stratify=y,
            random_state=rng)
        # Histogram computation is faster on feature-aligned data.
        X_binned_train = np.asfortranarray(X_binned_train)
    else:
        X_binned_train, y_train = X_binned, y
        X_binned_val, y_val = None, None
    # Subsample the training set for score-based monitoring.
    subsample_size = 10000
    if X_binned_train.shape[0] < subsample_size:
        X_binned_small_train = np.ascontiguousarray(X_binned_train)
        y_small_train = y_train
    else:
        indices = rng.choice(np.arange(X_binned_train.shape[0]),
                             subsample_size)
        X_binned_small_train = X_binned_train[indices]
        y_small_train = y_train[indices]
    if self.verbose:
        print("Fitting gradient boosted rounds:")
    # TODO: plug custom loss functions
    y_pred = np.zeros_like(y_train, dtype=np.float32)
    # The initial gradients are a copy of the targets; a single-element
    # hessians array is passed to every grower.
    gradients = np.asarray(y_train, dtype=np.float32).copy()
    hessians = np.ones(1, dtype=np.float32)
    # `predictors` is a local alias of the list stored on the estimator.
    self.predictors_ = predictors = []
    self.train_scores_ = []
    if self.validation_split is not None:
        self.validation_scores_ = []
    scorer = check_scoring(self, self.scoring)
    gb_start_time = time()
    # TODO: compute training loss and use it for early stopping if no
    # validation data is provided?
    self.n_iter_ = 0
    while True:
        # The stopping criterion is evaluated before each new tree,
        # including before the first one.
        should_stop = self._stopping_criterion(gb_start_time, scorer,
                                               X_binned_small_train,
                                               y_small_train,
                                               X_binned_val, y_val)
        if should_stop or self.n_iter_ == self.max_iter:
            break
        # The first tree is not shrunk; later trees use the learning rate.
        shrinkage = 1. if self.n_iter_ == 0 else self.learning_rate
        grower = TreeGrower(X_binned_train, gradients, hessians,
                            n_bins=self.max_bins,
                            max_leaf_nodes=self.max_leaf_nodes,
                            max_depth=self.max_depth,
                            min_samples_leaf=self.min_samples_leaf,
                            shrinkage=shrinkage)
        grower.grow()
        predictor = grower.make_predictor(
            bin_thresholds=self.bin_mapper_.bin_thresholds_)
        predictors.append(predictor)
        self.n_iter_ += 1
        tic_pred = time()
        # Pass each finalized leaf's value and sample indices to
        # _update_y_pred, then recompute the gradients from the new y_pred.
        leaves_data = [(l.value, l.sample_indices)
                       for l in grower.finalized_leaves]
        _update_y_pred(leaves_data, y_pred)
        gradients = y_train - y_pred
        toc_pred = time()
        acc_prediction_time += toc_pred - tic_pred
        acc_apply_split_time += grower.total_apply_split_time
        acc_find_split_time += grower.total_find_split_time
    if self.verbose:
        duration = time() - fit_start_time
        n_leaf_nodes = sum(p.get_n_leaf_nodes() for p in self.predictors_)
        print(f"Fit {len(self.predictors_)} trees in {duration:.3f} s, "
              f"({n_leaf_nodes} total leaf nodes)")
        print('{:<32} {:.3f}s'.format('Time spent finding best splits:',
                                      acc_find_split_time))
        print('{:<32} {:.3f}s'.format('Time spent applying splits:',
                                      acc_apply_split_time))
        print('{:<32} {:.3f}s'.format('Time spent predicting:',
                                      acc_prediction_time))
    # The score lists are converted to arrays once fitting is over.
    self.train_scores_ = np.asarray(self.train_scores_)
    if self.validation_split is not None:
        self.validation_scores_ = np.asarray(self.validation_scores_)
    return self
from pygbm.binning import BinMapper
from pygbm.grower import TreeGrower
from pygbm import plotting

# Script: grow a single small tree on a synthetic binary classification
# dataset and plot it.
# NOTE(review): `np` and `make_classification` are used below but are not
# imported in this excerpt -- presumably imported above; confirm.
rng = np.random.RandomState(0)
n_samples = int(1e7)
n_leaf_nodes = 5
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                           n_informative=3, n_redundant=0, random_state=rng)

bin_mapper_ = BinMapper(random_state=rng)
X_binned = bin_mapper_.fit_transform(X)

# The class labels are used as gradients, with a single-element hessians
# array.
gradients = np.asarray(y, dtype=np.float32).copy()
hessians = np.ones(1, dtype=np.float32)

# First run to trigger the compilation of numba jit methods to avoid recording
# the compiler overhead in the profile report.
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()

# New run to collect timing statistics that will be included in the plot.
grower = TreeGrower(X_binned, gradients, hessians,
                    max_leaf_nodes=n_leaf_nodes)
grower.grow()
plotting.plot_tree(grower)
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from pygbm.binning import BinMapper
from pygbm import GradientBoostingRegressor

# Script: time the predictions of a single fitted tree on binned and raw
# data.
# NOTE(review): `np` and `time` are used below but are not imported in this
# excerpt -- presumably imported above; confirm.
n_samples = int(5e6)
X, y = make_regression(n_samples=n_samples, n_features=5)
est = GradientBoostingRegressor(max_iter=1, scoring=None,
                                validation_split=None, random_state=0)
est.fit(X, y)
# predictors_ is indexed twice: take the first entry of the first entry.
predictor = est.predictors_[0][0]

# Bin X separately from the estimator and keep a C-contiguous copy of it.
bin_mapper = BinMapper(random_state=0)
X_binned = bin_mapper.fit_transform(X)
X_binned_c = np.ascontiguousarray(X_binned)

# Warm-up calls on the first 100 rows, with Fortran-ordered and then
# non-converted input, for both predict_binned() and predict(). As the printed
# message says, this is meant to compile the predictor code before timing.
print("Compiling predictor code...")
tic = time()
predictor.predict_binned(np.asfortranarray(X_binned[:100]))
predictor.predict_binned(X_binned_c[:100])
predictor.predict(np.asfortranarray(X[:100]))
predictor.predict(X[:100])
toc = time()
print(f"done in {toc - tic:0.3f}s")

data_size = X_binned.nbytes

print("Computing predictions (F-contiguous binned data)...")
tic = time()
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for multiclass
    # classification (3 classes): pygbm and LightGBM must mostly agree on
    # predicted classes and probabilities, and reach the same accuracy.
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3,
                               n_features=5, n_informative=5,
                               n_redundant=0, n_clusters_per_class=1,
                               random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_pygbm = GradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        validation_split=None,
        scoring=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_pygbm = est_pygbm.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        # Probabilities are compared on the test set here; the training-set
        # probabilities were already checked above.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_pygbm = est_pygbm.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
def fit(self, X, y):
    """Fit the gradient boosting model.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input samples. If ``X.dtype == np.uint8``, the data is
        assumed to be pre-binned and the prediction methods
        (``predict``, ``predict_proba``) will only accept pre-binned
        data as well.
    y : array-like, shape=(n_samples,) or (n_samples, n_outputs)
        Target values. A target with more elements than rows is handled
        as multi-output, with ``y.shape[1]`` outputs.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If ``X`` has a single sample or a single feature, or if the
        train/validation split used for early stopping leaves one of
        the two sets empty.
    """
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    # TODO: add support for mixed-typed (numerical + categorical) data
    # TODO: add support for missing data

    # y is documented as array-like and has not been validated yet, so
    # inspect its shape through an array view: lists and other sequences
    # have no ``ravel`` / ``shape``.
    y_array = np.asarray(y)
    self.multi_output = y_array.size != len(y_array)
    if self.multi_output:
        self.prediction_dim = y_array.shape[1]
    else:
        self.prediction_dim = 1

    X, y = check_X_y(X, y, dtype=[np.float32, np.float64, np.uint8],
                     multi_output=self.multi_output)
    y = self._encode_y(y)
    if X.shape[0] == 1 or X.shape[1] == 1:
        raise ValueError(
            'Passing only one sample or one feature is not supported yet. '
            'See numba issue #3569.')
    # The same random state is shared by the binning, the train/validation
    # split and the subsampling below.
    rng = check_random_state(self.random_state)

    self._validate_parameters(X)

    self.n_features_ = X.shape[1]  # used for validation in predict()

    if X.dtype == np.uint8:  # data is pre-binned
        if self.verbose:
            print("X is pre-binned.")
        X_binned = X
        self.bin_mapper_ = None
        numerical_thresholds = None
        # NOTE(review): this is the largest bin *index* per feature, whereas
        # the BinMapper branch below provides ``n_bins_per_feature_`` --
        # confirm that TreeGrower tolerates the difference.
        n_bins_per_feature = X.max(axis=0).astype(np.uint32)
    else:
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins,
                                     random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        numerical_thresholds = self.bin_mapper_.numerical_thresholds_
        n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
        toc = time()

        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

    self.loss_ = self._get_loss()

    do_early_stopping = (self.n_iter_no_change is not None and
                         self.n_iter_no_change > 0)

    if do_early_stopping and self.validation_split is not None:
        # stratify for classification
        stratify = y if hasattr(self.loss_, 'predict_proba') else None

        X_binned_train, X_binned_val, y_train, y_val = train_test_split(
            X_binned, y, test_size=self.validation_split,
            stratify=stratify, random_state=rng)
        if X_binned_train.size == 0 or X_binned_val.size == 0:
            raise ValueError(
                f'Not enough data (n_samples={X_binned.shape[0]}) to '
                f'perform early stopping with validation_split='
                f'{self.validation_split}. Use more training data or '
                f'adjust validation_split.')
        # Predicting is faster on C-contiguous arrays, training is faster
        # on Fortran arrays.
        X_binned_val = np.ascontiguousarray(X_binned_val)
        X_binned_train = np.asfortranarray(X_binned_train)
    else:
        X_binned_train, y_train = X_binned, y
        X_binned_val, y_val = None, None

    # Subsample the training set for score-based monitoring.
    if do_early_stopping:
        subsample_size = 10000
        n_samples_train = X_binned_train.shape[0]
        if n_samples_train > subsample_size:
            indices = rng.choice(X_binned_train.shape[0], subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]
        else:
            X_binned_small_train = X_binned_train
            y_small_train = y_train
        # Predicting is faster on C-contiguous arrays.
        X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

    if self.verbose:
        print("Fitting gradient boosted rounds:")

    n_samples = X_binned_train.shape[0]
    self.baseline_prediction_ = self.loss_.get_baseline_prediction(
        y_train, self.prediction_dim)
    # raw_predictions are the accumulated values predicted by the trees
    # for the training data.
    raw_predictions = np.zeros(
        shape=(n_samples, self.prediction_dim),
        dtype=self.baseline_prediction_.dtype)
    if not self.multi_output:
        raw_predictions = raw_predictions.ravel()
    raw_predictions += self.baseline_prediction_

    # gradients and hessians are 1D arrays of size
    # n_samples * n_trees_per_iteration
    gradients, hessians = self.loss_.init_gradients_and_hessians(
        n_samples=n_samples, prediction_dim=self.prediction_dim)
    if not self.multi_output:
        gradients = gradients.ravel()

    # predictors_ is a matrix of TreePredictor objects with shape
    # (n_iter_, n_trees_per_iteration)
    self.predictors_ = predictors = []

    # scorer_ is a callable with signature (est, X, y) and calls
    # est.predict() or est.predict_proba() depending on its nature.
    self.scorer_ = check_scoring(self, self.scoring)
    self.train_scores_ = []
    self.validation_scores_ = []
    if do_early_stopping:
        # Add predictions of the initial model (before the first tree)
        self.train_scores_.append(
            self._get_scores(X_binned_train, y_train))

        if self.validation_split is not None:
            self.validation_scores_.append(
                self._get_scores(X_binned_val, y_val))

    for iteration in range(self.max_iter):

        if self.verbose:
            iteration_start_time = time()
            print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                  flush=True)

        # Update gradients and hessians, inplace
        self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                 y_train, raw_predictions)

        predictors.append([])

        if self.multi_output:
            proj_gradients, proj_hessians = (
                self.randomly_project_gradients_and_hessians(
                    gradients, hessians))
        else:
            proj_gradients, proj_hessians = (gradients.ravel(),
                                             hessians.ravel())

        # Build `n_trees_per_iteration` trees.
        gradient_chunks = np.array_split(proj_gradients,
                                         self.n_trees_per_iteration_)
        hessian_chunks = np.array_split(proj_hessians,
                                        self.n_trees_per_iteration_)
        for gradients_at_k, hessians_at_k in zip(gradient_chunks,
                                                 hessian_chunks):
            # the xxxx_at_k arrays are **views** on the original arrays.
            # Note that for binary classif and regressions,
            # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
            # whole array.

            grower = TreeGrower(
                X_binned_train, gradients_at_k, hessians_at_k,
                max_bins=self.max_bins,
                n_bins_per_feature=n_bins_per_feature,
                max_leaf_nodes=self.max_leaf_nodes,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                l2_regularization=self.l2_regularization,
                shrinkage=self.learning_rate)
            grower.grow()

            if self.multi_output:
                # The tree was grown on projected gradients: recompute a
                # per-output leaf value from the full gradients of the
                # samples falling in each leaf.
                for leaf in grower.finalized_leaves:
                    leaf.residual = (
                        -self.learning_rate
                        * np.sum(a=gradients[leaf.sample_indices, :], axis=0)
                        / (leaf.sum_hessians + self.l2_regularization
                           + np.finfo(np.float64).eps))
                leaves_data = [(leaf.residual, leaf.sample_indices)
                               for leaf in grower.finalized_leaves]
            else:
                leaves_data = [(leaf.value, leaf.sample_indices)
                               for leaf in grower.finalized_leaves]

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time

            predictor = grower.make_predictor(numerical_thresholds)
            predictors[-1].append(predictor)

            tic_pred = time()

            # prepare leaves_data so that _update_raw_predictions can be
            # @njitted
            _update_raw_predictions(leaves_data, raw_predictions)
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

        should_early_stop = False
        if do_early_stopping:
            should_early_stop = self._check_early_stopping(
                X_binned_small_train, y_small_train,
                X_binned_val, y_val)

        if self.verbose:
            self._print_iteration_stats(iteration_start_time,
                                        do_early_stopping)

        if should_early_stop:
            break

    if self.verbose:
        duration = time() - fit_start_time
        n_total_leaves = sum(
            predictor.get_n_leaf_nodes()
            for predictors_at_ith_iteration in self.predictors_
            for predictor in predictors_at_ith_iteration)
        n_predictors = sum(
            len(predictors_at_ith_iteration)
            for predictors_at_ith_iteration in self.predictors_)
        print(f"Fit {n_predictors} trees in {duration:.3f} s, "
              f"({n_total_leaves} total leaves)")
        print(f"{'Time spent finding best splits:':<32} "
              f"{acc_find_split_time:.3f}s")
        print(f"{'Time spent applying splits:':<32} "
              f"{acc_apply_split_time:.3f}s")
        print(f"{'Time spent predicting:':<32} "
              f"{acc_prediction_time:.3f}s")

    # The score lists are converted to arrays once fitting is over.
    self.train_scores_ = np.asarray(self.train_scores_)
    self.validation_scores_ = np.asarray(self.validation_scores_)
    return self
def fit(self, X, y):
    """Fit the gradient boosting model.

    The input is validated, binned with a ``BinMapper``, optionally split
    into a training and a validation part (only when both ``scoring`` and
    ``validation_split`` are set), and then up to ``max_iter`` boosting
    iterations are run, each growing ``n_trees_per_iteration_`` trees.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input samples.
    y : array-like, shape=(n_samples,)
        Target values.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If ``X`` has exactly one sample or exactly one feature, or if the
        train / validation split leaves one of the two parts empty.

    Notes
    -----
    Sets ``n_features_``, ``bin_mapper_``, ``loss_``, ``predictors_`` and
    ``train_scores_``. ``validation_scores_`` is only set when both
    ``scoring`` and ``validation_split`` are not None.
    """
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    # TODO: add support for mixed-typed (numerical + categorical) data
    # TODO: add support for missing data
    # TODO: add support for pre-binned data (pass-through)?
    # TODO: test input checking
    X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
    y = self._encode_y(y)
    if X.shape[0] == 1 or X.shape[1] == 1:
        raise ValueError(
            'Passing only one sample or one feature is not supported yet. '
            'See numba issue #3569.'
        )
    rng = check_random_state(self.random_state)

    self._validate_parameters()
    self.n_features_ = X.shape[1]  # used for validation in predict()

    if self.verbose:
        print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
              flush=True)
    tic = time()
    self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
    X_binned = self.bin_mapper_.fit_transform(X)
    toc = time()
    if self.verbose:
        duration = toc - tic
        # The clock may report no elapsed time for tiny inputs (coarse
        # timer resolution); do not crash on a purely informative message.
        throughput = X.nbytes / duration if duration > 0 else float('inf')
        print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

    self.loss_ = self._get_loss()

    use_validation = (self.scoring is not None and
                      self.validation_split is not None)
    if use_validation:
        # stratify for classification
        stratify = y if hasattr(self.loss_, 'predict_proba') else None

        X_binned_train, X_binned_val, y_train, y_val = train_test_split(
            X_binned, y, test_size=self.validation_split,
            stratify=stratify, random_state=rng)
        if X_binned_train.size == 0 or X_binned_val.size == 0:
            raise ValueError(
                f'Not enough data (n_samples={X_binned.shape[0]}) to '
                f'perform early stopping with validation_split='
                f'{self.validation_split}. Use more training data or '
                f'adjust validation_split.'
            )
        # Histogram computation is faster on feature-aligned data.
        X_binned_train = np.asfortranarray(X_binned_train)
    else:
        X_binned_train, y_train = X_binned, y
        X_binned_val, y_val = None, None

    n_samples = X_binned_train.shape[0]

    # Subsample the training set for score-based monitoring.
    subsample_size = 10000
    if n_samples < subsample_size:
        X_binned_small_train = np.ascontiguousarray(X_binned_train)
        y_small_train = y_train
    else:
        # Draw distinct rows: sampling with replacement (the default of
        # ``choice``) would put duplicated samples in the monitoring set.
        indices = rng.choice(np.arange(n_samples), subsample_size,
                             replace=False)
        X_binned_small_train = X_binned_train[indices]
        y_small_train = y_train[indices]

    if self.verbose:
        print("Fitting gradient boosted rounds:")

    # values predicted by the trees. Used as-is in regression, and
    # transformed into probas and / or classes for classification
    raw_predictions = np.zeros(
        shape=(n_samples, self.n_trees_per_iteration_),
        dtype=y_train.dtype
    )

    # gradients and hessians are 1D arrays of size
    # n_samples * n_trees_per_iteration
    gradients, hessians = self.loss_.init_gradients_and_hessians(
        n_samples=n_samples,
        n_trees_per_iteration=self.n_trees_per_iteration_
    )

    # predictors_ is a matrix of TreePredictor objects with shape
    # (n_iter_, n_trees_per_iteration)
    self.predictors_ = predictors = []

    scorer = check_scoring(self, self.scoring)
    self.train_scores_ = []
    if self.scoring is not None:
        # Add predictions of the initial model (before the first tree)
        predicted_train = self._predict_binned(X_binned_train)
        score_train = scorer._sign * scorer._score_func(y_train,
                                                        predicted_train)
        self.train_scores_.append(score_train)

        if self.validation_split is not None:
            self.validation_scores_ = []
            predicted_val = self._predict_binned(X_binned_val)
            score_val = scorer._sign * scorer._score_func(y_val,
                                                          predicted_val)
            self.validation_scores_.append(score_val)

    # Loop-invariant binning metadata, looked up once for all the trees.
    n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
    bin_thresholds = self.bin_mapper_.bin_thresholds_

    for iteration in range(self.max_iter):

        if self.verbose:
            iteration_start_time = time()
            print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                  flush=True)

        # Update gradients and hessians, inplace
        self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                 y_train, raw_predictions)

        predictors.append([])

        # Build `n_trees_per_iteration` trees.
        for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                np.array_split(gradients, self.n_trees_per_iteration_),
                np.array_split(hessians, self.n_trees_per_iteration_))):
            # the xxxx_at_k arrays are **views** on the original arrays.
            # Note that for binary classif and regressions,
            # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
            # whole array.

            grower = TreeGrower(
                X_binned_train, gradients_at_k, hessians_at_k,
                max_bins=self.max_bins,
                n_bins_per_feature=n_bins_per_feature,
                max_leaf_nodes=self.max_leaf_nodes,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                l2_regularization=self.l2_regularization,
                shrinkage=self.learning_rate)
            grower.grow()

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time

            predictor = grower.make_predictor(
                bin_thresholds=bin_thresholds)
            predictors[-1].append(predictor)

            tic_pred = time()

            # prepare leaves_data so that _update_raw_predictions can be
            # @njitted
            leaves_data = [(leaf.value, leaf.sample_indices)
                           for leaf in grower.finalized_leaves]
            _update_raw_predictions(leaves_data, raw_predictions[:, k])
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

        should_stop = self._check_early_stopping(
            scorer, X_binned_small_train, y_small_train,
            X_binned_val, y_val)

        if self.verbose:
            self._print_iteration_stats(iteration_start_time)

        if should_stop:
            break

    if self.verbose:
        duration = time() - fit_start_time
        n_total_leaves = sum(
            predictor.get_n_leaf_nodes()
            for predictors_at_ith_iteration in self.predictors_
            for predictor in predictors_at_ith_iteration)
        n_predictors = sum(
            len(predictors_at_ith_iteration)
            for predictors_at_ith_iteration in self.predictors_)
        print(f"Fit {n_predictors} trees in {duration:.3f} s, "
              f"({n_total_leaves} total leaves)")
        print(f"{'Time spent finding best splits:':<32} "
              f"{acc_find_split_time:.3f}s")
        print(f"{'Time spent applying splits:':<32} "
              f"{acc_apply_split_time:.3f}s")
        print(f"{'Time spent predicting:':<32} "
              f"{acc_prediction_time:.3f}s")

    self.train_scores_ = np.asarray(self.train_scores_)
    if use_validation:
        self.validation_scores_ = np.asarray(self.validation_scores_)
    return self
def test_bin_mapper_identity_small(n_bins, scale, offset):
    """Check that binning a scaled and shifted ``arange`` gives back ranks.

    The data is ``arange(n_bins) * scale + offset`` as a single column; the
    binned output of a mapper with ``max_bins=n_bins`` is expected to equal
    the ``arange(n_bins)`` column itself.
    """
    expected_bins = np.arange(n_bins).reshape(-1, 1)
    data = expected_bins * scale + offset

    mapper = BinMapper(max_bins=n_bins)
    binned = mapper.fit_transform(data)

    assert_array_equal(binned, expected_bins)
def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier):
    """Check that repeated small integer values are binned to themselves.

    The single-column input is the sequence ``0 .. n_distinct - 1`` repeated
    ``multiplier`` times; the binned output must equal the input.
    """
    repeated_values = [value
                       for _ in range(multiplier)
                       for value in range(n_distinct)]
    data = np.array(repeated_values).reshape(-1, 1)

    mapper = BinMapper(max_bins=n_bins)
    binned = mapper.fit_transform(data)

    assert_array_equal(data, binned)