def test_boston_dataset(max_bins):
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, max_bins=max_bins,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.85
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.70

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))
    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
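
# The gradients are initialized to -y and the hessians to a constant 1 because,
# for the least-squares loss L(y, raw) = 0.5 * (y - raw)**2 with an initial
# prediction of 0, dL/draw = raw - y = -y and d2L/draw2 = 1. A minimal sketch of
# that derivation, assuming the 0.5-scaled squared error (which is what the
# constant hessian above implies):

def least_squares_derivatives(y_true, raw_predictions):
    """Gradient and hessian of 0.5 * (y - raw)**2 with respect to raw."""
    gradients = raw_predictions - y_true   # equals -y when raw_predictions == 0
    hessians = np.ones_like(y_true)        # constant, so a size-1 array suffices
    return gradients.astype(np.float32), hessians.astype(np.float32)

# With a zero initial prediction this reproduces the init used in the test:
_y = np.array([3., -1., 2.])
_g, _h = least_squares_derivatives(_y, np.zeros_like(_y))
assert np.allclose(_g, -_y) and np.all(_h == 1)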
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale

    mapper = BinMapper(max_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    if constant_hessian:
        all_hessians = np.ones(shape=1, dtype=np.float32)
    else:
        all_hessians = np.ones_like(all_gradients)

    grower = TreeGrower(X, all_gradients, all_hessians,
                        max_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
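
# Tests such as test_min_samples_leaf take their inputs as arguments, so the
# test module presumably drives them with pytest.mark.parametrize decorators
# that are not shown in this excerpt. A minimal, self-contained sketch of the
# mechanism (the parameter values below are illustrative assumptions, not the
# project's actual grids):

import pytest

@pytest.mark.parametrize('n_samples, min_samples_leaf', [(1000, 5), (100, 200)])
@pytest.mark.parametrize('noise', [0.0, 0.1])
def test_parametrize_sketch(n_samples, min_samples_leaf, noise):
    # Each combination of parameter values becomes an independent test case.
    assert n_samples > 0 and min_samples_leaf > 0 and noise >= 0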
def test_boston_dataset():
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    gradients = y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.75
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.65

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))
    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.75
    assert r2_score(y_test, predictor.predict(X_test)) > 0.65
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure the root node isn't split if n_samples is not at least twice
    # min_samples_leaf.
    rng = np.random.RandomState(seed=0)

    max_bins = 255

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]

    mapper = BinMapper(max_bins=max_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    all_hessians = np.ones(shape=1, dtype=np.float32)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        max_bins=max_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
def test_pre_binned_data():
    # Make sure ValueError is raised when predictor.predict() is called while
    # the predictor does not have any numerical thresholds.
    X, y = make_regression()

    # Init gradients and hessians to that of least squares loss
    gradients = -y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    grower = TreeGrower(X_binned, gradients, hessians,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()
    predictor = grower.make_predictor(numerical_thresholds=None)

    assert_raises_regex(
        ValueError,
        'This predictor does not have numerical thresholds',
        predictor.predict, X)

    assert_raises_regex(
        ValueError,
        'binned_data dtype should be uint8',
        predictor.predict_binned, X)

    predictor.predict_binned(X_binned)  # No error

    predictor = grower.make_predictor(
        numerical_thresholds=mapper.numerical_thresholds_)
    assert_raises_regex(
        ValueError,
        'X has uint8 dtype',
        predictor.predict, X_binned)
def test_bin_mapper_small_random_data(n_samples, n_bins):
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    mapper = BinMapper(max_bins=n_bins, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8
    assert_array_equal(binned.ravel()[np.argsort(data.ravel())],
                       np.arange(n_samples))
def test_subsample():
    # Make sure bin thresholds are different when applying subsampling.
    mapper_no_subsample = BinMapper(subsample=None, random_state=0).fit(DATA)
    mapper_subsample = BinMapper(subsample=256, random_state=0).fit(DATA)

    for feature in range(DATA.shape[1]):
        with pytest.raises(AssertionError):
            np.testing.assert_array_almost_equal(
                mapper_no_subsample.numerical_thresholds_[feature],
                mapper_subsample.numerical_thresholds_[feature],
                decimal=3)
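
# Subsampling changes the thresholds because the bin edges are estimated from
# the quantiles of the (possibly subsampled) feature values. A minimal sketch
# of that idea, assuming plain quantile-based binning; the actual BinMapper
# implementation may differ in details such as midpoint interpolation:

def quantile_thresholds(values, max_bins=256, subsample=None, seed=0):
    """Approximate bin edges as evenly spaced quantiles of the data."""
    rng = np.random.RandomState(seed)
    if subsample is not None and values.shape[0] > subsample:
        values = rng.choice(values, subsample, replace=False)
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    return np.percentile(values, percentiles)

_rng = np.random.RandomState(0)
_x = _rng.normal(size=100_000)
_full = quantile_thresholds(_x, subsample=None)
_sub = quantile_thresholds(_x, subsample=256)
# The two sets of thresholds are close but generally not identical, which is
# what test_subsample checks for.
print(np.max(np.abs(_full - _sub)))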
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                          max_bins=max_bins,
                                          learning_rate=1,
                                          n_iter_no_change=None,
                                          min_samples_leaf=min_samples_leaf,
                                          max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01
def test_n_bins_per_feature(max_bins, diff):
    # Check that n_bins_per_feature is n_unique_values when
    # n_unique_values <= max_bins, else max_bins.
    n_unique_values = max_bins + diff
    X = list(range(n_unique_values)) * 2
    X = np.array(X).reshape(-1, 1)
    mapper = BinMapper(max_bins=max_bins).fit(X)
    assert np.all(mapper.n_bins_per_feature_ == min(max_bins, n_unique_values))
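
# The attribute checked above records, per feature, how many bins were actually
# used: a feature with fewer unique values than max_bins gets one bin per
# distinct value. A small usage sketch (the printed values are what the test
# above implies, not independently verified here):

from pygbm.binning import BinMapper

_X = np.c_[np.repeat(np.arange(10), 20),                    # 10 unique values
           np.random.RandomState(0).normal(size=200)]       # continuous feature

_mapper = BinMapper(max_bins=64).fit(_X)
print(_mapper.n_bins_per_feature_)  # expected along the lines of [10, 64]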
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           n_iter_no_change=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
def test_same_predictions_easy_target(seed, n_samples, max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for very easy targets.
    #
    # In particular, when the size of the trees is bounded and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.

    lb = pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    min_samples_leaf = 1  # XXX: changing this breaks the test
    max_iter = 1

    # data = linear target, 5 features, 3 irrelevant.
    X = rng.normal(size=(n_samples, 5))
    y = X[:, 0] - X[:, 1]
    if n_samples > 255:
        X = BinMapper().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_lightgbm = lb.LGBMRegressor(n_estimators=max_iter,
                                    min_data_in_bin=1,
                                    learning_rate=1,
                                    min_data_in_leaf=min_samples_leaf,
                                    num_leaves=max_leaf_nodes)
    est_pygbm = GradientBoostingMachine(max_iter=max_iter,
                                        learning_rate=1,
                                        validation_split=None, scoring=None,
                                        min_samples_leaf=min_samples_leaf,
                                        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=3)

    if max_leaf_nodes < 10 and n_samples > 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=3)
def test_plot_grower(tmpdir):
    pytest.importorskip('graphviz')
    from pygbm.plotting import plot_tree

    X_binned = BinMapper().fit_transform(X)
    gradients = np.asarray(y, dtype=np.float32).copy()
    hessians = np.ones(1, dtype=np.float32)
    grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=5)
    grower.grow()
    filename = tmpdir.join('plot_grower.pdf')
    plot_tree(grower, view=False, filename=filename)
    assert filename.exists()
def test_bin_mapper_idempotence(n_bins_small, n_bins_large):
    assert n_bins_large >= n_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = BinMapper(max_bins=n_bins_small)
    mapper_large = BinMapper(max_bins=n_bins_large)
    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           validation_split=None, scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
def test_pre_binned_data():
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.

    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    fit_num_pred_num = gbdt.fit(X, y).predict(X)
    fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned)
    fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(fit_num_pred_num, fit_binned_pred_binned)
    assert_allclose(fit_num_pred_num, fit_num_pred_binned)

    assert_raises_regex(
        ValueError,
        'This estimator was fitted with pre-binned data ',
        gbdt.fit(X_binned, y).predict, X)
def test_bin_mapper_random_data(n_bins):
    n_samples, n_features = DATA.shape

    expected_count_per_bin = n_samples // n_bins
    tol = int(0.05 * expected_count_per_bin)

    mapper = BinMapper(max_bins=n_bins, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.array([0, 0]))
    assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1]))
    assert len(mapper.numerical_thresholds_) == n_features
    for i in range(len(mapper.numerical_thresholds_)):
        assert mapper.numerical_thresholds_[i].shape == (n_bins - 1,)
        assert mapper.numerical_thresholds_[i].dtype == DATA.dtype
    assert np.all(mapper.n_bins_per_feature_ == n_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        for bin_idx in range(n_bins):
            count = (binned[:, feature_idx] == bin_idx).sum()
            assert abs(count - expected_count_per_bin) < tol
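
# The test expects n_bins - 1 thresholds per feature: with quantile-like
# thresholds, mapping each value to the number of thresholds below it yields
# bin indices in [0, n_bins - 1] with roughly equal counts. A minimal sketch of
# that mapping, assuming sorted thresholds (the real transform is a compiled
# routine; this is only illustrative):

def bin_one_feature(values, thresholds):
    """Map each value to a bin index in [0, len(thresholds)]."""
    # searchsorted(..., side='right') counts how many thresholds are <= value,
    # which is the bin index when the thresholds are the bin upper edges.
    return np.searchsorted(thresholds, values, side='right').astype(np.uint8)

_rng = np.random.RandomState(42)
_x = _rng.normal(size=10_000)
_thresholds = np.percentile(_x, np.linspace(0, 100, 9)[1:-1])  # 7 edges -> 8 bins
_binned = bin_one_feature(_x, _thresholds)
print(np.bincount(_binned))  # roughly 10_000 / 8 samples per bin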
def test_bin_mapper_repeated_values_invariance(n_distinct):
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[repeated_indices]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))

    data = data.reshape(-1, 1)

    mapper_1 = BinMapper(max_bins=n_distinct)
    binned_1 = mapper_1.fit_transform(data)
    assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    mapper_2 = BinMapper(max_bins=min(256, n_distinct * 3))
    binned_2 = mapper_2.fit_transform(data)

    assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
    assert_array_equal(binned_1, binned_2)
class GradientBoostingMachine(BaseEstimator, RegressorMixin):

    def __init__(self, learning_rate=0.1, max_iter=100, max_leaf_nodes=31,
                 max_depth=None, min_samples_leaf=20, l2_regularization=0.,
                 max_bins=255, max_no_improvement=5, validation_split=0.1,
                 scoring='neg_mean_squared_error', tol=1e-7, verbose=0,
                 random_state=None):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.max_no_improvement = max_no_improvement
        self.validation_split = validation_split
        self.scoring = scoring
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def fit(self, X, y):
        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = y.astype(np.float32, copy=False)
        rng = check_random_state(self.random_state)
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                  end="", flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        if self.validation_split is not None:
            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=y, random_state=rng)
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(np.arange(X_binned_train.shape[0]),
                                 subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        # TODO: plug custom loss functions
        y_pred = np.zeros_like(y_train, dtype=np.float32)
        gradients = np.asarray(y_train, dtype=np.float32).copy()
        hessians = np.ones(1, dtype=np.float32)
        self.predictors_ = predictors = []
        self.train_scores_ = []
        if self.validation_split is not None:
            self.validation_scores_ = []
        scorer = check_scoring(self, self.scoring)
        gb_start_time = time()
        # TODO: compute training loss and use it for early stopping if no
        # validation data is provided?
        self.n_iter_ = 0
        while True:
            should_stop = self._stopping_criterion(
                gb_start_time, scorer, X_binned_small_train, y_small_train,
                X_binned_val, y_val)
            if should_stop or self.n_iter_ == self.max_iter:
                break
            shrinkage = 1. if self.n_iter_ == 0 else self.learning_rate
            grower = TreeGrower(X_binned_train, gradients, hessians,
                                n_bins=self.max_bins,
                                max_leaf_nodes=self.max_leaf_nodes,
                                max_depth=self.max_depth,
                                min_samples_leaf=self.min_samples_leaf,
                                shrinkage=shrinkage)
            grower.grow()
            predictor = grower.make_predictor(
                bin_thresholds=self.bin_mapper_.bin_thresholds_)
            predictors.append(predictor)
            self.n_iter_ += 1

            tic_pred = time()
            leaves_data = [(l.value, l.sample_indices)
                           for l in grower.finalized_leaves]
            _update_y_pred(leaves_data, y_pred)
            gradients = y_train - y_pred
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time

        if self.verbose:
            duration = time() - fit_start_time
            n_leaf_nodes = sum(p.get_n_leaf_nodes() for p in self.predictors_)
            print(f"Fit {len(self.predictors_)} trees in {duration:.3f} s, "
                  f"({n_leaf_nodes} total leaf nodes)")
            print('{:<32} {:.3f}s'.format('Time spent finding best splits:',
                                          acc_find_split_time))
            print('{:<32} {:.3f}s'.format('Time spent applying splits:',
                                          acc_apply_split_time))
            print('{:<32} {:.3f}s'.format('Time spent predicting:',
                                          acc_prediction_time))
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self

    def predict(self, X):
        # TODO: check input / check_fitted
        # TODO: make predictor behave correctly on pre-binned data
        # TODO: handle classification and output class labels in this case
        predicted = np.zeros(X.shape[0], dtype=np.float32)
        for predictor in self.predictors_:
            predicted += predictor.predict(X)
        return predicted

    def _predict_binned(self, X_binned):
        predicted = np.zeros(X_binned.shape[0], dtype=np.float32)
        for predictor in self.predictors_:
            predicted += predictor.predict_binned(X_binned)
        return predicted

    def _stopping_criterion(self, start_time, scorer, X_binned_train, y_train,
                            X_binned_val, y_val):
        log_msg = f"[{self.n_iter_}/{self.max_iter}]"

        if self.scoring is not None:
            # TODO: make sure that self.predict can work on binned data and
            # then only use the public scorer.__call__.
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._score_func(y_train, predicted_train)
            self.train_scores_.append(score_train)
            log_msg += f" {self.scoring} train: {score_train:.5f},"

            if self.validation_split is not None:
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._score_func(y_val, predicted_val)
                self.validation_scores_.append(score_val)
                log_msg += f", {self.scoring} val: {score_val:.5f},"

        if self.n_iter_ > 0:
            iteration_time = (time() - start_time) / self.n_iter_
            predictor_nodes = self.predictors_[-1].nodes
            max_depth = predictor_nodes['depth'].max()
            n_leaf_nodes = predictor_nodes['is_leaf'].sum()
            log_msg += (f" {n_leaf_nodes} leaf nodes, max depth {max_depth}"
                        f" in {iteration_time:0.3f}s")

        if self.verbose:
            print(log_msg)

        if self.validation_split is not None:
            return self._should_stop(self.validation_scores_)
        else:
            return self._should_stop(self.train_scores_)

    def _should_stop(self, scores):
        if (len(scores) == 0 or
                (self.max_no_improvement
                 and len(scores) < self.max_no_improvement)):
            return False
        context_scores = scores[-self.max_no_improvement:]
        candidate = scores[-self.max_no_improvement]
        tol = 0. if self.tol is None else self.tol
        # sklearn scores: higher is always better.
        best_with_tol = max(context_scores) * (1 - tol)
        return candidate >= best_with_tol
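
# A minimal usage sketch of this early prototype estimator (defined above);
# the data set, split and hyper-parameter values below are illustrative only:

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

_X, _y = make_regression(n_samples=10_000, n_features=5, random_state=0)
_X_train, _X_val, _y_train, _y_val = train_test_split(_X, _y, random_state=0)

_gbm = GradientBoostingMachine(max_iter=20, learning_rate=0.1,
                               validation_split=0.1, verbose=1,
                               random_state=0)
_gbm.fit(_X_train, _y_train)
print(_gbm.n_iter_, _gbm.predict(_X_val)[:5])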
import numpy as np
from sklearn.datasets import make_classification
from pygbm.binning import BinMapper
from pygbm.grower import TreeGrower
from pygbm import plotting

rng = np.random.RandomState(0)

n_samples = int(1e7)
n_leaf_nodes = 5
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                           n_informative=3, n_redundant=0, random_state=rng)

bin_mapper_ = BinMapper(random_state=rng)
X_binned = bin_mapper_.fit_transform(X)

gradients = np.asarray(y, dtype=np.float32).copy()
hessians = np.ones(1, dtype=np.float32)

# First run to trigger the compilation of numba jit methods to avoid recording
# the compiler overhead in the profile report.
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()

# New run to collect timing statistics that will be included in the plot.
grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes)
grower.grow()

plotting.plot_tree(grower)
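
# The warm-up run above exists because numba JIT-compiles the grower's kernels
# on first use. A rough sketch of how to make that overhead visible, timing
# both runs with the standard-library time module:

from time import time

_tic = time()
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()
print(f"first grow (includes JIT compilation): {time() - _tic:.3f}s")

_tic = time()
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()
print(f"second grow (already compiled): {time() - _tic:.3f}s")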
from time import time

import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from pygbm.binning import BinMapper
from pygbm import GradientBoostingRegressor

n_samples = int(5e6)

X, y = make_regression(n_samples=n_samples, n_features=5)
est = GradientBoostingRegressor(max_iter=1, scoring=None,
                                validation_split=None, random_state=0)
est.fit(X, y)
predictor = est.predictors_[0][0]

bin_mapper = BinMapper(random_state=0)
X_binned = bin_mapper.fit_transform(X)
X_binned_c = np.ascontiguousarray(X_binned)

print("Compiling predictor code...")
tic = time()
predictor.predict_binned(np.asfortranarray(X_binned[:100]))
predictor.predict_binned(X_binned_c[:100])
predictor.predict(np.asfortranarray(X[:100]))
predictor.predict(X[:100])
toc = time()
print(f"done in {toc - tic:0.3f}s")

data_size = X_binned.nbytes

print("Computing predictions (F-contiguous binned data)...")
tic = time()
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        X = BinMapper(max_bins=max_bins).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                           max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=lr,
                                           validation_split=None, scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)

    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    assert np.mean(pred_pygbm == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_pygbm = est_pygbm.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

    acc_lgbm = accuracy_score(y_train, pred_lightgbm)
    acc_pygbm = accuracy_score(y_train, pred_pygbm)
    np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        assert np.mean(pred_pygbm == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_pygbm = est_pygbm.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75

        acc_lgbm = accuracy_score(y_test, pred_lightgbm)
        acc_pygbm = accuracy_score(y_test, pred_pygbm)
        np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2)
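
# get_lightgbm_estimator is a test helper that builds a LightGBM estimator
# mirroring the pygbm hyper-parameters; its implementation is not shown in this
# excerpt. A rough, hypothetical sketch of the kind of mapping it has to
# perform (the LightGBM parameter names below are standard, but the precise
# translation used by pygbm's helper may differ):

import lightgbm as lb

def get_lightgbm_estimator_sketch(pygbm_estimator):
    """Illustrative mapping from pygbm parameters to LightGBM ones."""
    params = pygbm_estimator.get_params()
    return lb.LGBMRegressor(
        n_estimators=params['max_iter'],
        learning_rate=params['learning_rate'],
        num_leaves=params['max_leaf_nodes'],
        min_child_samples=params['min_samples_leaf'],
        max_bin=params['max_bins'],
        min_data_in_bin=1,
    )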
class BaseGradientBoostingMachine(BaseEstimator, ABC):
    """Base class for gradient boosting estimators."""

    multi_output = False
    prediction_dim = 1

    @abstractmethod
    def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
                 max_depth, min_samples_leaf, l2_regularization, max_bins,
                 scoring, validation_split, n_iter_no_change, tol, verbose,
                 random_state):
        self.loss = loss
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_iter_no_change = n_iter_no_change
        self.validation_split = validation_split
        self.scoring = scoring
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def _validate_parameters(self, X):
        """Validate parameters passed to __init__.

        The parameters that are directly passed to the grower are checked in
        TreeGrower."""

        if self.loss not in self._VALID_LOSSES:
            raise ValueError(
                "Loss {} is not supported for {}. Accepted losses "
                "are {}.".format(self.loss, self.__class__.__name__,
                                 ', '.join(self._VALID_LOSSES)))

        if self.learning_rate <= 0:
            raise ValueError(f'learning_rate={self.learning_rate} must '
                             f'be strictly positive')
        if self.max_iter < 1:
            raise ValueError(f'max_iter={self.max_iter} must '
                             f'not be smaller than 1.')
        if self.n_iter_no_change is not None and self.n_iter_no_change < 0:
            raise ValueError(f'n_iter_no_change={self.n_iter_no_change} '
                             f'must be positive.')
        if self.validation_split is not None and self.validation_split <= 0:
            raise ValueError(f'validation_split={self.validation_split} '
                             f'must be strictly positive, or None.')
        if self.tol is not None and self.tol < 0:
            raise ValueError(f'tol={self.tol} '
                             f'must not be smaller than 0.')
        if X.dtype == np.uint8:  # pre-binned data
            max_bin_index = X.max()
            if self.max_bins < max_bin_index + 1:
                raise ValueError(
                    f'max_bins is set to {self.max_bins} but the data is '
                    f'pre-binned with {max_bin_index + 1} bins.')

    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is
            assumed to be pre-binned and the prediction methods (``predict``,
            ``predict_proba``) will only accept pre-binned data as well.
        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        self.multi_output = len(y.ravel()) != len(y)
        if self.multi_output:
            self.prediction_dim = y.shape[1]
        else:
            self.prediction_dim = 1
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64, np.uint8],
                         multi_output=self.multi_output)
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.')
        rng = check_random_state(self.random_state)

        self._validate_parameters(X)
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if X.dtype == np.uint8:  # data is pre-binned
            if self.verbose:
                print("X is pre-binned.")
            X_binned = X
            self.bin_mapper_ = None
            numerical_thresholds = None
            n_bins_per_feature = X.max(axis=0).astype(np.uint32)
        else:
            if self.verbose:
                print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                      end="", flush=True)
            tic = time()
            self.bin_mapper_ = BinMapper(max_bins=self.max_bins,
                                         random_state=rng)
            X_binned = self.bin_mapper_.fit_transform(X)
            numerical_thresholds = self.bin_mapper_.numerical_thresholds_
            n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
            toc = time()
            if self.verbose:
                duration = toc - tic
                throughput = X.nbytes / duration
                print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        do_early_stopping = (self.n_iter_no_change is not None and
                             self.n_iter_no_change > 0)

        if do_early_stopping and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=stratify, random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.')
            # Predicting is faster on C-contiguous arrays, training is faster
            # on Fortran arrays.
            X_binned_val = np.ascontiguousarray(X_binned_val)
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        if do_early_stopping:
            subsample_size = 10000
            n_samples_train = X_binned_train.shape[0]
            if n_samples_train > subsample_size:
                indices = rng.choice(X_binned_train.shape[0], subsample_size)
                X_binned_small_train = X_binned_train[indices]
                y_small_train = y_train[indices]
            else:
                X_binned_small_train = X_binned_train
                y_small_train = y_train
            # Predicting is faster on C-contiguous arrays.
            X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        self.baseline_prediction_ = self.loss_.get_baseline_prediction(
            y_train, self.prediction_dim)
        # raw_predictions are the accumulated values predicted by the trees
        # for the training data.
        raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                                   dtype=self.baseline_prediction_.dtype)
        if not self.multi_output:
            raw_predictions = raw_predictions.ravel()
        raw_predictions += self.baseline_prediction_

        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples, prediction_dim=self.prediction_dim)
        if not self.multi_output:
            gradients = gradients.ravel()

        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        # scorer_ is a callable with signature (est, X, y) and calls
        # est.predict() or est.predict_proba() depending on its nature.
        self.scorer_ = check_scoring(self, self.scoring)
        self.train_scores_ = []
        self.validation_scores_ = []
        if do_early_stopping:
            # Add predictions of the initial model (before the first tree)
            self.train_scores_.append(
                self._get_scores(X_binned_train, y_train))

            if self.validation_split is not None:
                self.validation_scores_.append(
                    self._get_scores(X_binned_val, y_val))

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])

            if self.multi_output:
                proj_gradients, proj_hessians = \
                    self.randomly_project_gradients_and_hessians(gradients,
                                                                 hessians)
            else:
                proj_gradients, proj_hessians = (gradients.ravel(),
                                                 hessians.ravel())

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                    np.array_split(proj_gradients,
                                   self.n_trees_per_iteration_),
                    np.array_split(proj_hessians,
                                   self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to
                # the whole array.

                grower = TreeGrower(
                    X_binned_train, gradients_at_k, hessians_at_k,
                    max_bins=self.max_bins,
                    n_bins_per_feature=n_bins_per_feature,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                if self.multi_output:
                    for l in grower.finalized_leaves:
                        l.residual = (
                            -self.learning_rate
                            * np.sum(a=gradients[l.sample_indices, :], axis=0)
                            / (l.sum_hessians + self.l2_regularization
                               + np.finfo(np.float64).eps))
                    leaves_data = [(l.residual, l.sample_indices)
                                   for l in grower.finalized_leaves]
                else:
                    leaves_data = [(l.value, l.sample_indices)
                                   for l in grower.finalized_leaves]

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(numerical_thresholds)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                _update_raw_predictions(leaves_data, raw_predictions)
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_early_stop = False
            if do_early_stopping:
                should_early_stop = self._check_early_stopping(
                    X_binned_small_train, y_small_train,
                    X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time,
                                            do_early_stopping)

            if should_early_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        self.validation_scores_ = np.asarray(self.validation_scores_)
        return self

    def _check_early_stopping(self, X_binned_train, y_train,
                              X_binned_val, y_val):
        """Check if fitting should be early-stopped.

        Scores are computed on validation data or on training data.
        """

        self.train_scores_.append(
            self._get_scores(X_binned_train, y_train))

        if self.validation_split is not None:
            self.validation_scores_.append(
                self._get_scores(X_binned_val, y_val))
            return self._should_stop(self.validation_scores_)

        return self._should_stop(self.train_scores_)

    def _should_stop(self, scores):
        """
        Return True (do early stopping) if the last n scores aren't better
        than the (n-1)th-to-last score, up to some tolerance.
        """
        reference_position = self.n_iter_no_change + 1
        if len(scores) < reference_position:
            return False

        # A higher score is always better. Higher tol means that it will be
        # harder for subsequent iteration to be considered an improvement upon
        # the reference score, and therefore it is more likely to early stop
        # because of the lack of significant improvement.
        tol = 0 if self.tol is None else self.tol
        reference_score = scores[-reference_position] + tol
        recent_scores = scores[-reference_position + 1:]
        recent_improvements = [score > reference_score
                               for score in recent_scores]
        return not any(recent_improvements)

    def _get_scores(self, X, y):
        """Compute scores on data X with target y.

        Scores are either computed with a scorer if scoring parameter is not
        None, else with the loss. As higher is always better, we return
        -loss_value.
        """
        if self.scoring is not None:
            return self.scorer_(self, X, y)

        # Else, use the negative loss as score.
        if self.multi_output:
            raw_predictions = self._raw_predict_multi(X)
        else:
            raw_predictions = self._raw_predict(X)
        return -self.loss_(y, raw_predictions)

    def _print_iteration_stats(self, iteration_start_time, do_early_stopping):
        """Print info about the current fitting iteration."""
        log_msg = ''

        predictors_of_ith_iteration = [
            predictors_list for predictors_list in self.predictors_[-1]
            if predictors_list
        ]
        n_trees = len(predictors_of_ith_iteration)
        max_depth = max(predictor.get_max_depth()
                        for predictor in predictors_of_ith_iteration)
        n_leaves = sum(predictor.get_n_leaf_nodes()
                       for predictor in predictors_of_ith_iteration)

        if n_trees == 1:
            log_msg += (f"{n_trees} tree, {n_leaves} leaves, ")
        else:
            log_msg += (f"{n_trees} trees, {n_leaves} leaves ")
            log_msg += (f"({int(n_leaves / n_trees)} on avg), ")

        log_msg += f"max depth = {max_depth}, "

        if do_early_stopping:
            log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, "
            if self.validation_split is not None:
                log_msg += (f"{self.scoring} val: "
                            f"{self.validation_scores_[-1]:.5f}, ")

        iteration_time = time() - iteration_start_time
        log_msg += f"in {iteration_time:0.3f}s"

        print(log_msg)

    def _raw_predict(self, X):
        """Return the sum of the leaves values over all predictors.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is
            assumed to be pre-binned and the estimator must have been fitted
            with pre-binned data.

        Returns
        -------
        raw_predictions : array, shape (n_samples * n_trees_per_iteration,)
            The raw predicted values.
        """
        X = check_array(X)
        check_is_fitted(self, 'predictors_')
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f'X has {X.shape[1]} features but this estimator was '
                f'trained with {self.n_features_} features.')
        is_binned = X.dtype == np.uint8
        if not is_binned and self.bin_mapper_ is None:
            raise ValueError(
                'This estimator was fitted with pre-binned data and '
                'can only predict pre-binned data as well. If your data *is* '
                'already pre-binned, convert it to uint8 using e.g. '
                'X.astype(np.uint8). If the data passed to fit() was *not* '
                'pre-binned, convert it to float32 and call fit() again.')
        n_samples = X.shape[0]
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=self.baseline_prediction_.dtype)
        raw_predictions += self.baseline_prediction_
        # Should we parallelize this?
        for predictors_of_ith_iteration in self.predictors_:
            for k, predictor in enumerate(predictors_of_ith_iteration):
                predict = (predictor.predict_binned if is_binned
                           else predictor.predict)
                raw_predictions[:, k] += predict(X)

        return raw_predictions

    def _raw_predict_multi(self, X):
        """Return the sum of the leaves values over all predictors.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is
            assumed to be pre-binned and the estimator must have been fitted
            with pre-binned data.

        Returns
        -------
        raw_predictions : array, shape (n_samples * n_trees_per_iteration,)
            The raw predicted values.
        """
        X = check_array(X)
        check_is_fitted(self, 'predictors_')
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f'X has {X.shape[1]} features but this estimator was '
                f'trained with {self.n_features_} features.')
        is_binned = X.dtype == np.uint8
        if not is_binned and self.bin_mapper_ is None:
            raise ValueError(
                'This estimator was fitted with pre-binned data and '
                'can only predict pre-binned data as well. If your data *is* '
                'already pre-binned, convert it to uint8 using e.g. '
                'X.astype(np.uint8). If the data passed to fit() was *not* '
                'pre-binned, convert it to float32 and call fit() again.')
        n_samples = X.shape[0]
        raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                                   dtype=self.baseline_prediction_.dtype)
        raw_predictions += self.baseline_prediction_
        # Should we parallelize this?
        for predictors_of_ith_iteration in self.predictors_:
            for k, predictor in enumerate(predictors_of_ith_iteration):
                predict = (predictor.predict_binned_multi if is_binned
                           else predictor.predict_multi)
                tmp = predict(X, self.prediction_dim)
                if tmp.dtype != 'float32':
                    print(tmp)
                raw_predictions = np.add(raw_predictions,
                                         predict(X, self.prediction_dim))
        return raw_predictions

    def randomly_project_gradients_and_hessians(self, gradients, hessians):
        proj_g = SparseRandomProjection(
            n_components=1,
            random_state=self.random_state).fit_transform(X=gradients)
        # hessians are left untouched; a projection such as
        # SparseRandomProjection(n_components=1,
        #     random_state=self.random_state).fit_transform(X=hessians)
        # is not applied here.
        proj_h = hessians
        return proj_g.ravel().astype(np.float32), proj_h.astype(np.float32)

    @abstractmethod
    def _get_loss(self):
        pass

    @abstractmethod
    def _encode_y(self, y=None):
        pass

    @property
    def n_iter_(self):
        check_is_fitted(self, 'predictors_')
        return len(self.predictors_)
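
# The early-stopping rule in _should_stop compares the last n_iter_no_change
# scores against the score recorded just before them. A small standalone
# re-implementation with a worked example (same logic, written outside the
# class purely for illustration):

def should_stop_sketch(scores, n_iter_no_change=5, tol=1e-7):
    """Stop when none of the last n_iter_no_change scores beats the
    reference score (higher is better), up to tol."""
    reference_position = n_iter_no_change + 1
    if len(scores) < reference_position:
        return False
    reference_score = scores[-reference_position] + tol
    recent_scores = scores[-reference_position + 1:]
    return not any(score > reference_score for score in recent_scores)

# Scores keep improving -> keep fitting:
assert not should_stop_sketch([0.5, 0.6, 0.7, 0.75, 0.78, 0.8])
# Scores have plateaued for 5 iterations -> stop:
assert should_stop_sketch([0.8, 0.8, 0.8, 0.8, 0.8, 0.8])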
def test_bin_mapper_identity_small(n_bins, scale, offset):
    data = np.arange(n_bins).reshape(-1, 1) * scale + offset
    binned = BinMapper(max_bins=n_bins).fit_transform(data)
    assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1))
def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier):
    data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
    binned = BinMapper(max_bins=n_bins).fit_transform(data)
    assert_array_equal(data, binned)
class BaseGradientBoostingMachine(BaseEstimator, ABC):
    """Base class for gradient boosting estimators."""

    @abstractmethod
    def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
                 max_depth, min_samples_leaf, l2_regularization, max_bins,
                 scoring, validation_split, n_iter_no_change, tol, verbose,
                 random_state):
        self.loss = loss
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_iter_no_change = n_iter_no_change
        self.validation_split = validation_split
        self.scoring = scoring
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

    def _validate_parameters(self):
        """Validate parameters passed to __init__.

        The parameters that are directly passed to the grower are checked in
        TreeGrower."""

        if self.loss not in self._VALID_LOSSES:
            raise ValueError(
                "Loss {} is not supported for {}. Accepted losses "
                "are {}.".format(self.loss, self.__class__.__name__,
                                 ', '.join(self._VALID_LOSSES)))

        if self.learning_rate <= 0:
            raise ValueError(f'learning_rate={self.learning_rate} must '
                             f'be strictly positive')
        if self.max_iter < 1:
            raise ValueError(f'max_iter={self.max_iter} must '
                             f'not be smaller than 1.')
        if self.n_iter_no_change < 2:
            raise ValueError(f'n_iter_no_change={self.n_iter_no_change} '
                             f'must not be smaller than 2.')
        if self.validation_split is not None and self.validation_split <= 0:
            raise ValueError(f'validation_split={self.validation_split} '
                             f'must be strictly positive, or None.')
        if self.tol is not None and self.tol < 0:
            raise ValueError(f'tol={self.tol} '
                             f'must not be smaller than 0.')

    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.
        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """
        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        # TODO: test input checking
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported '
                'yet. See numba issue #3569.'
            )
        rng = check_random_state(self.random_state)

        self._validate_parameters()
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        if self.scoring is not None and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=stratify, random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.'
                )
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(
                np.arange(X_binned_train.shape[0]), subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        # values predicted by the trees. Used as-is in regression, and
        # transformed into probas and / or classes for classification
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=y_train.dtype
        )
        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            n_trees_per_iteration=self.n_trees_per_iteration_
        )
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        scorer = check_scoring(self, self.scoring)
        self.train_scores_ = []
        if self.scoring is not None:
            # Add predictions of the initial model (before the first tree)
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._sign * scorer._score_func(y_train,
                                                            predicted_train)
            self.train_scores_.append(score_train)

            if self.validation_split is not None:
                self.validation_scores_ = []
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._sign * scorer._score_func(y_val,
                                                              predicted_val)
                self.validation_scores_.append(score_val)

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                    np.array_split(gradients, self.n_trees_per_iteration_),
                    np.array_split(hessians, self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to
                # the whole array.
                grower = TreeGrower(
                    X_binned_train, gradients_at_k, hessians_at_k,
                    max_bins=self.max_bins,
                    n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_)
                predictors[-1].append(predictor)

                tic_pred = time()
                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]
                _update_raw_predictions(leaves_data, raw_predictions[:, k])
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_stop = self._check_early_stopping(
                scorer, X_binned_small_train, y_small_train,
                X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time)

            if should_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.scoring is not None and self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self

    def _check_early_stopping(self, scorer, X_binned_train, y_train,
                              X_binned_val, y_val):
        """Check if fitting should be early-stopped.

        Return True (do early stopping) if the score at iteration i hasn't
        improved any of the last n_iter_no_change scores by at least tol
        percent. Scores are computed on validation data or on training data.
        """
        if self.scoring is None:  # no early stopping.
            # In sklearn early stopping is not done if n_iter_no_change is
            # None
            return False

        def _should_stop(scores):
            if len(scores) - 1 < self.n_iter_no_change:
                # - 1 because scores[0] is for the init model before the
                # first tree.
                return False
            current_score = scores[-1]  # score at current iteration
            previous_scores = scores[-self.n_iter_no_change:-1]
            return all(
                current_score < prev_score * (1 + self.tol * scorer._sign)
                for prev_score in previous_scores
            )

        # TODO: make sure that self.predict can work on binned data and
        # then only use the public scorer.__call__.
        predicted_train = self._predict_binned(X_binned_train)
        score_train = scorer._sign * scorer._score_func(y_train,
                                                        predicted_train)
        self.train_scores_.append(score_train)

        if self.validation_split is not None:
            predicted_val = self._predict_binned(X_binned_val)
            score_val = scorer._sign * scorer._score_func(y_val,
                                                          predicted_val)
            self.validation_scores_.append(score_val)
            return _should_stop(self.validation_scores_)

        return _should_stop(self.train_scores_)

    def _print_iteration_stats(self, iteration_start_time):
        """Print info about the current fitting iteration."""
        log_msg = ''

        predictors_of_ith_iteration = [
            predictors_list for predictors_list in self.predictors_[-1]
            if predictors_list
        ]
        n_trees = len(predictors_of_ith_iteration)
        max_depth = max(predictor.get_max_depth()
                        for predictor in predictors_of_ith_iteration)
        n_leaves = sum(predictor.get_n_leaf_nodes()
                       for predictor in predictors_of_ith_iteration)

        if n_trees == 1:
            log_msg += (f"{n_trees} tree, {n_leaves} leaves, ")
        else:
            log_msg += (f"{n_trees} trees, {n_leaves} leaves ")
            log_msg += (f"({int(n_leaves / n_trees)} on avg), ")

        log_msg += f"max depth = {max_depth}, "

        if self.scoring is not None:
            log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, "
            if self.validation_split is not None:
                log_msg += (f"{self.scoring} val: "
                            f"{self.validation_scores_[-1]:.5f}, ")

        iteration_time = time() - iteration_start_time
        log_msg += f"in {iteration_time:0.3f}s"

        print(log_msg)

    def _raw_predict(self, X, binned=False):
        """Return the sum of the leaves values over all predictors.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.
        binned : bool, optional (default=False)
            If True, X is considered to be already binned.

        Returns
        -------
        raw_predictions : array, shape=(n_samples, n_trees_per_iteration)
            The raw predicted values.
        """
        X = check_array(X)
        check_is_fitted(self, 'predictors_')
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f'X has {X.shape[1]} features but this estimator was '
                f'trained with {self.n_features_} features.'
            )
        n_samples = X.shape[0]
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=np.float32
        )
        # Should we parallelize this?
        for predictors_of_ith_iteration in self.predictors_:
            for k, predictor in enumerate(predictors_of_ith_iteration):
                predict = (predictor.predict_binned if binned
                           else predictor.predict)
                raw_predictions[:, k] += predict(X)

        return raw_predictions

    @abstractmethod
    def _get_loss(self):
        pass

    @abstractmethod
    def _encode_y(self, y=None):
        pass

    @property
    def n_iter_(self):
        check_is_fitted(self, 'predictors_')
        return len(self.predictors_)

    @abstractmethod
    def _predict_binned(self, X_binned):
        pass
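
# Illustrative sketch, not part of the module: the minimal surface a concrete
# estimator has to add on top of BaseGradientBoostingMachine is a
# _VALID_LOSSES tuple, __init__, _encode_y() (a natural place to set
# n_trees_per_iteration_), _get_loss() and _predict_binned(). Everything
# below is hypothetical wiring under these assumptions: numpy is imported as
# np, RegressorMixin comes from sklearn.base, and the loss object only needs
# the init_/update_gradients_and_hessians API that fit() relies on. The real
# loss implementations live in the losses module and are numba-compiled;
# this stand-in is plain numpy.
class _LeastSquaresSketch:
    """Hypothetical least-squares loss matching the API used in fit()."""

    def init_gradients_and_hessians(self, n_samples, n_trees_per_iteration):
        gradients = np.empty(n_samples * n_trees_per_iteration,
                             dtype=np.float32)
        # Constant hessian: a single-element array, as in the tree tests.
        hessians = np.ones(1, dtype=np.float32)
        return gradients, hessians

    def update_gradients_and_hessians(self, gradients, hessians, y_true,
                                      raw_predictions):
        # d/dpred of 0.5 * (y - pred)**2 is (pred - y); hessians stay at 1.
        gradients[:] = raw_predictions.ravel() - y_true


class SketchGradientBoostingRegressor(BaseGradientBoostingMachine,
                                      RegressorMixin):
    """Hypothetical concrete subclass, shown only to document the hooks."""

    _VALID_LOSSES = ('least_squares',)

    def __init__(self, learning_rate=0.1, max_iter=100, max_leaf_nodes=31,
                 max_depth=None, min_samples_leaf=20, l2_regularization=0.,
                 max_bins=256, scoring=None, validation_split=0.1,
                 n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None):
        super().__init__(
            loss='least_squares', learning_rate=learning_rate,
            max_iter=max_iter, max_leaf_nodes=max_leaf_nodes,
            max_depth=max_depth, min_samples_leaf=min_samples_leaf,
            l2_regularization=l2_regularization, max_bins=max_bins,
            scoring=scoring, validation_split=validation_split,
            n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
            random_state=random_state)

    def _encode_y(self, y):
        # Regression: one tree per iteration; cast targets to float32 so
        # raw_predictions and gradients share a dtype.
        self.n_trees_per_iteration_ = 1
        return y.astype(np.float32, copy=False)

    def _get_loss(self):
        return _LeastSquaresSketch()

    def _predict_binned(self, X_binned):
        # For regression the summed leaf values are the predictions.
        return self._raw_predict(X_binned, binned=True).ravel()

    def predict(self, X):
        return self._raw_predict(X).ravel()

# Usage sketch:
#   est = SketchGradientBoostingRegressor(max_iter=10, random_state=0)
#   est.fit(X, y)  # X of shape (n_samples, n_features), both > 1
#   y_pred = est.predict(X)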