def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # Make sure root node isn't split if n_samples is not at least twice
    # min_samples_leaf
    rng = np.random.RandomState(seed=0)

    max_bins = 255

    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    mapper = BinMapper(max_bins=max_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    all_hessians = np.ones(shape=1, dtype=np.float32)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        max_bins=max_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    if n_samples >= min_samples_leaf * 2:
        assert len(grower.finalized_leaves) >= 2
    else:
        assert len(grower.finalized_leaves) == 1
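# The test above takes (n_samples, min_samples_leaf) as arguments, which
# implies a pytest parametrization that is not shown in this excerpt. A
# plausible sketch of the decorator that would precede it (the values are
# hypothetical, chosen to exercise both branches of the final assertion):
#
# @pytest.mark.parametrize('n_samples, min_samples_leaf', [
#     (99, 50),   # n_samples < 2 * min_samples_leaf: root is never split
#     (100, 50),  # n_samples >= 2 * min_samples_leaf: a split is allowed
# ])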
def test_pre_binned_data():
    # Make sure ValueError is raised when predictor.predict() is called while
    # the predictor does not have any numerical thresholds.
    X, y = make_regression()

    # Init gradients and hessians to that of least squares loss
    gradients = -y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    grower = TreeGrower(X_binned, gradients, hessians,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()
    predictor = grower.make_predictor(numerical_thresholds=None)

    assert_raises_regex(ValueError,
                        'This predictor does not have numerical thresholds',
                        predictor.predict, X)

    assert_raises_regex(ValueError, 'binned_data dtype should be uint8',
                        predictor.predict_binned, X)

    predictor.predict_binned(X_binned)  # No error

    predictor = grower.make_predictor(
        numerical_thresholds=mapper.numerical_thresholds_)
    assert_raises_regex(ValueError, 'X has uint8 dtype',
                        predictor.predict, X_binned)
def test_boston_dataset():
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    gradients = y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.75
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.65

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))
    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.75
    assert r2_score(y_test, predictor.predict(X_test)) > 0.65
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    X = rng.normal(size=(n_samples, 3))
    y = X[:, 0] - X[:, 1]
    if noise:
        y_scale = y.std()
        y += rng.normal(scale=noise, size=n_samples) * y_scale

    mapper = BinMapper(max_bins=n_bins)
    X = mapper.fit_transform(X)

    all_gradients = y.astype(np.float32)
    if constant_hessian:
        all_hessians = np.ones(shape=1, dtype=np.float32)
    else:
        all_hessians = np.ones_like(all_gradients)
    grower = TreeGrower(X, all_gradients, all_hessians,
                        max_bins=n_bins, shrinkage=1.,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=n_samples)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    if n_samples >= min_samples_leaf:
        for node in predictor.nodes:
            if node['is_leaf']:
                assert node['count'] >= min_samples_leaf
    else:
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
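# Note on the hessian arrays used throughout these tests: when the loss has
# a constant hessian (e.g. least squares), a size-1 array stands in for the
# full per-sample array, as the constant_hessian branch above shows. A
# minimal, self-contained sketch of how a grower could detect this
# convention (assumed; the detection code is not shown in this excerpt):
import numpy as np

all_hessians = np.ones(shape=1, dtype=np.float32)  # constant-hessian case
hessians_are_constant = all_hessians.shape[0] == 1
assert hessians_are_constant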
def test_boston_dataset(max_bins):
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=42)

    mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)
    X_test_binned = mapper.transform(X_test)

    # Init gradients and hessians to that of least squares loss
    gradients = -y_train.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    min_samples_leaf = 8
    max_leaf_nodes = 31
    grower = TreeGrower(X_train_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes, max_bins=max_bins,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()

    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.85
    assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.70

    assert_allclose(predictor.predict(X_train),
                    predictor.predict_binned(X_train_binned))
    assert_allclose(predictor.predict(X_test),
                    predictor.predict_binned(X_test_binned))

    assert r2_score(y_train, predictor.predict(X_train)) > 0.85
    assert r2_score(y_test, predictor.predict(X_test)) > 0.70
def test_plot_grower(tmpdir):
    pytest.importorskip('graphviz')
    from pygbm.plotting import plot_tree

    X_binned = BinMapper().fit_transform(X)
    gradients = np.asarray(y, dtype=np.float32).copy()
    hessians = np.ones(1, dtype=np.float32)
    grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=5)
    grower.grow()
    filename = tmpdir.join('plot_grower.pdf')
    plot_tree(grower, view=False, filename=filename)
    assert filename.exists()
def test_predictor_from_grower():
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    features_data, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins)
    grower = TreeGrower(features_data, all_gradients, all_hessians,
                        n_bins=n_bins, shrinkage=1.,
                        max_leaf_nodes=3, min_samples_leaf=5)
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    predictor = grower.make_predictor()
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes['is_leaf'].sum() == 3

    def predict(features):
        return predictor.predict_one_binned(
            np.array(features, dtype=np.uint8))

    # Probe some predictions for each leaf of the tree
    input_data = np.array([
        [0, 0],
        [42, 99],
        [128, 255],
        [129, 0],
        [129, 85],
        [255, 85],
        [129, 86],
        [129, 255],
        [242, 100],
    ], dtype=np.uint8)
    predictions = predictor.predict_binned(input_data)
    expected_targets = [-1, -1, -1, -1, -1, -1, 1, 1, 1]
    assert_array_almost_equal(predictions, expected_targets, decimal=5)

    # Check that training set can be recovered exactly:
    predictions = predictor.predict_binned(features_data)
    assert_array_almost_equal(predictions, all_gradients, decimal=5)
import numpy as np
from sklearn.datasets import make_classification

from pygbm.binning import BinMapper
from pygbm.grower import TreeGrower
from pygbm import plotting

rng = np.random.RandomState(0)

n_samples = int(1e7)
n_leaf_nodes = 5
X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                           n_informative=3, n_redundant=0, random_state=rng)

bin_mapper_ = BinMapper(random_state=rng)
X_binned = bin_mapper_.fit_transform(X)

gradients = np.asarray(y, dtype=np.float32).copy()
hessians = np.ones(1, dtype=np.float32)

# First run to trigger the compilation of numba jit methods to avoid recording
# the compiler overhead in the profile report.
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()

# New run to collect timing statistics that will be included in the plot.
grower = TreeGrower(X_binned, gradients, hessians,
                    max_leaf_nodes=n_leaf_nodes)
grower.grow()

plotting.plot_tree(grower)
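# Illustrative addition (not part of the original script): once the warm-up
# grow above has populated the numba JIT cache, timing one more grow shows
# the steady-state cost that the profile report is meant to capture. The
# absolute numbers are machine-dependent.
from time import time

tic = time()
TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=n_leaf_nodes).grow()
print(f"grow with warm JIT cache: {time() - tic:.3f}s")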
def fit(self, X, y):
    """Fit the gradient boosting model.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input samples.

    y : array-like, shape=(n_samples,)
        Target values.

    Returns
    -------
    self : object
    """
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    # TODO: add support for mixed-typed (numerical + categorical) data
    # TODO: add support for missing data
    # TODO: add support for pre-binned data (pass-through)?
    # TODO: test input checking
    X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
    y = self._encode_y(y)
    if X.shape[0] == 1 or X.shape[1] == 1:
        raise ValueError(
            'Passing only one sample or one feature is not supported yet. '
            'See numba issue #3569.'
        )
    rng = check_random_state(self.random_state)

    self._validate_parameters()
    self.n_features_ = X.shape[1]  # used for validation in predict()

    if self.verbose:
        print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
              flush=True)
    tic = time()
    self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
    X_binned = self.bin_mapper_.fit_transform(X)
    toc = time()
    if self.verbose:
        duration = toc - tic
        throughput = X.nbytes / duration
        print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

    self.loss_ = self._get_loss()

    if self.scoring is not None and self.validation_split is not None:
        # stratify for classification
        stratify = y if hasattr(self.loss_, 'predict_proba') else None

        X_binned_train, X_binned_val, y_train, y_val = train_test_split(
            X_binned, y, test_size=self.validation_split,
            stratify=stratify, random_state=rng)
        if X_binned_train.size == 0 or X_binned_val.size == 0:
            raise ValueError(
                f'Not enough data (n_samples={X_binned.shape[0]}) to '
                f'perform early stopping with validation_split='
                f'{self.validation_split}. Use more training data or '
                f'adjust validation_split.'
            )
        # Histogram computation is faster on feature-aligned data.
        X_binned_train = np.asfortranarray(X_binned_train)
    else:
        X_binned_train, y_train = X_binned, y
        X_binned_val, y_val = None, None

    # Subsample the training set for score-based monitoring.
    subsample_size = 10000
    if X_binned_train.shape[0] < subsample_size:
        X_binned_small_train = np.ascontiguousarray(X_binned_train)
        y_small_train = y_train
    else:
        indices = rng.choice(
            np.arange(X_binned_train.shape[0]), subsample_size)
        X_binned_small_train = X_binned_train[indices]
        y_small_train = y_train[indices]

    if self.verbose:
        print("Fitting gradient boosted rounds:")

    n_samples = X_binned_train.shape[0]
    # values predicted by the trees. Used as-is in regression, and
    # transformed into probas and / or classes for classification
    raw_predictions = np.zeros(
        shape=(n_samples, self.n_trees_per_iteration_),
        dtype=y_train.dtype
    )
    # gradients and hessians are 1D arrays of size
    # n_samples * n_trees_per_iteration
    gradients, hessians = self.loss_.init_gradients_and_hessians(
        n_samples=n_samples,
        n_trees_per_iteration=self.n_trees_per_iteration_
    )
    # predictors_ is a matrix of TreePredictor objects with shape
    # (n_iter_, n_trees_per_iteration)
    self.predictors_ = predictors = []
    scorer = check_scoring(self, self.scoring)
    self.train_scores_ = []
    if self.scoring is not None:
        # Add predictions of the initial model (before the first tree)
        predicted_train = self._predict_binned(X_binned_train)
        score_train = scorer._sign * scorer._score_func(y_train,
                                                        predicted_train)
        self.train_scores_.append(score_train)

        if self.validation_split is not None:
            self.validation_scores_ = []
            predicted_val = self._predict_binned(X_binned_val)
            score_val = scorer._sign * scorer._score_func(y_val,
                                                          predicted_val)
            self.validation_scores_.append(score_val)

    for iteration in range(self.max_iter):
        if self.verbose:
            iteration_start_time = time()
            print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                  flush=True)

        # Update gradients and hessians, inplace
        self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                 y_train, raw_predictions)
        predictors.append([])

        # Build `n_trees_per_iteration` trees.
        for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                np.array_split(gradients, self.n_trees_per_iteration_),
                np.array_split(hessians, self.n_trees_per_iteration_))):
            # the xxxx_at_k arrays are **views** on the original arrays.
            # Note that for binary classif and regressions,
            # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to
            # the whole array.
            grower = TreeGrower(
                X_binned_train, gradients_at_k, hessians_at_k,
                max_bins=self.max_bins,
                n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                max_leaf_nodes=self.max_leaf_nodes,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                l2_regularization=self.l2_regularization,
                shrinkage=self.learning_rate)
            grower.grow()

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time

            predictor = grower.make_predictor(
                bin_thresholds=self.bin_mapper_.bin_thresholds_)
            predictors[-1].append(predictor)

            tic_pred = time()
            # prepare leaves_data so that _update_raw_predictions can be
            # @njitted
            leaves_data = [(l.value, l.sample_indices)
                           for l in grower.finalized_leaves]
            _update_raw_predictions(leaves_data, raw_predictions[:, k])
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

        should_stop = self._check_early_stopping(
            scorer, X_binned_small_train, y_small_train,
            X_binned_val, y_val)

        if self.verbose:
            self._print_iteration_stats(iteration_start_time)

        if should_stop:
            break

    if self.verbose:
        duration = time() - fit_start_time
        n_total_leaves = sum(
            predictor.get_n_leaf_nodes()
            for predictors_at_ith_iteration in self.predictors_
            for predictor in predictors_at_ith_iteration)
        n_predictors = sum(
            len(predictors_at_ith_iteration)
            for predictors_at_ith_iteration in self.predictors_)
        print(f"Fit {n_predictors} trees in {duration:.3f} s, "
              f"({n_total_leaves} total leaves)")
        print(f"{'Time spent finding best splits:':<32} "
              f"{acc_find_split_time:.3f}s")
        print(f"{'Time spent applying splits:':<32} "
              f"{acc_apply_split_time:.3f}s")
        print(f"{'Time spent predicting:':<32} "
              f"{acc_prediction_time:.3f}s")

    self.train_scores_ = np.asarray(self.train_scores_)
    if self.scoring is not None and self.validation_split is not None:
        self.validation_scores_ = np.asarray(self.validation_scores_)
    return self
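# A minimal usage sketch of the estimator whose fit() is shown above. The
# public class name (GradientBoostingRegressor) and its defaults are
# assumptions not confirmed by this excerpt; only parameters that fit()
# actually reads are passed.
import numpy as np
from sklearn.datasets import make_regression
from pygbm import GradientBoostingRegressor

X, y = make_regression(n_samples=10_000, n_features=10, random_state=0)
est = GradientBoostingRegressor(max_iter=50, max_leaf_nodes=31,
                                scoring='neg_mean_squared_error',
                                validation_split=0.1, random_state=0)
est.fit(X, y)  # bins X once, then grows one tree per boosting iteration
print(est.train_scores_[-1], est.validation_scores_[-1])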
def fit(self, X, y):
    """Fit the gradient boosting model.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input samples. If ``X.dtype == np.uint8``, the data is
        assumed to be pre-binned and the prediction methods
        (``predict``, ``predict_proba``) will only accept pre-binned
        data as well.

    y : array-like, shape=(n_samples,)
        Target values.

    Returns
    -------
    self : object
    """
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    # TODO: add support for mixed-typed (numerical + categorical) data
    # TODO: add support for missing data

    self.multi_output = len(y.ravel()) != len(y)
    if self.multi_output:
        self.prediction_dim = y.shape[1]
    else:
        self.prediction_dim = 1

    X, y = check_X_y(X, y, dtype=[np.float32, np.float64, np.uint8],
                     multi_output=self.multi_output)
    y = self._encode_y(y)
    if X.shape[0] == 1 or X.shape[1] == 1:
        raise ValueError(
            'Passing only one sample or one feature is not supported yet. '
            'See numba issue #3569.')
    rng = check_random_state(self.random_state)

    self._validate_parameters(X)
    self.n_features_ = X.shape[1]  # used for validation in predict()

    if X.dtype == np.uint8:  # data is pre-binned
        if self.verbose:
            print("X is pre-binned.")
        X_binned = X
        self.bin_mapper_ = None
        numerical_thresholds = None
        n_bins_per_feature = X.max(axis=0).astype(np.uint32)
    else:
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins,
                                     random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        numerical_thresholds = self.bin_mapper_.numerical_thresholds_
        n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

    self.loss_ = self._get_loss()

    do_early_stopping = (self.n_iter_no_change is not None
                         and self.n_iter_no_change > 0)

    if do_early_stopping and self.validation_split is not None:
        # stratify for classification
        stratify = y if hasattr(self.loss_, 'predict_proba') else None

        X_binned_train, X_binned_val, y_train, y_val = train_test_split(
            X_binned, y, test_size=self.validation_split,
            stratify=stratify, random_state=rng)
        if X_binned_train.size == 0 or X_binned_val.size == 0:
            raise ValueError(
                f'Not enough data (n_samples={X_binned.shape[0]}) to '
                f'perform early stopping with validation_split='
                f'{self.validation_split}. Use more training data or '
                f'adjust validation_split.')
        # Predicting is faster on C-contiguous arrays, training is faster
        # on Fortran arrays.
        X_binned_val = np.ascontiguousarray(X_binned_val)
        X_binned_train = np.asfortranarray(X_binned_train)
    else:
        X_binned_train, y_train = X_binned, y
        X_binned_val, y_val = None, None

    # Subsample the training set for score-based monitoring.
    if do_early_stopping:
        subsample_size = 10000
        n_samples_train = X_binned_train.shape[0]
        if n_samples_train > subsample_size:
            indices = rng.choice(X_binned_train.shape[0], subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]
        else:
            X_binned_small_train = X_binned_train
            y_small_train = y_train
        # Predicting is faster on C-contiguous arrays.
        X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

    if self.verbose:
        print("Fitting gradient boosted rounds:")

    n_samples = X_binned_train.shape[0]
    self.baseline_prediction_ = self.loss_.get_baseline_prediction(
        y_train, self.prediction_dim)
    # raw_predictions are the accumulated values predicted by the trees
    # for the training data.
    raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                               dtype=self.baseline_prediction_.dtype)
    if not self.multi_output:
        raw_predictions = raw_predictions.ravel()
    raw_predictions += self.baseline_prediction_

    # gradients and hessians are 1D arrays of size
    # n_samples * n_trees_per_iteration
    gradients, hessians = self.loss_.init_gradients_and_hessians(
        n_samples=n_samples, prediction_dim=self.prediction_dim)
    if not self.multi_output:
        gradients = gradients.ravel()

    # predictors_ is a matrix of TreePredictor objects with shape
    # (n_iter_, n_trees_per_iteration)
    self.predictors_ = predictors = []

    # scorer_ is a callable with signature (est, X, y) and calls
    # est.predict() or est.predict_proba() depending on its nature.
    self.scorer_ = check_scoring(self, self.scoring)
    self.train_scores_ = []
    self.validation_scores_ = []
    if do_early_stopping:
        # Add predictions of the initial model (before the first tree)
        self.train_scores_.append(
            self._get_scores(X_binned_train, y_train))

        if self.validation_split is not None:
            self.validation_scores_.append(
                self._get_scores(X_binned_val, y_val))

    for iteration in range(self.max_iter):
        if self.verbose:
            iteration_start_time = time()
            print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                  flush=True)

        # Update gradients and hessians, inplace
        self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                 y_train, raw_predictions)
        predictors.append([])

        if self.multi_output:
            proj_gradients, proj_hessians = \
                self.randomly_project_gradients_and_hessians(
                    gradients, hessians)
        else:
            proj_gradients, proj_hessians = (gradients.ravel(),
                                             hessians.ravel())

        # Build `n_trees_per_iteration` trees.
        for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                np.array_split(proj_gradients,
                               self.n_trees_per_iteration_),
                np.array_split(proj_hessians,
                               self.n_trees_per_iteration_))):
            # the xxxx_at_k arrays are **views** on the original arrays.
            # Note that for binary classif and regressions,
            # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to
            # the whole array.
            grower = TreeGrower(
                X_binned_train, gradients_at_k, hessians_at_k,
                max_bins=self.max_bins,
                n_bins_per_feature=n_bins_per_feature,
                max_leaf_nodes=self.max_leaf_nodes,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                l2_regularization=self.l2_regularization,
                shrinkage=self.learning_rate)
            grower.grow()

            if self.multi_output:
                for l in grower.finalized_leaves:
                    l.residual = (
                        -self.learning_rate
                        * np.sum(a=gradients[l.sample_indices, :], axis=0)
                        / (l.sum_hessians + self.l2_regularization
                           + np.finfo(np.float64).eps))
                leaves_data = [(l.residual, l.sample_indices)
                               for l in grower.finalized_leaves]
            else:
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time

            predictor = grower.make_predictor(numerical_thresholds)
            predictors[-1].append(predictor)

            tic_pred = time()
            # prepare leaves_data so that _update_raw_predictions can be
            # @njitted
            _update_raw_predictions(leaves_data, raw_predictions)
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

        should_early_stop = False
        if do_early_stopping:
            should_early_stop = self._check_early_stopping(
                X_binned_small_train, y_small_train,
                X_binned_val, y_val)

        if self.verbose:
            self._print_iteration_stats(iteration_start_time,
                                        do_early_stopping)

        if should_early_stop:
            break

    if self.verbose:
        duration = time() - fit_start_time
        n_total_leaves = sum(
            predictor.get_n_leaf_nodes()
            for predictors_at_ith_iteration in self.predictors_
            for predictor in predictors_at_ith_iteration)
        n_predictors = sum(
            len(predictors_at_ith_iteration)
            for predictors_at_ith_iteration in self.predictors_)
        print(f"Fit {n_predictors} trees in {duration:.3f} s, "
              f"({n_total_leaves} total leaves)")
        print(f"{'Time spent finding best splits:':<32} "
              f"{acc_find_split_time:.3f}s")
        print(f"{'Time spent applying splits:':<32} "
              f"{acc_apply_split_time:.3f}s")
        print(f"{'Time spent predicting:':<32} "
              f"{acc_prediction_time:.3f}s")

    self.train_scores_ = np.asarray(self.train_scores_)
    self.validation_scores_ = np.asarray(self.validation_scores_)
    return self
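# The multi-output branch above relies on a
# randomly_project_gradients_and_hessians() method that is not shown in
# this excerpt. A hypothetical sketch of the idea, written as a free
# function: collapse the (n_samples, prediction_dim) gradient matrix onto a
# random unit direction so a single scalar-response tree can be grown; the
# true multi-output leaf values are then recomputed from the full
# `gradients` matrix (the `l.residual` update in the loop above).
import numpy as np
from sklearn.utils import check_random_state

def randomly_project_gradients_and_hessians(gradients, hessians,
                                            random_state=None):
    rng = check_random_state(random_state)
    direction = rng.normal(size=gradients.shape[1])
    direction /= np.linalg.norm(direction)  # random unit vector
    proj_gradients = (gradients @ direction).astype(np.float32)
    # With a constant-hessian loss, hessians has size 1 and is passed
    # through unchanged.
    return proj_gradients, hessians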
def fit(self, X, y):
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    # TODO: add support for mixed-typed (numerical + categorical) data
    # TODO: add support for missing data
    # TODO: add support for pre-binned data (pass-through)?
    X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
    y = y.astype(np.float32, copy=False)
    rng = check_random_state(self.random_state)
    if self.verbose:
        print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
              flush=True)
    tic = time()
    self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
    X_binned = self.bin_mapper_.fit_transform(X)
    toc = time()
    if self.verbose:
        duration = toc - tic
        throughput = X.nbytes / duration
        print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

    if self.validation_split is not None:
        X_binned_train, X_binned_val, y_train, y_val = train_test_split(
            X_binned, y, test_size=self.validation_split,
            stratify=y, random_state=rng)
        # Histogram computation is faster on feature-aligned data.
        X_binned_train = np.asfortranarray(X_binned_train)
    else:
        X_binned_train, y_train = X_binned, y
        X_binned_val, y_val = None, None

    # Subsample the training set for score-based monitoring.
    subsample_size = 10000
    if X_binned_train.shape[0] < subsample_size:
        X_binned_small_train = np.ascontiguousarray(X_binned_train)
        y_small_train = y_train
    else:
        indices = rng.choice(np.arange(X_binned_train.shape[0]),
                             subsample_size)
        X_binned_small_train = X_binned_train[indices]
        y_small_train = y_train[indices]

    if self.verbose:
        print("Fitting gradient boosted rounds:")

    # TODO: plug custom loss functions
    y_pred = np.zeros_like(y_train, dtype=np.float32)
    gradients = np.asarray(y_train, dtype=np.float32).copy()
    hessians = np.ones(1, dtype=np.float32)
    self.predictors_ = predictors = []
    self.train_scores_ = []
    if self.validation_split is not None:
        self.validation_scores_ = []
    scorer = check_scoring(self, self.scoring)
    gb_start_time = time()
    # TODO: compute training loss and use it for early stopping if no
    # validation data is provided?
    self.n_iter_ = 0
    while True:
        should_stop = self._stopping_criterion(
            gb_start_time, scorer, X_binned_small_train, y_small_train,
            X_binned_val, y_val)
        if should_stop or self.n_iter_ == self.max_iter:
            break
        shrinkage = 1. if self.n_iter_ == 0 else self.learning_rate
        grower = TreeGrower(X_binned_train, gradients, hessians,
                            n_bins=self.max_bins,
                            max_leaf_nodes=self.max_leaf_nodes,
                            max_depth=self.max_depth,
                            min_samples_leaf=self.min_samples_leaf,
                            shrinkage=shrinkage)
        grower.grow()
        predictor = grower.make_predictor(
            bin_thresholds=self.bin_mapper_.bin_thresholds_)
        predictors.append(predictor)
        self.n_iter_ += 1

        tic_pred = time()
        leaves_data = [(l.value, l.sample_indices)
                       for l in grower.finalized_leaves]
        _update_y_pred(leaves_data, y_pred)
        gradients = y_train - y_pred
        toc_pred = time()
        acc_prediction_time += toc_pred - tic_pred

        acc_apply_split_time += grower.total_apply_split_time
        acc_find_split_time += grower.total_find_split_time

    if self.verbose:
        duration = time() - fit_start_time
        n_leaf_nodes = sum(p.get_n_leaf_nodes() for p in self.predictors_)
        print(f"Fit {len(self.predictors_)} trees in {duration:.3f} s, "
              f"({n_leaf_nodes} total leaf nodes)")
        print('{:<32} {:.3f}s'.format('Time spent finding best splits:',
                                      acc_find_split_time))
        print('{:<32} {:.3f}s'.format('Time spent applying splits:',
                                      acc_apply_split_time))
        print('{:<32} {:.3f}s'.format('Time spent predicting:',
                                      acc_prediction_time))
    self.train_scores_ = np.asarray(self.train_scores_)
    if self.validation_split is not None:
        self.validation_scores_ = np.asarray(self.validation_scores_)
    return self
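# A minimal sketch of what the @njitted _update_y_pred helper likely looks
# like (assumed; the helper itself is not shown in this excerpt): add each
# finalized leaf's value to the running prediction of the samples it
# contains. This is why leaves_data is built as plain (value,
# sample_indices) tuples, which numba can iterate over in nopython mode.
from numba import njit

@njit
def _update_y_pred(leaves_data, y_pred):
    for leaf_value, sample_indices in leaves_data:
        for sample_idx in sample_indices:
            y_pred[sample_idx] += leaf_value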