示例#1
0
def test_pre_binned_data():
    """Check the input validation of predictors built with/without thresholds.

    A predictor created without numerical thresholds must reject ``predict``
    and must reject non-uint8 data in ``predict_binned``. A predictor created
    with thresholds must reject uint8 data in ``predict``.
    """
    X_raw, target = make_regression()

    bin_mapper = BinMapper(random_state=0)
    X_uint8 = bin_mapper.fit_transform(X_raw)

    # Init gradients and hessians to that of least squares loss
    ls_gradients = -target.astype(np.float32)
    ls_hessians = np.ones(1, dtype=np.float32)

    tree_grower = TreeGrower(
        X_uint8, ls_gradients, ls_hessians,
        n_bins_per_feature=bin_mapper.n_bins_per_feature_)
    tree_grower.grow()

    # Predictor without thresholds: only binned prediction is possible.
    binned_only = tree_grower.make_predictor(numerical_thresholds=None)
    assert_raises_regex(
        ValueError, 'This predictor does not have numerical thresholds',
        binned_only.predict, X_raw)
    assert_raises_regex(
        ValueError, 'binned_data dtype should be uint8',
        binned_only.predict_binned, X_raw)
    binned_only.predict_binned(X_uint8)  # No error

    # Predictor with thresholds: raw prediction must not receive binned data.
    with_thresholds = tree_grower.make_predictor(
        numerical_thresholds=bin_mapper.numerical_thresholds_)
    assert_raises_regex(
        ValueError, 'X has uint8 dtype', with_thresholds.predict, X_uint8)
示例#2
0
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
                          constant_hessian, noise):
    """Check that every leaf of a grown tree honours ``min_samples_leaf``.

    When there are fewer samples than ``min_samples_leaf`` the tree must
    consist of a single root leaf holding all the samples.
    """
    rng = np.random.RandomState(seed=0)
    # data = linear target, 3 features, 1 irrelevant.
    features = rng.normal(size=(n_samples, 3))
    target = features[:, 0] - features[:, 1]
    if noise:
        target_std = target.std()
        target += rng.normal(scale=noise, size=n_samples) * target_std
    bin_mapper = BinMapper(max_bins=n_bins)
    binned_features = bin_mapper.fit_transform(features)

    grads = target.astype(np.float32)
    # A constant hessian is passed as a single-element array.
    hess = (np.ones(shape=1, dtype=np.float32) if constant_hessian
            else np.ones_like(grads))

    tree_grower = TreeGrower(binned_features, grads, hess,
                             max_bins=n_bins,
                             shrinkage=1.,
                             min_samples_leaf=min_samples_leaf,
                             max_leaf_nodes=n_samples)
    tree_grower.grow()
    predictor = tree_grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_)

    if n_samples < min_samples_leaf:
        # No split is allowed: the root is the only node.
        assert predictor.nodes.shape[0] == 1
        assert predictor.nodes[0]['is_leaf']
        assert predictor.nodes[0]['count'] == n_samples
        return

    leaf_counts = [node['count'] for node in predictor.nodes
                   if node['is_leaf']]
    for count in leaf_counts:
        assert count >= min_samples_leaf
示例#3
0
def test_boston_dataset():
    """Grow one tree on the Boston data and check its R2 scores.

    Also checks that predicting from raw features gives the same values as
    predicting from the binned features.
    """
    dataset = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, random_state=42)

    bin_mapper = BinMapper(random_state=42)
    X_train_binned = bin_mapper.fit_transform(X_train)
    X_test_binned = bin_mapper.transform(X_test)

    tree_grower = TreeGrower(X_train_binned,
                             y_train.astype(np.float32),
                             np.ones(1, dtype=np.float32),
                             min_samples_leaf=8,
                             max_leaf_nodes=31)
    tree_grower.grow()
    predictor = tree_grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_)

    # Compute each prediction once and reuse it in the checks below.
    train_pred_binned = predictor.predict_binned(X_train_binned)
    test_pred_binned = predictor.predict_binned(X_test_binned)
    assert r2_score(y_train, train_pred_binned) > 0.75
    assert r2_score(y_test, test_pred_binned) > 0.65

    train_pred_raw = predictor.predict(X_train)
    assert_allclose(train_pred_raw, train_pred_binned)

    test_pred_raw = predictor.predict(X_test)
    assert_allclose(test_pred_raw, test_pred_binned)

    assert r2_score(y_train, train_pred_raw) > 0.75
    assert r2_score(y_test, test_pred_raw) > 0.65
示例#4
0
def test_boston_dataset(max_bins):
    """Grow one tree on binned Boston data and check its R2 scores.

    The check is run for the given ``max_bins``. Predictions from raw
    features must match predictions from the binned features.
    """
    dataset = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, random_state=42)

    bin_mapper = BinMapper(max_bins=max_bins, random_state=42)
    X_train_binned = bin_mapper.fit_transform(X_train)
    X_test_binned = bin_mapper.transform(X_test)

    # Init gradients and hessians to that of least squares loss
    ls_gradients = -y_train.astype(np.float32)
    ls_hessians = np.ones(1, dtype=np.float32)

    grower_options = dict(
        min_samples_leaf=8,
        max_leaf_nodes=31,
        max_bins=max_bins,
        n_bins_per_feature=bin_mapper.n_bins_per_feature_,
    )
    tree_grower = TreeGrower(X_train_binned, ls_gradients, ls_hessians,
                             **grower_options)
    tree_grower.grow()
    predictor = tree_grower.make_predictor(
        bin_thresholds=bin_mapper.bin_thresholds_)

    # Compute each prediction once and reuse it in the checks below.
    train_pred_binned = predictor.predict_binned(X_train_binned)
    test_pred_binned = predictor.predict_binned(X_test_binned)
    assert r2_score(y_train, train_pred_binned) > 0.85
    assert r2_score(y_test, test_pred_binned) > 0.70

    train_pred_raw = predictor.predict(X_train)
    assert_allclose(train_pred_raw, train_pred_binned)

    test_pred_raw = predictor.predict(X_test)
    assert_allclose(test_pred_raw, test_pred_binned)

    assert r2_score(y_train, train_pred_raw) > 0.85
    assert r2_score(y_test, test_pred_raw) > 0.70
示例#5
0
def test_predictor_from_grower():
    """Check that a grown tree converts into a working predictor.

    The tree is grown on the toy dataset returned by
    ``_make_training_data``. The test checks the node structure, probes a few
    hand-picked binned inputs, and verifies that the training gradients are
    recovered from the training features.
    """
    # Build a tree on the toy 3-leaf dataset to extract the predictor.
    n_bins = 256
    features_data, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins)
    grower = TreeGrower(features_data,
                        all_gradients,
                        all_hessians,
                        n_bins=n_bins,
                        shrinkage=1.,
                        max_leaf_nodes=3,
                        min_samples_leaf=5)
    grower.grow()
    assert grower.n_nodes == 5  # (2 decision nodes + 3 leaves)

    # Check that the node structure can be converted into a predictor
    # object to perform predictions at scale
    predictor = grower.make_predictor()
    assert predictor.nodes.shape[0] == 5
    assert predictor.nodes['is_leaf'].sum() == 3

    # Probe some predictions for each leaf of the tree
    input_data = np.array(
        [
            [0, 0],
            [42, 99],
            [128, 255],
            [129, 0],
            [129, 85],
            [255, 85],
            [129, 86],
            [129, 255],
            [242, 100],
        ],
        dtype=np.uint8,
    )
    predictions = predictor.predict_binned(input_data)
    expected_targets = [-1, -1, -1, -1, -1, -1, 1, 1, 1]
    assert_array_almost_equal(predictions, expected_targets, decimal=5)

    # Check that training set can be recovered exactly:
    predictions = predictor.predict_binned(features_data)
    assert_array_almost_equal(predictions, all_gradients, decimal=5)
示例#6
0
    def fit(self, X, y):
        """Fit the gradient boosting model.

        The input is binned with a ``BinMapper``, optionally split into a
        training and a validation part, and then up to ``max_iter`` boosting
        iterations are run. Each iteration updates the gradients and hessians
        in place and grows one tree per chunk of the gradient array.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``X`` has only one sample or only one feature, or if the
            validation split leaves the training or validation set empty.
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        # TODO: test input checking
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.'
            )
        rng = check_random_state(self.random_state)

        self._validate_parameters()
        self.n_features_ = X.shape[1]  # used for validation in predict()

        # Bin the raw features; the binning time is reported when verbose.
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            troughput = X.nbytes / duration
            print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        # A validation set is only held out when both a scorer and a
        # validation fraction are configured.
        if self.scoring is not None and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=stratify, random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.'
                )
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        # Training sets smaller than subsample_size are used in full; larger
        # ones are sampled through rng.choice.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(
                np.arange(X_binned_train.shape[0]), subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        # values predicted by the trees. Used as-is in regression, and
        # transformed into probas and / or classes for classification
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=y_train.dtype
        )
        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            n_trees_per_iteration=self.n_trees_per_iteration_
        )
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        scorer = check_scoring(self, self.scoring)
        self.train_scores_ = []
        if self.scoring is not None:
            # Add predictions of the initial model (before the first tree)
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._sign * scorer._score_func(y_train,
                                                            predicted_train)
            self.train_scores_.append(score_train)

            if self.validation_split is not None:
                self.validation_scores_ = []
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._sign * scorer._score_func(y_val,
                                                              predicted_val)
                self.validation_scores_.append(score_val)

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            # One inner list of predictors per boosting iteration.
            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                    np.array_split(gradients, self.n_trees_per_iteration_),
                    np.array_split(hessians, self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(
                    X_binned_train, gradients_at_k, hessians_at_k,
                    max_bins=self.max_bins,
                    n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]
                # Only column k of raw_predictions is updated by this tree.
                _update_raw_predictions(leaves_data, raw_predictions[:, k])
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            # Called on every iteration; the returned flag decides whether
            # the boosting loop stops after this iteration.
            should_stop = self._check_early_stopping(
                scorer, X_binned_small_train, y_small_train,
                X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time)

            if should_stop:
                break

        # Final summary: number of trees, total leaves and time breakdown.
        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        # Expose the recorded score histories as arrays.
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.scoring is not None and self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples. If ``X.dtype == np.uint8``, the data is
            assumed to be pre-binned and the prediction methods
            (``predict``, ``predict_proba``) will only accept pre-binned
            data as well.

        y : array-like, shape=(n_samples,) or (n_samples, n_outputs)
            Target values. When ``y`` holds more than one value per sample
            the model is fitted in multi-output mode.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``X`` has only one sample or only one feature, or if the
            validation split leaves the training or validation set empty.
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data

        # Inspect the target through an ndarray so that any array-like (for
        # instance a plain list) is accepted here; the object passed by the
        # caller is still the one validated by check_X_y below.
        y_array = np.asarray(y)
        self.multi_output = len(y_array.ravel()) != len(y_array)
        if self.multi_output:
            self.prediction_dim = y_array.shape[1]
        else:
            self.prediction_dim = 1
        X, y = check_X_y(X,
                         y,
                         dtype=[np.float32, np.float64, np.uint8],
                         multi_output=self.multi_output)
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.')
        rng = check_random_state(self.random_state)

        self._validate_parameters(X)
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if X.dtype == np.uint8:  # data is pre-binned
            if self.verbose:
                print("X is pre-binned.")
            X_binned = X
            self.bin_mapper_ = None
            numerical_thresholds = None
            # NOTE(review): this takes the largest bin index of each feature
            # as its bin count; confirm this matches the convention used by
            # BinMapper.n_bins_per_feature_ (possibly max index + 1).
            n_bins_per_feature = X.max(axis=0).astype(np.uint32)
        else:
            if self.verbose:
                print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                      end="",
                      flush=True)
            tic = time()
            self.bin_mapper_ = BinMapper(max_bins=self.max_bins,
                                         random_state=rng)
            X_binned = self.bin_mapper_.fit_transform(X)
            numerical_thresholds = self.bin_mapper_.numerical_thresholds_
            n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_
            toc = time()

            if self.verbose:
                duration = toc - tic
                throughput = X.nbytes / duration
                print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        do_early_stopping = (self.n_iter_no_change is not None
                             and self.n_iter_no_change > 0)

        if do_early_stopping and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=stratify,
                random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.')
            # Predicting is faster on C-contiguous arrays, training is faster
            # on Fortran arrays.
            X_binned_val = np.ascontiguousarray(X_binned_val)
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        if do_early_stopping:
            subsample_size = 10000
            n_samples_train = X_binned_train.shape[0]
            if n_samples_train > subsample_size:
                indices = rng.choice(X_binned_train.shape[0], subsample_size)
                X_binned_small_train = X_binned_train[indices]
                y_small_train = y_train[indices]
            else:
                X_binned_small_train = X_binned_train
                y_small_train = y_train
            # Predicting is faster on C-contiguous arrays.
            X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        self.baseline_prediction_ = self.loss_.get_baseline_prediction(
            y_train, self.prediction_dim)
        # raw_predictions are the accumulated values predicted by the trees
        # for the training data.
        raw_predictions = np.zeros(shape=(n_samples, self.prediction_dim),
                                   dtype=self.baseline_prediction_.dtype)
        if not self.multi_output:
            raw_predictions = raw_predictions.ravel()
        raw_predictions += self.baseline_prediction_

        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples, prediction_dim=self.prediction_dim)
        if not self.multi_output:
            gradients = gradients.ravel()
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        # scorer_ is a callable with signature (est, X, y) and calls
        # est.predict() or est.predict_proba() depending on its nature.
        self.scorer_ = check_scoring(self, self.scoring)
        self.train_scores_ = []
        self.validation_scores_ = []
        if do_early_stopping:
            # Add predictions of the initial model (before the first tree)
            self.train_scores_.append(self._get_scores(X_binned_train,
                                                       y_train))

            if self.validation_split is not None:
                self.validation_scores_.append(
                    self._get_scores(X_binned_val, y_val))

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ",
                      end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])
            # In multi-output mode the trees are grown on projected
            # gradients/hessians; otherwise on the flattened arrays.
            if self.multi_output:
                proj_gradients, proj_hessians = (
                    self.randomly_project_gradients_and_hessians(
                        gradients, hessians))
            else:
                proj_gradients = gradients.ravel()
                proj_hessians = hessians.ravel()

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(
                    zip(
                        np.array_split(proj_gradients,
                                       self.n_trees_per_iteration_),
                        np.array_split(proj_hessians,
                                       self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(X_binned_train,
                                    gradients_at_k,
                                    hessians_at_k,
                                    max_bins=self.max_bins,
                                    n_bins_per_feature=n_bins_per_feature,
                                    max_leaf_nodes=self.max_leaf_nodes,
                                    max_depth=self.max_depth,
                                    min_samples_leaf=self.min_samples_leaf,
                                    l2_regularization=self.l2_regularization,
                                    shrinkage=self.learning_rate)
                grower.grow()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                if self.multi_output:
                    # Each leaf gets a vector-valued update computed from the
                    # un-projected gradients of the samples it contains.
                    for leaf in grower.finalized_leaves:
                        summed_gradients = np.sum(
                            a=gradients[leaf.sample_indices, :], axis=0)
                        denominator = (leaf.sum_hessians
                                       + self.l2_regularization
                                       + np.finfo(np.float64).eps)
                        leaf.residual = (-self.learning_rate
                                         * summed_gradients / denominator)
                    leaves_data = [(leaf.residual, leaf.sample_indices)
                                   for leaf in grower.finalized_leaves]
                else:
                    leaves_data = [(leaf.value, leaf.sample_indices)
                                   for leaf in grower.finalized_leaves]

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(numerical_thresholds)
                predictors[-1].append(predictor)

                tic_pred = time()
                _update_raw_predictions(leaves_data, raw_predictions)
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_early_stop = False
            if do_early_stopping:
                should_early_stop = self._check_early_stopping(
                    X_binned_small_train, y_small_train, X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time,
                                            do_early_stopping)

            if should_early_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
示例#8
0
    def fit(self, X, y):
        """Fit the boosting model on ``(X, y)`` and return ``self``.

        The features are binned with a ``BinMapper``, an optional validation
        set is held out when ``validation_split`` is set, and trees are then
        grown one at a time on the current residuals ``y_train - y_pred``.
        The loop ends when ``_stopping_criterion`` returns a true value or
        when ``max_iter`` trees have been built.

        Parameters
        ----------
        X : array-like
            The input samples; validated with ``check_X_y`` as float32 or
            float64 data.

        y : array-like
            Target values; cast to float32.

        Returns
        -------
        self : object
        """
        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = y.astype(np.float32, copy=False)
        rng = check_random_state(self.random_state)
        # Bin the raw features; the binning time is reported when verbose.
        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ",
                  end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            troughput = X.nbytes / duration
            print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)")
        if self.validation_split is not None:
            # NOTE(review): the split is stratified on y, which has been cast
            # to float32 above; confirm that stratifying on these targets is
            # intended, since stratification presumably expects class labels.
            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=y,
                random_state=rng)
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        # Training sets smaller than subsample_size are used in full; larger
        # ones are sampled through rng.choice.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(np.arange(X_binned_train.shape[0]),
                                 subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")
        # TODO: plug custom loss functions
        # Predictions start at zero, so the first gradients equal y_train.
        y_pred = np.zeros_like(y_train, dtype=np.float32)
        gradients = np.asarray(y_train, dtype=np.float32).copy()
        hessians = np.ones(1, dtype=np.float32)
        self.predictors_ = predictors = []
        self.train_scores_ = []
        if self.validation_split is not None:
            self.validation_scores_ = []
        scorer = check_scoring(self, self.scoring)
        gb_start_time = time()
        # TODO: compute training loss and use it for early stopping if no
        # validation data is provided?
        self.n_iter_ = 0
        while True:
            # The stopping criterion is evaluated before each tree is grown,
            # including before the very first one.
            should_stop = self._stopping_criterion(gb_start_time, scorer,
                                                   X_binned_small_train,
                                                   y_small_train, X_binned_val,
                                                   y_val)
            if should_stop or self.n_iter_ == self.max_iter:
                break
            # The first tree is not shrunk; later trees use learning_rate.
            shrinkage = 1. if self.n_iter_ == 0 else self.learning_rate
            grower = TreeGrower(X_binned_train,
                                gradients,
                                hessians,
                                n_bins=self.max_bins,
                                max_leaf_nodes=self.max_leaf_nodes,
                                max_depth=self.max_depth,
                                min_samples_leaf=self.min_samples_leaf,
                                shrinkage=shrinkage)
            grower.grow()
            predictor = grower.make_predictor(
                bin_thresholds=self.bin_mapper_.bin_thresholds_)
            predictors.append(predictor)
            self.n_iter_ += 1
            tic_pred = time()
            # Update the training predictions from the leaves of the new tree,
            # then recompute the residuals used to grow the next tree.
            leaves_data = [(l.value, l.sample_indices)
                           for l in grower.finalized_leaves]
            _update_y_pred(leaves_data, y_pred)
            gradients = y_train - y_pred
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time
        # Final summary: number of trees, total leaves and time breakdown.
        if self.verbose:
            duration = time() - fit_start_time
            n_leaf_nodes = sum(p.get_n_leaf_nodes() for p in self.predictors_)
            print(f"Fit {len(self.predictors_)} trees in {duration:.3f} s, "
                  f"({n_leaf_nodes} total leaf nodes)")
            print('{:<32} {:.3f}s'.format('Time spent finding best splits:',
                                          acc_find_split_time))
            print('{:<32} {:.3f}s'.format('Time spent applying splits:',
                                          acc_apply_split_time))
            print('{:<32} {:.3f}s'.format('Time spent predicting:',
                                          acc_prediction_time))
        # Expose the recorded score histories as arrays.
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self