Example #1
def _preprocess(self, X, **kwargs):
    X = super()._preprocess(X, **kwargs)
    if self._feature_generator is None:
        self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
        self._feature_generator.fit(X=X)
    if self._feature_generator.features_in:
        X = X.copy()
        X[self._feature_generator.features_in] = self._feature_generator.transform(X=X)
    X = X.fillna(0).to_numpy(dtype=np.float32)
    return X
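
This _preprocess hook label-encodes any categorical columns, fills the remaining NaNs with 0, and hands scikit-learn a dense float32 matrix. A minimal standalone sketch of the same idea, using pandas category codes as a stand-in for LabelEncoderFeatureGenerator (an assumption for illustration; the real generator keeps fit state so that train and inference encodings stay aligned):

import numpy as np
import pandas as pd

X = pd.DataFrame({
    'num': [1.0, np.nan, 3.0],
    'cat': pd.Categorical(['a', 'b', None]),
})

# Encode category columns to their integer codes (missing values become -1)
cat_cols = X.select_dtypes(include='category').columns
X[cat_cols] = X[cat_cols].apply(lambda s: s.cat.codes)

# Fill the remaining NaNs and convert to a dense float32 matrix
X_np = X.fillna(0).to_numpy(dtype=np.float32)
print(X_np)  # [[ 1.  0.] [ 0.  1.] [ 3. -1.]]
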
Example #2
def test_label_encoder_feature_generator(generator_helper, data_helper):
    # Given
    input_data = data_helper.generate_multi_feature_standard()

    generator = LabelEncoderFeatureGenerator()

    expected_feature_metadata_in_full = {
        ('category', ()): ['cat'],
    }
    expected_feature_metadata_full = {
        ('int', ()): ['cat']
    }

    expected_output_data_cat_val = [0, 1, 0, 3, 3, 3, 2, -1, -1]

    # When
    output_data = generator_helper.fit_transform_assert(
        input_data=input_data,
        generator=generator,
        expected_feature_metadata_in_full=expected_feature_metadata_in_full,
        expected_feature_metadata_full=expected_feature_metadata_full,
    )

    # Then
    assert output_data['cat'].dtype == np.int8
    assert list(output_data['cat'].values) == expected_output_data_cat_val
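
The expected values in this test line up with pandas category codes: categories are stored in sorted order, each value maps to its category's ordinal, and missing values map to -1 with the smallest integer dtype that fits. Assuming the helper's 'cat' column holds values like the ones below (the helper itself is not shown here), the expectation can be reproduced directly:

import numpy as np
import pandas as pd

s = pd.Series(['a', 'b', 'a', 'd', 'd', 'd', 'c', None, np.nan], dtype='category')
codes = s.cat.codes
print(list(codes.values))  # [0, 1, 0, 3, 3, 3, 2, -1, -1]
print(codes.dtype)         # int8, since only 4 categories exist
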
Example #3
class RFModel(AbstractModel):
    """
    Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._feature_generator = None

    def _get_model_type(self):
        if self.problem_type in [REGRESSION, SOFTCLASS]:
            return RandomForestRegressor
        elif self.problem_type == QUANTILE:
            return RandomForestQuantileRegressor
        else:
            return RandomForestClassifier

    # TODO: X.fillna -inf? Add extra is_missing column?
    def _preprocess(self, X, **kwargs):
        X = super()._preprocess(X, **kwargs)
        if self._feature_generator is None:
            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
            self._feature_generator.fit(X=X)
        if self._feature_generator.features_in:
            X = X.copy()
            X[self._feature_generator.features_in] = self._feature_generator.transform(X=X)
        X = X.fillna(0).to_numpy(dtype=np.float32)
        return X

    def _set_default_params(self):
        default_params = {
            'n_estimators': 300,
            'n_jobs': -1,
            'random_state': 0,
        }
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    # TODO: Add in documentation that Categorical default is the first index
    # TODO: enable HPO for RF models
    def _get_default_searchspace(self):
        spaces = {
            # 'n_estimators': Int(lower=10, upper=1000, default=300),
            # 'max_features': Categorical(['auto', 0.5, 0.25]),
            # 'criterion': Categorical(['gini', 'entropy']),
        }
        return spaces

    def _fit(self, X, y, time_limit=None, sample_weight=None, **kwargs):
        time_start = time.time()
        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        hyperparams = self.params.copy()
        n_estimators_final = hyperparams['n_estimators']

        n_estimators_minimum = min(40, n_estimators_final)
        n_estimators_test = min(4, max(1, math.floor(n_estimators_minimum / 5)))

        X = self.preprocess(X)
        n_estimator_increments = [n_estimators_final]

        # Very rough guess to size of a single tree before training
        if self.problem_type in [MULTICLASS, SOFTCLASS]:
            if self.num_classes is None:
                num_trees_per_estimator = 10  # Guess since it wasn't passed in, could also check y for a better value
            else:
                num_trees_per_estimator = self.num_classes
        else:
            num_trees_per_estimator = 1
        bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6  # Underestimates by 3x on ExtraTrees
        available_mem = psutil.virtual_memory().available
        expected_memory_usage = bytes_per_estimator * n_estimators_final / available_mem
        expected_min_memory_usage = bytes_per_estimator * n_estimators_minimum / available_mem
        if expected_min_memory_usage > (0.5 * max_memory_usage_ratio):  # if minimum estimated size is greater than 50% memory
            logger.warning(f'\tWarning: Model is expected to require {round(expected_min_memory_usage * 100, 2)}% of available memory (Estimated before training)...')
            raise NotEnoughMemoryError

        if n_estimators_final > n_estimators_test * 2:
            if self.problem_type == MULTICLASS:
                n_estimator_increments = [n_estimators_test, n_estimators_final]
                hyperparams['warm_start'] = True
            else:
                if expected_memory_usage > (0.05 * max_memory_usage_ratio):  # Somewhat arbitrary, consider finding a better value, should it scale by cores?
                    # Causes ~10% training slowdown, so try to avoid if memory is not an issue
                    n_estimator_increments = [n_estimators_test, n_estimators_final]
                    hyperparams['warm_start'] = True

        hyperparams['n_estimators'] = n_estimator_increments[0]
        self.model = self._get_model_type()(**hyperparams)

        time_train_start = time.time()
        for i, n_estimators in enumerate(n_estimator_increments):
            if i != 0:
                self.model.n_estimators = n_estimators
            self.model = self.model.fit(X, y, sample_weight=sample_weight)
            if (i == 0) and (len(n_estimator_increments) > 1):
                time_elapsed = time.time() - time_train_start
                model_size_bytes = 0
                for estimator in self.model.estimators_:  # Uses far less memory than pickling the entire forest at once
                    model_size_bytes += sys.getsizeof(pickle.dumps(estimator))
                expected_final_model_size_bytes = model_size_bytes * (n_estimators_final / self.model.n_estimators)
                available_mem = psutil.virtual_memory().available
                model_memory_ratio = expected_final_model_size_bytes / available_mem

                ideal_memory_ratio = 0.15 * max_memory_usage_ratio
                n_estimators_ideal = min(n_estimators_final, math.floor(ideal_memory_ratio / model_memory_ratio * n_estimators_final))

                if n_estimators_final > n_estimators_ideal:
                    if n_estimators_ideal < n_estimators_minimum:
                        logger.warning(f'\tWarning: Model is expected to require {round(model_memory_ratio*100, 2)}% of available memory...')
                        raise NotEnoughMemoryError  # don't train full model to avoid OOM error
                    logger.warning(f'\tWarning: Reducing model \'n_estimators\' from {n_estimators_final} -> {n_estimators_ideal} due to low memory. Expected memory usage reduced from {round(model_memory_ratio*100, 2)}% -> {round(ideal_memory_ratio*100, 2)}% of available memory...')

                if time_limit is not None:
                    time_expected = time_train_start - time_start + (time_elapsed * n_estimators_ideal / n_estimators)
                    n_estimators_time = math.floor((time_limit - time_train_start + time_start) * n_estimators / time_elapsed)
                    if n_estimators_time < n_estimators_ideal:
                        if n_estimators_time < n_estimators_minimum:
                            logger.warning(f'\tWarning: Model is expected to require {round(time_expected, 1)}s to train, which exceeds the maximum time limit of {round(time_limit, 1)}s, skipping model...')
                            raise TimeLimitExceeded
                        logger.warning(f'\tWarning: Reducing model \'n_estimators\' from {n_estimators_ideal} -> {n_estimators_time} due to low time. Expected time usage reduced from {round(time_expected, 1)}s -> {round(time_limit, 1)}s...')
                        n_estimators_ideal = n_estimators_time

                for j in range(len(n_estimator_increments)):
                    if n_estimator_increments[j] > n_estimators_ideal:
                        n_estimator_increments[j] = n_estimators_ideal

        self.params_trained['n_estimators'] = self.model.n_estimators

    # TODO: Remove this after simplifying _predict_proba to reduce code duplication. This is only present for SOFTCLASS support.
    def _predict_proba(self, X, **kwargs):
        X = self.preprocess(X, **kwargs)

        if self.problem_type == REGRESSION:
            return self.model.predict(X)
        elif self.problem_type == SOFTCLASS:
            return self.model.predict(X)
        elif self.problem_type == QUANTILE:
            return self.model.predict(X, quantile_levels=self.quantile_levels)

        y_pred_proba = self.model.predict_proba(X)
        return self._convert_proba_to_unified_form(y_pred_proba)

    # TODO: Add HPO
    def _hyperparameter_tune(self, **kwargs):
        return skip_hpo(self, **kwargs)

    def get_model_feature_importance(self):
        if self.features is None:
            # TODO: Consider making this raise an exception
            logger.warning(
                'Warning: get_model_feature_importance called when self.features is None!'
            )
            return dict()
        return dict(zip(self.features, self.model.feature_importances_))

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(ignored_type_group_raw=[R_OBJECT])
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params
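
The pre-fit memory guard in _fit can be read in isolation: it prices the forest at a rough bytes-per-tree estimate (scaled by row count and, for multiclass, by class count) and aborts before training if even the minimum-size forest would exceed half of available memory. A hedged standalone sketch with the same constants (the source's own comment warns this underestimates ExtraTrees by ~3x):

import psutil

def expected_min_memory_ratio(n_rows, num_classes, n_estimators_final):
    # The snippet sizes multiclass estimators as one tree per class, else one tree
    num_trees_per_estimator = num_classes if (num_classes is not None and num_classes > 2) else 1
    bytes_per_estimator = num_trees_per_estimator * n_rows / 60000 * 1e6  # same rough guess as above
    n_estimators_minimum = min(40, n_estimators_final)
    available_mem = psutil.virtual_memory().available
    return bytes_per_estimator * n_estimators_minimum / available_mem

ratio = expected_min_memory_ratio(n_rows=1_000_000, num_classes=10, n_estimators_final=300)
if ratio > 0.5:  # mirrors 0.5 * max_memory_usage_ratio with the default ratio of 1.0
    print(f'would raise NotEnoughMemoryError ({ratio:.2%} of available memory)')
else:
    print(f'ok to fit ({ratio:.2%} of available memory)')
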
Example #4
class RFModel(AbstractModel):
    """
    Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._feature_generator = None
        self._daal = False  # Whether daal4py backend is being used

    def _get_model_type(self):
        if self.problem_type == QUANTILE:
            from .rf_quantile import RandomForestQuantileRegressor
            return RandomForestQuantileRegressor
        if self.params_aux.get('use_daal', False):
            # Disabled by default because it appears to degrade performance
            try:
                # TODO: Use sklearnex instead once a suitable toggle option is provided that won't impact future models
                # FIXME: DAAL OOB score is broken, returns biased predictions. Without this optimization, can't compute Efficient OOF.
                from daal4py.sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
                logger.log(15, '\tUsing daal4py RF backend...')
                self._daal = True
            except:
                from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
                self._daal = False
        else:
            from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
            self._daal = False
        if self.problem_type in [REGRESSION, SOFTCLASS]:
            return RandomForestRegressor
        else:
            return RandomForestClassifier

    # TODO: X.fillna -inf? Add extra is_missing column?
    def _preprocess(self, X, **kwargs):
        X = super()._preprocess(X, **kwargs)
        if self._feature_generator is None:
            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
            self._feature_generator.fit(X=X)
        if self._feature_generator.features_in:
            X = X.copy()
            X[self._feature_generator.features_in] = self._feature_generator.transform(X=X)
        X = X.fillna(0).to_numpy(dtype=np.float32)
        return X

    def _set_default_params(self):
        default_params = {
            'n_estimators': 300,
            'n_jobs': -1,
            'random_state': 0,
            'bootstrap': True,  # Required for OOB estimates, setting to False will raise exception if bagging.
            # 'oob_score': True,  # Disabled by default as it is better to do it post-fit via custom logic.
        }
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    # TODO: Add in documentation that Categorical default is the first index
    # TODO: enable HPO for RF models
    def _get_default_searchspace(self):
        spaces = {
            # 'n_estimators': Int(lower=10, upper=1000, default=300),
            # 'max_features': Categorical(['auto', 0.5, 0.25]),
            # 'criterion': Categorical(['gini', 'entropy']),
        }
        return spaces

    def _fit(self, X, y, time_limit=None, sample_weight=None, **kwargs):
        time_start = time.time()

        model_cls = self._get_model_type()

        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        params = self._get_model_params()
        n_estimators_final = params['n_estimators']

        n_estimators_minimum = min(40, n_estimators_final)
        n_estimators_test = min(4, max(1, math.floor(n_estimators_minimum / 5)))

        X = self.preprocess(X)
        n_estimator_increments = [n_estimators_final]

        # Very rough guess to size of a single tree before training
        if self.problem_type in [MULTICLASS, SOFTCLASS]:
            if self.num_classes is None:
                num_trees_per_estimator = 10  # Guess since it wasn't passed in, could also check y for a better value
            else:
                num_trees_per_estimator = self.num_classes
        else:
            num_trees_per_estimator = 1
        bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6  # Underestimates by 3x on ExtraTrees
        available_mem = psutil.virtual_memory().available
        expected_memory_usage = bytes_per_estimator * n_estimators_final / available_mem
        expected_min_memory_usage = bytes_per_estimator * n_estimators_minimum / available_mem
        if expected_min_memory_usage > (0.5 * max_memory_usage_ratio):  # if minimum estimated size is greater than 50% memory
            logger.warning(f'\tWarning: Model is expected to require {round(expected_min_memory_usage * 100, 2)}% of available memory (Estimated before training)...')
            raise NotEnoughMemoryError

        if n_estimators_final > n_estimators_test * 2:
            if self.problem_type == MULTICLASS:
                n_estimator_increments = [n_estimators_test, n_estimators_final]
                params['warm_start'] = True
            else:
                if expected_memory_usage > (0.05 * max_memory_usage_ratio):  # Somewhat arbitrary, consider finding a better value, should it scale by cores?
                    # Causes ~10% training slowdown, so try to avoid if memory is not an issue
                    n_estimator_increments = [n_estimators_test, n_estimators_final]
                    params['warm_start'] = True

        params['n_estimators'] = n_estimator_increments[0]
        if self._daal:
            if params.get('warm_start', False):
                params['warm_start'] = False

        model = model_cls(**params)

        time_train_start = time.time()
        for i, n_estimators in enumerate(n_estimator_increments):
            if i != 0:
                if params.get('warm_start', False):
                    model.n_estimators = n_estimators
                else:
                    params['n_estimators'] = n_estimators
                    model = model_cls(**params)
            model = model.fit(X, y, sample_weight=sample_weight)
            if (i == 0) and (len(n_estimator_increments) > 1):
                time_elapsed = time.time() - time_train_start
                model_size_bytes = 0
                for estimator in model.estimators_:  # Uses far less memory than pickling the entire forest at once
                    model_size_bytes += sys.getsizeof(pickle.dumps(estimator))
                expected_final_model_size_bytes = model_size_bytes * (n_estimators_final / model.n_estimators)
                available_mem = psutil.virtual_memory().available
                model_memory_ratio = expected_final_model_size_bytes / available_mem

                ideal_memory_ratio = 0.15 * max_memory_usage_ratio
                n_estimators_ideal = min(n_estimators_final, math.floor(ideal_memory_ratio / model_memory_ratio * n_estimators_final))

                if n_estimators_final > n_estimators_ideal:
                    if n_estimators_ideal < n_estimators_minimum:
                        logger.warning(f'\tWarning: Model is expected to require {round(model_memory_ratio*100, 2)}% of available memory...')
                        raise NotEnoughMemoryError  # don't train full model to avoid OOM error
                    logger.warning(f'\tWarning: Reducing model \'n_estimators\' from {n_estimators_final} -> {n_estimators_ideal} due to low memory. Expected memory usage reduced from {round(model_memory_ratio*100, 2)}% -> {round(ideal_memory_ratio*100, 2)}% of available memory...')

                if time_limit is not None:
                    time_expected = time_train_start - time_start + (time_elapsed * n_estimators_ideal / n_estimators)
                    n_estimators_time = math.floor((time_limit - time_train_start + time_start) * n_estimators / time_elapsed)
                    if n_estimators_time < n_estimators_ideal:
                        if n_estimators_time < n_estimators_minimum:
                            logger.warning(f'\tWarning: Model is expected to require {round(time_expected, 1)}s to train, which exceeds the maximum time limit of {round(time_limit, 1)}s, skipping model...')
                            raise TimeLimitExceeded
                        logger.warning(f'\tWarning: Reducing model \'n_estimators\' from {n_estimators_ideal} -> {n_estimators_time} due to low time. Expected time usage reduced from {round(time_expected, 1)}s -> {round(time_limit, 1)}s...')
                        n_estimators_ideal = n_estimators_time

                for j in range(len(n_estimator_increments)):
                    if n_estimator_increments[j] > n_estimators_ideal:
                        n_estimator_increments[j] = n_estimators_ideal

        self.model = model
        self.params_trained['n_estimators'] = self.model.n_estimators

    # TODO: Remove this after simplifying _predict_proba to reduce code duplication. This is only present for SOFTCLASS support.
    def _predict_proba(self, X, **kwargs):
        X = self.preprocess(X, **kwargs)

        if self.problem_type == REGRESSION:
            return self.model.predict(X)
        elif self.problem_type == SOFTCLASS:
            return self.model.predict(X)
        elif self.problem_type == QUANTILE:
            return self.model.predict(X, quantile_levels=self.quantile_levels)

        y_pred_proba = self.model.predict_proba(X)
        return self._convert_proba_to_unified_form(y_pred_proba)

    def get_oof_pred_proba(self, X, normalize=None, **kwargs):
        """X should be the same X passed to `.fit`"""
        y_oof_pred_proba = self._get_oof_pred_proba(X=X, **kwargs)
        if normalize is None:
            normalize = self.normalize_pred_probas
        if normalize:
            y_oof_pred_proba = normalize_pred_probas(y_oof_pred_proba, self.problem_type)
        y_oof_pred_proba = y_oof_pred_proba.astype(np.float32)
        return y_oof_pred_proba

    # FIXME: Unknown if this works with quantile regression
    def _get_oof_pred_proba(self, X, y, **kwargs):
        if self._daal:
            raise AssertionError('DAAL forest backend does not support out-of-bag predictions.')
        if not self.model.bootstrap:
            raise ValueError('Forest models must set `bootstrap=True` to compute out-of-fold predictions via out-of-bag predictions.')

        # TODO: This can also be done via setting `oob_score=True` in model params,
        #  but getting the correct `pred_time_val` that way is not easy, since we can't time the internal call.
        if (getattr(self.model, "oob_decision_function_", None) is None and getattr(self.model, "oob_prediction_", None) is None) \
                and callable(getattr(self.model, "_set_oob_score", None)):
            X = self.preprocess(X)

            if getattr(self.model, "n_classes_", None) is not None:
                if self.model.n_outputs_ == 1:
                    self.model.n_classes_ = [self.model.n_classes_]
            from sklearn.tree._tree import DTYPE, DOUBLE
            X, y = self.model._validate_data(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
            if y.ndim == 1:
                # reshape is necessary to preserve the data contiguity,
                # which [:, np.newaxis] would not.
                y = np.reshape(y, (-1, 1))
            if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
                y = np.ascontiguousarray(y, dtype=DOUBLE)
            self.model._set_oob_score(X, y)
            if getattr(self.model, "n_classes_", None) is not None:
                if self.model.n_outputs_ == 1:
                    self.model.n_classes_ = self.model.n_classes_[0]

        if getattr(self.model, "oob_decision_function_", None) is not None:
            y_oof_pred_proba = self.model.oob_decision_function_
            self.model.oob_decision_function_ = None  # save memory
        elif getattr(self.model, "oob_prediction_", None) is not None:
            y_oof_pred_proba = self.model.oob_prediction_
            self.model.oob_prediction_ = None  # save memory
        else:
            raise AssertionError(f'Model class {type(self.model)} does not support out-of-fold prediction generation.')

        # TODO: Regression does not return NaN for missing rows, instead it sets them to 0. This makes life hard.
        #  The below code corrects the missing rows to NaN instead of 0.
        # Don't bother if >60 trees, near impossible to have missing
        # If using 68% of data for training, chance of missing for each row is 1 in 11 billion.
        if self.problem_type == REGRESSION and self.model.n_estimators <= 60:
            from sklearn.ensemble._forest import _get_n_samples_bootstrap, _generate_unsampled_indices
            n_samples = len(y)

            n_predictions = np.zeros(n_samples)
            n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.model.max_samples)
            for estimator in self.model.estimators_:
                unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)
                n_predictions[unsampled_indices] += 1
            missing_row_mask = n_predictions == 0
            y_oof_pred_proba[missing_row_mask] = np.nan

        # fill missing prediction rows with average of non-missing rows
        if np.isnan(np.sum(y_oof_pred_proba)):
            if len(y_oof_pred_proba.shape) == 1:
                col_mean = np.nanmean(y_oof_pred_proba)
                y_oof_pred_proba[np.isnan(y_oof_pred_proba)] = col_mean
            else:
                col_mean = np.nanmean(y_oof_pred_proba, axis=0)
                inds = np.where(np.isnan(y_oof_pred_proba))
                y_oof_pred_proba[inds] = np.take(col_mean, inds[1])

        return self._convert_proba_to_unified_form(y_oof_pred_proba)

    # TODO: Add HPO
    def _hyperparameter_tune(self, **kwargs):
        return skip_hpo(self, **kwargs)

    def get_model_feature_importance(self):
        if self.features is None:
            # TODO: Consider making this raise an exception
            logger.warning(
                'Warning: get_model_feature_importance called when self.features is None!'
            )
            return dict()
        return dict(zip(self.features, self.model.feature_importances_))

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(ignored_type_group_raw=[R_OBJECT])
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args_ensemble(cls) -> dict:
        default_ag_args_ensemble = super()._get_default_ag_args_ensemble()
        extra_ag_args_ensemble = {'use_child_oof': False}
        default_ag_args_ensemble.update(extra_ag_args_ensemble)
        return default_ag_args_ensemble

    def _more_tags(self):
        return {'valid_oof': True}
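
The core trick in _fit is the probe-then-grow schedule: when memory pressure is plausible, it first fits a tiny forest under warm_start=True, extrapolates training time and model size from that probe, shrinks n_estimators if needed, and then grows the same forest rather than refitting from scratch. A minimal sketch of that mechanism with plain scikit-learn on synthetic data (the budgeting math is elided):

import time
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, random_state=0)

model = RandomForestClassifier(n_estimators=4, warm_start=True, random_state=0)
t0 = time.time()
model.fit(X, y)                  # cheap probe: only 4 trees
probe_time = time.time() - t0    # extrapolate from this to budget the full forest

model.n_estimators = 300         # grow the existing forest instead of refitting
model.fit(X, y)                  # adds the remaining 296 trees
print(len(model.estimators_), f'(probe took {probe_time:.3f}s)')
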
Example #5
class RFModel(AbstractModel):
    """
    Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._feature_generator = None
        self._daal = False  # Whether daal4py backend is being used

    def _get_model_type(self):
        if self.problem_type == QUANTILE:
            from .rf_quantile import RandomForestQuantileRegressor
            return RandomForestQuantileRegressor
        if self.params_aux.get('use_daal', False):
            # Disabled by default because OOB score does not yet work properly
            try:
                # FIXME: sklearnex OOB score is broken, returns biased predictions. Without this optimization, can't compute Efficient OOF.
                #  Refer to https://github.com/intel/scikit-learn-intelex/issues/933
                #  Current workaround: Forcibly set oob_score=True during fit to compute OOB during train time.
                #  Downsides:
                #    1. Slows down training slightly by forcing computation of OOB even if OOB is not needed (such as in medium_quality)
                #    2. Makes computing the correct pred_time_val difficult, as the time is instead added to the fit_time,
                #       and we would need to waste extra time to compute the proper pred_time_val post-fit.
                #       Therefore with sklearnex enabled, pred_time_val is incorrect.
                from sklearnex.ensemble import RandomForestClassifier, RandomForestRegressor
                logger.log(15, '\tUsing sklearnex RF backend...')
                self._daal = True
            except:
                from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
                self._daal = False
        else:
            from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
            self._daal = False
        if self.problem_type in [REGRESSION, SOFTCLASS]:
            return RandomForestRegressor
        else:
            return RandomForestClassifier

    # TODO: X.fillna -inf? Add extra is_missing column?
    def _preprocess(self, X, **kwargs):
        X = super()._preprocess(X, **kwargs)
        if self._feature_generator is None:
            self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
            self._feature_generator.fit(X=X)
        if self._feature_generator.features_in:
            X = X.copy()
            X[self._feature_generator.features_in] = self._feature_generator.transform(X=X)
        X = X.fillna(0).to_numpy(dtype=np.float32)
        return X

    def _set_default_params(self):
        default_params = {
            # TODO: 600 is much better, but increases info leakage in stacking -> therefore 300 is ~equal in stack ensemble final quality.
            #  Consider adding targeted noise to OOF to avoid info leakage, or increase `min_samples_leaf`.
            'n_estimators': 300,
            # Cap leaf nodes to 15000 to avoid large datasets using unreasonable amounts of memory/disk for RF/XT.
            #  Ensures that memory and disk usage of RF model with 300 n_estimators is at most ~500 MB for binary/regression, ~200 MB per class for multiclass.
            #  This has no effect on datasets with <=15000 rows, and minimal to no impact on datasets with <50000 rows.
            #  For large datasets, will often make the model worse, but will significantly speed up inference speed and massively reduce memory and disk usage.
            #  For example, when left uncapped, RF can use 5 GB of disk for a regression dataset with 2M rows.
            #  Multiply by the 8 RF/XT models in config for best quality / high quality and this is 40 GB of tree models, which is unreasonable.
            #  This size scales linearly with number of rows.
            'max_leaf_nodes': 15000,
            'n_jobs': -1,
            'random_state': 0,
            'bootstrap': True,  # Required for OOB estimates, setting to False will raise exception if bagging.
            # TODO: min_samples_leaf=5 is too large on most problems, however on some datasets it helps a lot (airlines likes >40 min_samples_leaf, adult likes 2 much better than 1)
            #  This value would need to be tuned per dataset, likely very worthwhile.
            #  Higher values = less OOF info leak, default = 1, which maximizes info leak.
            # 'min_samples_leaf': 5,  # Significantly reduces info leakage to stacker models. Never use the default/1 when using as base model.
            # 'oob_score': True,  # Disabled by default as it is better to do it post-fit via custom logic.
        }
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    # TODO: Add in documentation that Categorical default is the first index
    # TODO: enable HPO for RF models
    def _get_default_searchspace(self):
        spaces = {
            # 'n_estimators': Int(lower=10, upper=1000, default=300),
            # 'max_features': Categorical(['auto', 0.5, 0.25]),
            # 'criterion': Categorical(['gini', 'entropy']),
        }
        return spaces

    def _get_num_trees_per_estimator(self):
        # Very rough guess to size of a single tree before training
        if self.problem_type in [MULTICLASS, SOFTCLASS]:
            if self.num_classes is None:
                num_trees_per_estimator = 10  # Guess since it wasn't passed in, could also check y for a better value
            else:
                num_trees_per_estimator = self.num_classes
        else:
            num_trees_per_estimator = 1
        return num_trees_per_estimator

    def _estimate_memory_usage(self, X, **kwargs):
        params = self._get_model_params()
        n_estimators_final = params['n_estimators']
        n_estimators_minimum = min(40, n_estimators_final)
        num_trees_per_estimator = self._get_num_trees_per_estimator()
        bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6  # Underestimates by 3x on ExtraTrees
        expected_min_memory_usage = bytes_per_estimator * n_estimators_minimum
        return expected_min_memory_usage

    def _validate_fit_memory_usage(self, **kwargs):
        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        available_mem = psutil.virtual_memory().available
        expected_min_memory_usage = self.estimate_memory_usage(**kwargs) / available_mem
        if expected_min_memory_usage > (0.5 * max_memory_usage_ratio):  # if minimum estimated size is greater than 50% memory
            logger.warning(f'\tWarning: Model is expected to require {round(expected_min_memory_usage * 100, 2)}% of available memory (Estimated before training)...')
            raise NotEnoughMemoryError

    def _fit(self,
             X,
             y,
             num_cpus=-1,
             time_limit=None,
             sample_weight=None,
             **kwargs):
        time_start = time.time()

        model_cls = self._get_model_type()

        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        params = self._get_model_params()
        if 'n_jobs' not in params:
            params['n_jobs'] = num_cpus
        n_estimators_final = params['n_estimators']

        n_estimators_minimum = min(40, n_estimators_final)
        n_estimators_test = min(4, max(1, math.floor(n_estimators_minimum/5)))

        X = self.preprocess(X)
        n_estimator_increments = [n_estimators_final]

        num_trees_per_estimator = self._get_num_trees_per_estimator()
        bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6  # Underestimates by 3x on ExtraTrees
        available_mem = psutil.virtual_memory().available
        expected_memory_usage = n_estimators_final * bytes_per_estimator / available_mem
        
        if n_estimators_final > n_estimators_test * 2:
            if self.problem_type == MULTICLASS:
                n_estimator_increments = [n_estimators_test, n_estimators_final]
                params['warm_start'] = True
            else:
                if expected_memory_usage > (0.05 * max_memory_usage_ratio):  # Somewhat arbitrary, consider finding a better value, should it scale by cores?
                    # Causes ~10% training slowdown, so try to avoid if memory is not an issue
                    n_estimator_increments = [n_estimators_test, n_estimators_final]
                    params['warm_start'] = True

        params['n_estimators'] = n_estimator_increments[0]
        if self._daal:
            if params.get('warm_start', False):
                params['warm_start'] = False
            # FIXME: This is inefficient, but sklearnex doesn't support computing oob_score after training
            params['oob_score'] = True

        model = model_cls(**params)

        time_train_start = time.time()
        for i, n_estimators in enumerate(n_estimator_increments):
            if i != 0:
                if params.get('warm_start', False):
                    model.n_estimators = n_estimators
                else:
                    params['n_estimators'] = n_estimators
                    model = model_cls(**params)
            model = model.fit(X, y, sample_weight=sample_weight)
            if (i == 0) and (len(n_estimator_increments) > 1):
                time_elapsed = time.time() - time_train_start
                model_size_bytes = 0
                for estimator in model.estimators_:  # Uses far less memory than pickling the entire forest at once
                    model_size_bytes += sys.getsizeof(pickle.dumps(estimator))
                expected_final_model_size_bytes = model_size_bytes * (n_estimators_final / model.n_estimators)
                available_mem = psutil.virtual_memory().available
                model_memory_ratio = expected_final_model_size_bytes / available_mem

                ideal_memory_ratio = 0.15 * max_memory_usage_ratio
                n_estimators_ideal = min(n_estimators_final, math.floor(ideal_memory_ratio / model_memory_ratio * n_estimators_final))

                if n_estimators_final > n_estimators_ideal:
                    if n_estimators_ideal < n_estimators_minimum:
                        logger.warning(f'\tWarning: Model is expected to require {round(model_memory_ratio*100, 2)}% of available memory...')
                        raise NotEnoughMemoryError  # don't train full model to avoid OOM error
                    logger.warning(f'\tWarning: Reducing model \'n_estimators\' from {n_estimators_final} -> {n_estimators_ideal} due to low memory. Expected memory usage reduced from {round(model_memory_ratio*100, 2)}% -> {round(ideal_memory_ratio*100, 2)}% of available memory...')

                if time_limit is not None:
                    time_expected = time_train_start - time_start + (time_elapsed * n_estimators_ideal / n_estimators)
                    n_estimators_time = math.floor((time_limit - time_train_start + time_start) * n_estimators / time_elapsed)
                    if n_estimators_time < n_estimators_ideal:
                        if n_estimators_time < n_estimators_minimum:
                            logger.warning(f'\tWarning: Model is expected to require {round(time_expected, 1)}s to train, which exceeds the maximum time limit of {round(time_limit, 1)}s, skipping model...')
                            raise TimeLimitExceeded
                        logger.warning(f'\tWarning: Reducing model \'n_estimators\' from {n_estimators_ideal} -> {n_estimators_time} due to low time. Expected time usage reduced from {round(time_expected, 1)}s -> {round(time_limit, 1)}s...')
                        n_estimators_ideal = n_estimators_time

                for j in range(len(n_estimator_increments)):
                    if n_estimator_increments[j] > n_estimators_ideal:
                        n_estimator_increments[j] = n_estimators_ideal
        if self._daal and model.criterion != 'entropy':
            # TODO: criterion='entropy' is not accelerated by sklearnex; keep estimators_ for it to avoid a crash.
            # Dropping estimators_ reduces memory / disk usage.
            model.estimators_ = None
        self.model = model
        self.params_trained['n_estimators'] = self.model.n_estimators

    # TODO: Remove this after simplifying _predict_proba to reduce code duplication. This is only present for SOFTCLASS support.
    def _predict_proba(self, X, **kwargs):
        X = self.preprocess(X, **kwargs)

        if self.problem_type == REGRESSION:
            return self.model.predict(X)
        elif self.problem_type == SOFTCLASS:
            return self.model.predict(X)
        elif self.problem_type == QUANTILE:
            return self.model.predict(X, quantile_levels=self.quantile_levels)

        y_pred_proba = self.model.predict_proba(X)
        return self._convert_proba_to_unified_form(y_pred_proba)

    def get_oof_pred_proba(self, X, normalize=None, **kwargs):
        """X should be the same X passed to `.fit`"""
        y_oof_pred_proba = self._get_oof_pred_proba(X=X, **kwargs)
        if normalize is None:
            normalize = self.normalize_pred_probas
        if normalize:
            y_oof_pred_proba = normalize_pred_probas(y_oof_pred_proba, self.problem_type)
        y_oof_pred_proba = y_oof_pred_proba.astype(np.float32)
        return y_oof_pred_proba

    def _is_sklearn_1(self) -> bool:
        """Returns True if the trained model is from sklearn>=1.0"""
        return callable(getattr(self.model, "_set_oob_score_and_attributes", None))

    def _model_supports_oob_pred_proba(self) -> bool:
        """Returns True if model supports computing out-of-bag prediction probabilities"""
        # TODO: Remove `_set_oob_score` after sklearn version requirement is >=1.0
        return callable(getattr(self.model, "_set_oob_score", None)) or self._is_sklearn_1()

    # FIXME: Unknown if this works with quantile regression
    def _get_oof_pred_proba(self, X, y, **kwargs):
        if not self.model.bootstrap:
            raise ValueError('Forest models must set `bootstrap=True` to compute out-of-fold predictions via out-of-bag predictions.')

        oob_is_not_set = getattr(self.model, "oob_decision_function_", None) is None and getattr(self.model, "oob_prediction_", None) is None

        if oob_is_not_set and self._daal:
            raise AssertionError('DAAL forest backend does not support out-of-bag predictions.')

        # TODO: This can also be done via setting `oob_score=True` in model params,
        #  but getting the correct `pred_time_val` that way is not easy, since we can't time the internal call.
        if oob_is_not_set and self._model_supports_oob_pred_proba():
            X = self.preprocess(X)

            if getattr(self.model, "n_classes_", None) is not None:
                if self.model.n_outputs_ == 1:
                    self.model.n_classes_ = [self.model.n_classes_]
            from sklearn.tree._tree import DTYPE, DOUBLE
            X, y = self.model._validate_data(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
            if y.ndim == 1:
                # reshape is necessary to preserve the data contiguity,
                # which [:, np.newaxis] would not.
                y = np.reshape(y, (-1, 1))
            if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
                y = np.ascontiguousarray(y, dtype=DOUBLE)
            if self._is_sklearn_1():
                # sklearn >= 1.0
                # TODO: Can instead do `_compute_oob_predictions` but requires post-processing. Skips scoring func.
                self.model._set_oob_score_and_attributes(X, y)
            else:
                # sklearn < 1.0
                # TODO: Remove once sklearn < 1.0 support is dropped
                self.model._set_oob_score(X, y)
            if getattr(self.model, "n_classes_", None) is not None:
                if self.model.n_outputs_ == 1:
                    self.model.n_classes_ = self.model.n_classes_[0]

        if getattr(self.model, "oob_decision_function_", None) is not None:
            y_oof_pred_proba = self.model.oob_decision_function_
            self.model.oob_decision_function_ = None  # save memory
        elif getattr(self.model, "oob_prediction_", None) is not None:
            y_oof_pred_proba = self.model.oob_prediction_
            self.model.oob_prediction_ = None  # save memory
        else:
            raise AssertionError(f'Model class {type(self.model)} does not support out-of-fold prediction generation.')

        # TODO: Regression does not return NaN for missing rows, instead it sets them to 0. This makes life hard.
        #  The below code corrects the missing rows to NaN instead of 0.
        # Don't bother if >60 trees, near impossible to have missing
        # If using 68% of data for training, chance of missing for each row is 1 in 11 billion.
        if self.problem_type == REGRESSION and self.model.n_estimators <= 60:
            from sklearn.ensemble._forest import _get_n_samples_bootstrap, _generate_unsampled_indices
            n_samples = len(y)

            n_predictions = np.zeros(n_samples)
            n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.model.max_samples)
            for estimator in self.model.estimators_:
                unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)
                n_predictions[unsampled_indices] += 1
            missing_row_mask = n_predictions == 0
            y_oof_pred_proba[missing_row_mask] = np.nan

        # fill missing prediction rows with average of non-missing rows
        if np.isnan(np.sum(y_oof_pred_proba)):
            if len(y_oof_pred_proba.shape) == 1:
                col_mean = np.nanmean(y_oof_pred_proba)
                y_oof_pred_proba[np.isnan(y_oof_pred_proba)] = col_mean
            else:
                col_mean = np.nanmean(y_oof_pred_proba, axis=0)
                inds = np.where(np.isnan(y_oof_pred_proba))
                y_oof_pred_proba[inds] = np.take(col_mean, inds[1])

        return self._convert_proba_to_unified_form(y_oof_pred_proba)

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            valid_raw_types=[R_BOOL, R_INT, R_FLOAT, R_CATEGORY],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args_ensemble(cls, problem_type=None, **kwargs) -> dict:
        default_ag_args_ensemble = super()._get_default_ag_args_ensemble(problem_type=problem_type, **kwargs)
        if problem_type != QUANTILE:  # use_child_oof not supported in quantile regression
            extra_ag_args_ensemble = {'use_child_oof': True}
            default_ag_args_ensemble.update(extra_ag_args_ensemble)
        return default_ag_args_ensemble

    def _more_tags(self):
        # `can_refit_full=True` because final n_estimators is communicated at end of `_fit`:
        #  `self.params_trained['n_estimators'] = self.model.n_estimators`
        tags = {'can_refit_full': True}
        if self.problem_type == QUANTILE:
            tags['valid_oof'] = False  # not supported in quantile regression
        else:
            tags['valid_oof'] = True
        return tags
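
The OOB machinery that _get_oof_pred_proba reaches through private scikit-learn hooks is also available via the public API, at the cost of computing OOB during fit (the pred_time_val caveat discussed in the comments above). A hedged sketch on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, random_state=0)
model = RandomForestClassifier(n_estimators=100, bootstrap=True,
                               oob_score=True, random_state=0)
model.fit(X, y)

# Classification exposes oob_decision_function_ of shape (n_samples, n_classes);
# regression exposes oob_prediction_ instead.
oof_proba = model.oob_decision_function_
# With few trees, some rows may never be out-of-bag; mirror the snippet's
# handling by filling them with per-column means. (Depending on the
# scikit-learn version, such rows surface as NaN or as all-zero rows.)
if np.isnan(oof_proba).any():
    col_mean = np.nanmean(oof_proba, axis=0)
    inds = np.where(np.isnan(oof_proba))
    oof_proba[inds] = np.take(col_mean, inds[1])
print(oof_proba.shape)  # (500, 2)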