Example #1
    def base_params(self, estimator_list):
        # Build the shared Tune trial config for a list of estimators:
        # one CV split per estimator, no early stopping, a single iteration.
        config = {"estimator_list": estimator_list}
        cv = check_cv(cv=len(estimator_list),
                      y=self.y,
                      classifier=is_classifier(estimator_list[0]))
        config["X_id"] = self.X_id
        config["y_id"] = self.y_id
        config["early_stopping"] = False
        config["max_iters"] = 1
        config["groups"] = None
        config["cv"] = cv
        config["fit_params"] = None
        # _check_multimetric_scoring returns (scorers, is_multimetric);
        # only the scorer dict is needed here.
        config["scoring"], _ = _check_multimetric_scoring(estimator_list[0],
                                                          scoring=None)
        config["return_train_score"] = False
        config["n_jobs"] = 1
        return config
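
For context, the `check_cv` call above relies on standard scikit-learn behavior: given an integer `cv` and a truthy `classifier` flag it builds a stratified splitter, otherwise a plain k-fold. A minimal standalone sketch (the toy labels are illustrative, not from the source):

    import numpy as np
    from sklearn.model_selection import check_cv

    y = np.array([0, 1, 0, 1, 0, 1])
    # Truthy classifier flag + integer cv -> stratified splits.
    print(type(check_cv(cv=3, y=y, classifier=True)).__name__)   # StratifiedKFold
    print(type(check_cv(cv=3, y=y, classifier=False)).__name__)  # KFold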
Example #2
    def _fit(self, X, y=None, groups=None, **fit_params):
        """Helper method to run fit procedure

        Args:
            X (:obj:`array-like` (shape = [n_samples, n_features])):
                Training vector, where n_samples is the number of samples and
                n_features is the number of features.
            y (:obj:`array-like` (shape = [n_samples] or
                [n_samples, n_output]), optional): Target relative to X for
                classification or regression; None for unsupervised learning.
            groups (:obj:`array-like` (shape (n_samples,)), optional):
                Group labels for the samples used while splitting the dataset
                into train/test set. Only used in conjunction with a "Group"
                `cv` instance (e.g., `GroupKFold`).
            **fit_params (:obj:`dict` of str): Parameters passed to
                the ``fit`` method of the estimator.

        Returns:
            :obj:`TuneBaseSearchCV` child instance, after fitting.
        """

        self._check_params()
        classifier = is_classifier(self.estimator)
        cv = check_cv(cv=self.cv, y=y, classifier=classifier)
        self.n_splits = cv.get_n_splits(X, y, groups)
        if not hasattr(self, "is_multi"):
            self.scoring, self.is_multi = _check_multimetric_scoring(
                self.estimator, self.scoring)
        else:
            self.scoring, _ = _check_multimetric_scoring(
                self.estimator, self.scoring)

        if self.is_multi:
            if self.refit and (not isinstance(self.refit, str)
                               or self.refit not in self.scoring):
                raise ValueError("When using multimetric scoring, refit "
                                 "must be the name of the scorer used to "
                                 "pick the best parameters. If not needed, "
                                 "set refit to False")

        assert isinstance(
            self.n_jobs,
            int), ("Internal error: self.n_jobs must be an integer.")
        if self.n_jobs < 0:
            resources_per_trial = {"cpu": 1, "gpu": 1 if self.use_gpu else 0}
            if self.n_jobs < -1:
                warnings.warn(
                    "`n_jobs` is automatically set to -1 "
                    "for any negative value.",
                    category=UserWarning)
        else:
            available_cpus = multiprocessing.cpu_count()
            if ray.is_initialized():
                available_cpus = ray.cluster_resources()["CPU"]
            # Divide the CPU budget so that `n_jobs` trials can run
            # concurrently; allocations above one CPU round up to whole CPUs.
            cpu_fraction = available_cpus / self.n_jobs
            if cpu_fraction > 1:
                cpu_fraction = int(np.ceil(cpu_fraction))
            resources_per_trial = {
                "cpu": cpu_fraction,
                "gpu": 1 if self.use_gpu else 0
            }

        # Put the data in the Ray object store once so every trial reads a
        # shared copy instead of re-serializing X and y per trial.
        X_id = ray.put(X)
        y_id = ray.put(y)

        config = {}
        config["early_stopping"] = bool(self.early_stopping)
        config["early_stop_type"] = self.early_stop_type
        config["X_id"] = X_id
        config["y_id"] = y_id
        config["groups"] = groups
        config["cv"] = cv
        config["fit_params"] = fit_params
        config["scoring"] = self.scoring
        config["max_iters"] = self.max_iters
        config["return_train_score"] = self.return_train_score
        config["n_jobs"] = self.sk_n_jobs

        self._fill_config_hyperparam(config)
        analysis = self._tune_run(config, resources_per_trial)

        self.cv_results_ = self._format_results(self.n_splits, analysis)

        metric = self._metric_name
        base_metric = self._base_metric_name

        # For multi-metric evaluation, store the best_index, best_params and
        # best_score iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.is_multi:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index = self.refit(self.cv_results_)
                if not isinstance(self.best_index, numbers.Integral):
                    raise TypeError("best_index returned is not an integer")
                if (self.best_index < 0
                        or self.best_index >= len(self.cv_results_["params"])):
                    raise IndexError("best_index index out of range")
            else:
                self.best_index = self.cv_results_["rank_test_%s" %
                                                   base_metric].argmin()
                self.best_score = self.cv_results_[
                    "mean_test_%s" % base_metric][self.best_index]
            best_config = analysis.get_best_config(metric=metric,
                                                   mode="max",
                                                   scope="last")
            self.best_params = self._clean_config_dict(best_config)

        if self.refit:
            base_estimator = clone(self.estimator)
            if self.early_stop_type == EarlyStopping.WARM_START_ENSEMBLE:
                logger.info("tune-sklearn uses `n_estimators` to warm "
                            "start, so this parameter can't be "
                            "set when warm start early stopping. "
                            "`n_estimators` defaults to `max_iters`.")
                if check_is_pipeline(base_estimator):
                    cloned_final_estimator = base_estimator.steps[-1][1]
                    cloned_final_estimator.set_params(
                        **{"n_estimators": self.max_iters})
                else:
                    self.best_params["n_estimators"] = self.max_iters
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator = clone(
                base_estimator.set_params(**self.best_params))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator.fit(X, y, **fit_params)
            else:
                self.best_estimator.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time = refit_end_time - refit_start_time

        return self
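
The CPU budgeting in the branch above is easy to sanity-check in isolation. A minimal sketch, assuming 8 available CPUs (the helper name `resources_per_trial` and the CPU count are assumptions for illustration):

    import numpy as np

    def resources_per_trial(n_jobs, available_cpus=8, use_gpu=False):
        # Negative n_jobs: one CPU per trial, Ray decides concurrency.
        if n_jobs < 0:
            return {"cpu": 1, "gpu": 1 if use_gpu else 0}
        # Otherwise split the budget so `n_jobs` trials run concurrently.
        cpu_fraction = available_cpus / n_jobs
        if cpu_fraction > 1:
            cpu_fraction = int(np.ceil(cpu_fraction))
        return {"cpu": cpu_fraction, "gpu": 1 if use_gpu else 0}

    print(resources_per_trial(4))   # {'cpu': 2, 'gpu': 0} -> 4 trials at once
    print(resources_per_trial(16))  # {'cpu': 0.5, 'gpu': 0} -> 16 trials at once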
Example #3
    def __init__(self,
                 estimator,
                 early_stopping=None,
                 scoring=None,
                 n_jobs=None,
                 sk_n_jobs=-1,
                 cv=5,
                 refit=True,
                 verbose=0,
                 error_score="raise",
                 return_train_score=False,
                 local_dir="~/ray_results",
                 max_iters=1,
                 use_gpu=False,
                 loggers=None,
                 pipeline_auto_early_stop=True):
        if max_iters < 1:
            raise ValueError("max_iters must be greater than or equal to 1.")
        self.estimator = estimator
        self.base_estimator = estimator
        self.pipeline_auto_early_stop = pipeline_auto_early_stop

        if self.pipeline_auto_early_stop and check_is_pipeline(estimator):
            _, self.base_estimator = self.base_estimator.steps[-1]

        self.early_stop_type = get_early_stop_type(self.base_estimator,
                                                   bool(early_stopping))

        if not self._can_early_stop():
            if early_stopping:
                raise ValueError("Early stopping is not supported because "
                                 "the estimator does not have `partial_fit`, "
                                 "does not support warm_start, or is a "
                                 "tree classifier. Set "
                                 "`early_stopping=False`.")
        if not early_stopping and max_iters > 1:
            warnings.warn(
                "max_iters is set > 1 but incremental/partial training "
                "is not enabled. To enable partial training, "
                "ensure the estimator has `partial_fit` or "
                "`warm_start` and set `early_stopping=True`. "
                "Automatically setting max_iters=1.",
                category=UserWarning)
            max_iters = 1

        # Get metric scoring name
        self.scoring = scoring
        self.refit = refit
        if not hasattr(self, "is_multi"):
            self.scoring, self.is_multi = _check_multimetric_scoring(
                self.estimator, self.scoring)

        if self.is_multi:
            self._base_metric_name = self.refit
        else:
            self._base_metric_name = "score"

        self._metric_name = "average_test_%s" % self._base_metric_name

        if early_stopping:
            if not self._can_early_stop() and is_lightgbm_model(
                    self.base_estimator):
                warnings.warn("lightgbm>=3.0.0 required for early_stopping "
                              "functionality.")
            assert self._can_early_stop()
            if max_iters == 1:
                warnings.warn(
                    "early_stopping is enabled but max_iters = 1. "
                    "To enable partial training, set max_iters > 1.",
                    category=UserWarning)
            if self.early_stop_type == EarlyStopping.XGB:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for xgboost models following this: "
                    "https://github.com/dmlc/xgboost/issues/1686. "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            elif self.early_stop_type == EarlyStopping.LGBM:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for lightgbm models following this: "
                    "https://lightgbm.readthedocs.io/en/latest/pythonapi/"
                    "lightgbm.LGBMModel.html#lightgbm.LGBMModel.fit "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            elif self.early_stop_type == EarlyStopping.CATBOOST:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for Catboost models following this: "
                    "https://catboost.ai/docs/concepts/python-usages-"
                    "examples.html#training-continuation "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            if early_stopping is True:
                # Override the early_stopping variable so
                # that it is resolved appropriately in
                # the next block
                early_stopping = "AsyncHyperBandScheduler"
            # Resolve the early stopping object
            early_stopping = resolve_early_stopping(early_stopping, max_iters,
                                                    self._metric_name)

        self.early_stopping = early_stopping
        self.max_iters = max_iters

        self.cv = cv
        self.n_jobs = int(n_jobs or -1)
        if os.environ.get("SKLEARN_N_JOBS") is not None:
            self.sk_n_jobs = int(os.environ.get("SKLEARN_N_JOBS"))
        else:
            self.sk_n_jobs = sk_n_jobs

        self.verbose = verbose
        self.error_score = error_score
        self.return_train_score = return_train_score
        self.local_dir = local_dir
        self.use_gpu = use_gpu
        self.loggers = resolve_loggers(loggers)
        assert isinstance(self.n_jobs, int)
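
This constructor is not used directly; public tune-sklearn classes such as `TuneGridSearchCV` pass through it. A hedged usage sketch (the toy data and parameter grid are mine, not from the source):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier
    from tune_sklearn import TuneGridSearchCV

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    # SGDClassifier has partial_fit, so early stopping is permitted;
    # early_stopping=True resolves to "AsyncHyperBandScheduler" above.
    search = TuneGridSearchCV(
        SGDClassifier(),
        param_grid={"alpha": [1e-4, 1e-3, 1e-2]},
        early_stopping=True,
        max_iters=10,
        n_jobs=2)
    search.fit(X, y)
    print(search.best_params_)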
Example #4
    def _fit(self, X, y=None, groups=None, **fit_params):
        """Helper method to run fit procedure

        Args:
            X (:obj:`array-like` (shape = [n_samples, n_features])):
                Training vector, where n_samples is the number of samples and
                n_features is the number of features.
            y (:obj:`array-like` (shape = [n_samples] or
                [n_samples, n_output]), optional): Target relative to X for
                classification or regression; None for unsupervised learning.
            groups (:obj:`array-like` (shape (n_samples,)), optional):
                Group labels for the samples used while splitting the dataset
                into train/test set. Only used in conjunction with a "Group"
                `cv` instance (e.g., `GroupKFold`).
            **fit_params (:obj:`dict` of str): Parameters passed to
                the ``fit`` method of the estimator.

        Returns:
            :obj:`TuneBaseSearchCV` child instance, after fitting.
        """

        self._check_params()
        classifier = is_classifier(self.estimator)
        cv = check_cv(cv=self.cv, y=y, classifier=classifier)
        self.n_splits = cv.get_n_splits(X, y, groups)
        if not hasattr(self, "is_multi"):
            self.scoring, self.is_multi = _check_multimetric_scoring(
                self.estimator, self.scoring)
        else:
            self.scoring, _ = _check_multimetric_scoring(
                self.estimator, self.scoring)

        if self.is_multi:
            if self.refit and (not isinstance(self.refit, str)
                               or self.refit not in self.scoring):
                raise ValueError("When using multimetric scoring, refit "
                                 "must be the name of the scorer used to "
                                 "pick the best parameters. If not needed, "
                                 "set refit to False")

        assert isinstance(
            self.n_jobs,
            int), ("Internal error: self.n_jobs must be an integer.")
        if self.n_jobs < 0:
            resources_per_trial = {"cpu": 1, "gpu": 1 if self.use_gpu else 0}
            if self.n_jobs < -1:
                warnings.warn(
                    "`n_jobs` is automatically set to -1 "
                    "for any negative value.",
                    category=UserWarning)
        else:
            available_cpus = multiprocessing.cpu_count()
            if ray.is_initialized():
                available_cpus = ray.cluster_resources()["CPU"]
            cpu_fraction = available_cpus / self.n_jobs
            if cpu_fraction > 1:
                cpu_fraction = int(np.ceil(cpu_fraction))
            resources_per_trial = {
                "cpu": cpu_fraction,
                "gpu": 1 if self.use_gpu else 0
            }

        X_id = ray.put(X)
        y_id = ray.put(y)

        config = {}
        config["early_stopping"] = bool(self.early_stopping)
        config["X_id"] = X_id
        config["y_id"] = y_id
        config["groups"] = groups
        config["cv"] = cv
        config["fit_params"] = fit_params
        config["scoring"] = self.scoring
        config["max_iters"] = self.max_iters
        config["return_train_score"] = self.return_train_score
        config["n_jobs"] = self.sk_n_jobs

        self._fill_config_hyperparam(config)
        analysis = self._tune_run(config, resources_per_trial)

        self.cv_results_ = self._format_results(self.n_splits, analysis)

        if self.is_multi:
            scoring_name = self.refit
        else:
            scoring_name = "score"

        if self.refit:
            best_config = analysis.get_best_config(metric="average_test_%s" %
                                                   scoring_name,
                                                   mode="max",
                                                   scope="last")
            self.best_params = self._clean_config_dict(best_config)
            self.best_estimator_ = clone(self.estimator)
            self.best_estimator_.set_params(**self.best_params)
            self.best_estimator_.fit(X, y, **fit_params)

            best_finished_trial_id = analysis.get_best_trial(
                metric="average_test_%s" % scoring_name,
                mode="max",
                scope="last").trial_id
            df = analysis.dataframe()
            self.best_score = float(df.loc[
                df["trial_id"] == best_finished_trial_id,
                "average_test_%s" % scoring_name].iloc[0])

        return self
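
The best-score lookup above filters the trial dataframe by trial id. A standalone sketch with a toy dataframe (the values are invented; the column names follow the code above, with "score" as the single-metric base name):

    import pandas as pd

    # Stand-in for analysis.dataframe().
    df = pd.DataFrame({
        "trial_id": ["t1", "t2", "t3"],
        "average_test_score": [0.91, 0.95, 0.88],
    })
    best_finished_trial_id = "t2"
    # .iloc[0] extracts the scalar; calling float() on a one-row Series
    # is deprecated in recent pandas.
    best_score = float(df.loc[df["trial_id"] == best_finished_trial_id,
                              "average_test_score"].iloc[0])
    print(best_score)  # 0.95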