Example #1
    def _tune_run(self, config, resources_per_trial):
        """Wrapper to call ``tune.run``. Multiple estimators are generated when
        early stopping is possible, whereas a single estimator is
        generated when early stopping is not possible.

        Args:
            config (dict): Configurations such as hyperparameters to run
                ``tune.run`` on.
            resources_per_trial (dict): Resources to use per trial within Ray.
                Accepted keys are `cpu`, `gpu` and custom resources, and values
                are integers specifying the number of each resource to use.

        Returns:
            analysis (`ExperimentAnalysis`): Object returned by
                `tune.run`.

        """
        trainable = _Trainable
        if self.pipeline_auto_early_stop and check_is_pipeline(
                self.estimator) and self.early_stopping:
            trainable = _PipelineTrainable

        if self.early_stopping is not None:
            config["estimator_list"] = [
                clone(self.estimator) for _ in range(self.n_splits)
            ]
        else:
            config["estimator_list"] = [self.estimator]

        if isinstance(self.param_grid, list):
            analysis = tune.run(
                trainable,
                search_alg=ListSearcher(self.param_grid),
                num_samples=self._list_grid_num_samples(),
                scheduler=self.early_stopping,
                reuse_actors=True,
                verbose=self.verbose,
                stop={"training_iteration": self.max_iters},
                config=config,
                fail_fast=True,
                resources_per_trial=resources_per_trial,
                local_dir=os.path.expanduser(self.local_dir),
                loggers=self.loggers)
        else:
            analysis = tune.run(
                trainable,
                scheduler=self.early_stopping,
                reuse_actors=True,
                verbose=self.verbose,
                stop={"training_iteration": self.max_iters},
                config=config,
                fail_fast=True,
                resources_per_trial=resources_per_trial,
                local_dir=os.path.expanduser(self.local_dir),
                loggers=self.loggers)

        return analysis
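
The `estimator_list` construction above relies on scikit-learn's `clone` to hand each cross-validation split its own unfitted copy of the estimator. A minimal sketch of that pattern, using a hypothetical `SGDClassifier` and split count:

from sklearn.base import clone
from sklearn.linear_model import SGDClassifier

estimator = SGDClassifier()
n_splits = 5  # hypothetical number of CV splits

# One fresh, unfitted copy per split, as in config["estimator_list"] above.
estimator_list = [clone(estimator) for _ in range(n_splits)]
assert all(e is not estimator for e in estimator_list)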
Example #2
    def _tune_run(self, config, resources_per_trial):
        """Wrapper to call ``tune.run``. Multiple estimators are generated when
        early stopping is possible, whereas a single estimator is
        generated when early stopping is not possible.

        Args:
            config (dict): Configurations such as hyperparameters to run
                ``tune.run`` on.
            resources_per_trial (dict): Resources to use per trial within Ray.
                Accepted keys are `cpu`, `gpu` and custom resources, and values
                are integers specifying the number of each resource to use.

        Returns:
            analysis (`ExperimentAnalysis`): Object returned by
                `tune.run`.

        """
        trainable = _Trainable
        if self.pipeline_auto_early_stop and check_is_pipeline(
                self.estimator) and self.early_stopping:
            trainable = _PipelineTrainable

        if self.early_stopping is not None:
            config["estimator_ids"] = [
                ray.put(self.estimator) for _ in range(self.n_splits)
            ]
        else:
            config["estimator_ids"] = [ray.put(self.estimator)]

        stopper = MaximumIterationStopper(max_iter=self.max_iters)
        if self.stopper:
            stopper = CombinedStopper(stopper, self.stopper)

        run_args = dict(scheduler=self.early_stopping,
                        reuse_actors=True,
                        verbose=self.verbose,
                        stop=stopper,
                        config=config,
                        fail_fast="raise",
                        resources_per_trial=resources_per_trial,
                        local_dir=os.path.expanduser(self.local_dir),
                        loggers=self.loggers,
                        time_budget_s=self.time_budget_s)

        if isinstance(self.param_grid, list):
            run_args.update(
                dict(search_alg=ListSearcher(self.param_grid),
                     num_samples=self._list_grid_num_samples()))

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    message="fail_fast='raise' "
                                    "detected.")
            analysis = tune.run(trainable, **run_args)
        return analysis
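
Compared to Example #1, this version stores the estimator in Ray's object store with `ray.put` and passes only object refs through `config`, so the estimator is not re-serialized into every trial spec. A minimal sketch of the put/get round trip, with a hypothetical estimator and split count:

import ray
from sklearn.linear_model import SGDClassifier

ray.init(ignore_reinit_error=True)

estimator = SGDClassifier()
n_splits = 5  # hypothetical number of CV splits

# Store the estimator once per split; only small object refs go into config.
estimator_ids = [ray.put(estimator) for _ in range(n_splits)]

# Inside a trainable, each split would fetch its own copy from the object store:
estimators = [ray.get(ref) for ref in estimator_ids]

ray.shutdown()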
Example #3
    def _fit(self, X, y=None, groups=None, **fit_params):
        """Helper method to run fit procedure

        Args:
            X (:obj:`array-like` (shape = [n_samples, n_features])):
                Training vector, where n_samples is the number of samples and
                n_features is the number of features.
            y (:obj:`array-like`): Shape of array expected to be [n_samples]
                or [n_samples, n_output]. Target relative to X for
                classification or regression; None for unsupervised learning.
            groups (:obj:`array-like` (shape (n_samples,)), optional):
                Group labels for the samples used while splitting the dataset
                into train/test set. Only used in conjunction with a "Group"
                `cv` instance (e.g., `GroupKFold`).
            **fit_params (:obj:`dict` of str): Parameters passed to
                the ``fit`` method of the estimator.

        Returns:
            :obj:`TuneBaseSearchCV` child instance, after fitting.
        """

        self._check_params()
        classifier = is_classifier(self.estimator)
        cv = check_cv(cv=self.cv, y=y, classifier=classifier)
        self.n_splits = cv.get_n_splits(X, y, groups)
        if not hasattr(self, "is_multi"):
            self.scoring, self.is_multi = _check_multimetric_scoring(
                self.estimator, self.scoring)
        else:
            self.scoring, _ = _check_multimetric_scoring(
                self.estimator, self.scoring)

        if self.is_multi:
            if self.refit and (not isinstance(self.refit, str)
                               or self.refit not in self.scoring):
                raise ValueError("When using multimetric scoring, refit "
                                 "must be the name of the scorer used to "
                                 "pick the best parameters. If not needed, "
                                 "set refit to False")

        assert isinstance(
            self.n_jobs,
            int), ("Internal error: self.n_jobs must be an integer.")
        if self.n_jobs < 0:
            resources_per_trial = {"cpu": 1, "gpu": 1 if self.use_gpu else 0}
            if self.n_jobs < -1:
                warnings.warn(
                    "`self.n_jobs` is automatically set "
                    "-1 for any negative values.",
                    category=UserWarning)
        else:
            available_cpus = multiprocessing.cpu_count()
            if ray.is_initialized():
                available_cpus = ray.cluster_resources()["CPU"]
            cpu_fraction = available_cpus / self.n_jobs
            if cpu_fraction > 1:
                cpu_fraction = int(np.ceil(cpu_fraction))
            resources_per_trial = {
                "cpu": cpu_fraction,
                "gpu": 1 if self.use_gpu else 0
            }

        X_id = ray.put(X)
        y_id = ray.put(y)

        config = {}
        config["early_stopping"] = bool(self.early_stopping)
        config["early_stop_type"] = self.early_stop_type
        config["X_id"] = X_id
        config["y_id"] = y_id
        config["groups"] = groups
        config["cv"] = cv
        config["fit_params"] = fit_params
        config["scoring"] = self.scoring
        config["max_iters"] = self.max_iters
        config["return_train_score"] = self.return_train_score
        config["n_jobs"] = self.sk_n_jobs

        self._fill_config_hyperparam(config)
        analysis = self._tune_run(config, resources_per_trial)

        self.cv_results_ = self._format_results(self.n_splits, analysis)

        metric = self._metric_name
        base_metric = self._base_metric_name

        # For multi-metric evaluation, store the best_index, best_params and
        # best_score iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.is_multi:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index = self.refit(self.cv_results_)
                if not isinstance(self.best_index, numbers.Integral):
                    raise TypeError("best_index returned is not an integer")
                if (self.best_index < 0
                        or self.best_index >= len(self.cv_results_["params"])):
                    raise IndexError("best_index index out of range")
            else:
                self.best_index = self.cv_results_["rank_test_%s" %
                                                   base_metric].argmin()
                self.best_score = self.cv_results_[
                    "mean_test_%s" % base_metric][self.best_index]
            best_config = analysis.get_best_config(metric=metric,
                                                   mode="max",
                                                   scope="last")
            self.best_params = self._clean_config_dict(best_config)

        if self.refit:
            base_estimator = clone(self.estimator)
            if self.early_stop_type == EarlyStopping.WARM_START_ENSEMBLE:
                logger.info("tune-sklearn uses `n_estimators` to warm "
                            "start, so this parameter can't be "
                            "set when warm start early stopping. "
                            "`n_estimators` defaults to `max_iters`.")
                if check_is_pipeline(base_estimator):
                    cloned_final_estimator = base_estimator.steps[-1][1]
                    cloned_final_estimator.set_params(
                        **{"n_estimators": self.max_iters})
                else:
                    self.best_params["n_estimators"] = self.max_iters
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator = clone(
                base_estimator.set_params(**self.best_params))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator.fit(X, y, **fit_params)
            else:
                self.best_estimator.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time = refit_end_time - refit_start_time

        return self
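
The `resources_per_trial` branch above divides the available CPUs by `n_jobs` when `n_jobs` is positive, rounding up whenever each trial would get more than one CPU; any negative `n_jobs` simply means one CPU per trial. A minimal sketch of that calculation, rewritten as a standalone helper (the helper name and sample values are illustrative only):

import math
import multiprocessing

def resources_per_trial(n_jobs, use_gpu=False, available_cpus=None):
    # Negative n_jobs: one CPU per trial, run as many trials as fit.
    if n_jobs < 0:
        return {"cpu": 1, "gpu": 1 if use_gpu else 0}
    # Positive n_jobs: split the available CPUs across roughly n_jobs trials.
    available_cpus = available_cpus or multiprocessing.cpu_count()
    cpu_fraction = available_cpus / n_jobs
    if cpu_fraction > 1:
        cpu_fraction = int(math.ceil(cpu_fraction))
    return {"cpu": cpu_fraction, "gpu": 1 if use_gpu else 0}

print(resources_per_trial(n_jobs=4, available_cpus=8))   # {'cpu': 2, 'gpu': 0}
print(resources_per_trial(n_jobs=16, available_cpus=8))  # {'cpu': 0.5, 'gpu': 0}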
Example #4
    def __init__(self,
                 estimator,
                 early_stopping=None,
                 scoring=None,
                 n_jobs=None,
                 sk_n_jobs=-1,
                 cv=5,
                 refit=True,
                 verbose=0,
                 error_score="raise",
                 return_train_score=False,
                 local_dir="~/ray_results",
                 max_iters=1,
                 use_gpu=False,
                 loggers=None,
                 pipeline_auto_early_stop=True):
        if max_iters < 1:
            raise ValueError("max_iters must be greater than or equal to 1.")
        self.estimator = estimator
        self.base_estimator = estimator
        self.pipeline_auto_early_stop = pipeline_auto_early_stop

        if self.pipeline_auto_early_stop and check_is_pipeline(estimator):
            _, self.base_estimator = self.base_estimator.steps[-1]

        self.early_stop_type = get_early_stop_type(self.base_estimator,
                                                   bool(early_stopping))

        if not self._can_early_stop():
            if early_stopping:
                raise ValueError("Early stopping is not supported because "
                                 "the estimator does not have `partial_fit`, "
                                 "does not support warm_start, or is a "
                                 "tree classifier. Set "
                                 "`early_stopping=False`.")
        if not early_stopping and max_iters > 1:
            warnings.warn(
                "max_iters is set > 1 but incremental/partial training "
                "is not enabled. To enable partial training, "
                "ensure the estimator has `partial_fit` or "
                "`warm_start` and set `early_stopping=True`. "
                "Automatically setting max_iters=1.",
                category=UserWarning)
            max_iters = 1

        # Get metric scoring name
        self.scoring = scoring
        self.refit = refit
        if not hasattr(self, "is_multi"):
            self.scoring, self.is_multi = _check_multimetric_scoring(
                self.estimator, self.scoring)

        if self.is_multi:
            self._base_metric_name = self.refit
        else:
            self._base_metric_name = "score"

        self._metric_name = "average_test_%s" % self._base_metric_name

        if early_stopping:
            if not self._can_early_stop() and is_lightgbm_model(
                    self.base_estimator):
                warnings.warn("lightgbm>=3.0.0 required for early_stopping "
                              "functionality.")
            assert self._can_early_stop()
            if max_iters == 1:
                warnings.warn(
                    "early_stopping is enabled but max_iters = 1. "
                    "To enable partial training, set max_iters > 1.",
                    category=UserWarning)
            if self.early_stop_type == EarlyStopping.XGB:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for xgboost models following this: "
                    "https://github.com/dmlc/xgboost/issues/1686. "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            elif self.early_stop_type == EarlyStopping.LGBM:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for lightgbm models following this: "
                    "https://lightgbm.readthedocs.io/en/latest/pythonapi/"
                    "lightgbm.LGBMModel.html#lightgbm.LGBMModel.fit "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            elif self.early_stop_type == EarlyStopping.CATBOOST:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for Catboost models following this: "
                    "https://catboost.ai/docs/concepts/python-usages-"
                    "examples.html#training-continuation "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            if early_stopping is True:
                # Override the early_stopping variable so
                # that it is resolved appropriately in
                # the next block
                early_stopping = "AsyncHyperBandScheduler"
            # Resolve the early stopping object
            early_stopping = resolve_early_stopping(early_stopping, max_iters,
                                                    self._metric_name)

        self.early_stopping = early_stopping
        self.max_iters = max_iters

        self.cv = cv
        self.n_jobs = int(n_jobs or -1)
        if os.environ.get("SKLEARN_N_JOBS") is not None:
            self.sk_n_jobs = int(os.environ.get("SKLEARN_N_JOBS"))
        else:
            self.sk_n_jobs = sk_n_jobs

        self.verbose = verbose
        self.error_score = error_score
        self.return_train_score = return_train_score
        self.local_dir = local_dir
        self.use_gpu = use_gpu
        self.loggers = resolve_loggers(loggers)
        assert isinstance(self.n_jobs, int)
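
When the estimator is a `Pipeline` and `pipeline_auto_early_stop` is enabled, the constructor unpacks the final `(name, estimator)` step and uses that estimator to decide whether early stopping is possible. A minimal sketch of that unpacking with a hypothetical two-step pipeline:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

pipe = Pipeline([("scale", StandardScaler()), ("clf", SGDClassifier())])

# Mirrors `_, self.base_estimator = self.base_estimator.steps[-1]` above.
step_name, base_estimator = pipe.steps[-1]
print(step_name)                               # clf
print(hasattr(base_estimator, "partial_fit"))  # True, so early stopping is possible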
Example #5
    def _tune_run(self, config, resources_per_trial):
        """Wrapper to call ``tune.run``. Multiple estimators are generated when
        early stopping is possible, whereas a single estimator is
        generated when early stopping is not possible.

        Args:
            config (dict): Configurations such as hyperparameters to run
                ``tune.run`` on.
            resources_per_trial (dict): Resources to use per trial within Ray.
                Accepted keys are `cpu`, `gpu` and custom resources, and values
                are integers specifying the number of each resource to use.

        Returns:
            analysis (`ExperimentAnalysis`): Object returned by
                `tune.run`.

        """
        if self.seed is not None:
            random.seed(self.seed)
            np.random.seed(self.seed)

        trainable = _Trainable
        if self.pipeline_auto_early_stop and check_is_pipeline(
                self.estimator) and self.early_stopping:
            trainable = _PipelineTrainable

        max_iter = self.max_iters
        if self.early_stopping is not None:
            config["estimator_list"] = [
                clone(self.estimator) for _ in range(self.n_splits)
            ]
            if hasattr(self.early_stopping, "_max_t_attr"):
                # we want to delegate stopping to schedulers which
                # support it, but we want it to stop eventually, just in case
                # the solution is to make the stop condition very big
                max_iter = self.max_iters * 10
        else:
            config["estimator_list"] = [self.estimator]

        stopper = MaximumIterationStopper(max_iter=max_iter)
        if self.stopper:
            stopper = CombinedStopper(stopper, self.stopper)

        run_args = dict(scheduler=self.early_stopping,
                        reuse_actors=True,
                        verbose=self.verbose,
                        stop=stopper,
                        num_samples=self.n_trials,
                        config=config,
                        fail_fast="raise",
                        resources_per_trial=resources_per_trial,
                        local_dir=os.path.expanduser(self.local_dir),
                        loggers=self.loggers,
                        time_budget_s=self.time_budget_s)

        if self.search_optimization == "random":
            if isinstance(self.param_distributions, list):
                search_algo = RandomListSearcher(self.param_distributions)
            else:
                search_algo = BasicVariantGenerator()
            run_args["search_alg"] = search_algo
        else:
            search_space = None
            override_search_space = True
            if self._is_param_distributions_all_tune_domains():
                run_args["config"].update(self.param_distributions)
                override_search_space = False

            search_kwargs = self.search_kwargs.copy()
            search_kwargs.update(metric=self._metric_name, mode="max")

            if self.search_optimization == "bayesian":
                from ray.tune.suggest.skopt import SkOptSearch
                if override_search_space:
                    search_space = self.param_distributions
                search_algo = SkOptSearch(space=search_space, **search_kwargs)
                run_args["search_alg"] = search_algo

            elif self.search_optimization == "bohb":
                from ray.tune.suggest.bohb import TuneBOHB
                if override_search_space:
                    search_space = self._get_bohb_config_space()
                if self.seed:
                    warnings.warn("'seed' is not implemented for BOHB.")
                search_algo = TuneBOHB(space=search_space, **search_kwargs)
                # search_algo = TuneBOHB(
                #     space=search_space, seed=self.seed, **search_kwargs)
                run_args["search_alg"] = search_algo

            elif self.search_optimization == "optuna":
                from ray.tune.suggest.optuna import OptunaSearch
                from optuna.samplers import TPESampler
                sampler = TPESampler(seed=self.seed)
                if override_search_space:
                    search_space = self._get_optuna_params()
                search_algo = OptunaSearch(space=search_space,
                                           sampler=sampler,
                                           **search_kwargs)
                run_args["search_alg"] = search_algo

            elif self.search_optimization == "hyperopt":
                from ray.tune.suggest.hyperopt import HyperOptSearch
                if override_search_space:
                    search_space = self._get_hyperopt_params()
                search_algo = HyperOptSearch(space=search_space,
                                             random_state_seed=self.seed,
                                             **search_kwargs)
                run_args["search_alg"] = search_algo

            else:
                # This should not happen as we validate the input before
                # this method. Still, just to be sure, raise an error here.
                raise ValueError(
                    f"Invalid search optimizer: {self.search_optimization}")

        if isinstance(self.n_jobs, int) and self.n_jobs > 0 \
           and not self.search_optimization == "random":
            search_algo = ConcurrencyLimiter(search_algo,
                                             max_concurrent=self.n_jobs)
            run_args["search_alg"] = search_algo

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    message="fail_fast='raise' "
                                    "detected.")
            analysis = tune.run(trainable, **run_args)
        return analysis
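
The stopping logic above combines a `MaximumIterationStopper` with any user-supplied stopper. A minimal sketch of that composition, assuming the Ray Tune version these snippets target, with a hypothetical 10-iteration cap and a 600-second timeout standing in for `self.stopper`:

from ray.tune.stopper import CombinedStopper, MaximumIterationStopper, TimeoutStopper

stopper = MaximumIterationStopper(max_iter=10)  # hypothetical max_iters
user_stopper = TimeoutStopper(timeout=600)      # hypothetical user-supplied stopper

# Stop a trial as soon as either condition fires, as in _tune_run above.
stopper = CombinedStopper(stopper, user_stopper)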
Example #6
    def __init__(self,
                 estimator,
                 early_stopping=None,
                 scoring=None,
                 n_jobs=None,
                 sk_n_jobs=-1,
                 cv=5,
                 refit=True,
                 verbose=0,
                 error_score="raise",
                 return_train_score=False,
                 local_dir="~/ray_results",
                 max_iters=1,
                 use_gpu=False,
                 loggers=None,
                 pipeline_auto_early_stop=True):
        if max_iters < 1:
            raise ValueError("max_iters must be greater than or equal to 1.")
        self.estimator = estimator
        self.base_estimator = estimator
        self.pipeline_auto_early_stop = pipeline_auto_early_stop

        if self.pipeline_auto_early_stop and check_is_pipeline(estimator):
            _, self.base_estimator = self.base_estimator.steps[-1]

        self.early_stop_type = get_early_stop_type(self.base_estimator,
                                                   bool(early_stopping))

        if not self._can_early_stop():
            if early_stopping:
                raise ValueError("Early stopping is not supported because "
                                 "the estimator does not have `partial_fit`, "
                                 "does not support warm_start, or is a "
                                 "tree classifier. Set "
                                 "`early_stopping=False`.")
        if not early_stopping and max_iters > 1:
            warnings.warn(
                "max_iters is set > 1 but incremental/partial training "
                "is not enabled. To enable partial training, "
                "ensure the estimator has `partial_fit` or "
                "`warm_start` and set `early_stopping=True`. "
                "Automatically setting max_iters=1.",
                category=UserWarning)
            max_iters = 1

        if early_stopping:
            assert self._can_early_stop()
            if max_iters == 1:
                warnings.warn(
                    "early_stopping is enabled but max_iters = 1. "
                    "To enable partial training, set max_iters > 1.",
                    category=UserWarning)
            if self.early_stop_type == EarlyStopping.XGB:
                warnings.warn(
                    "tune-sklearn implements incremental learning "
                    "for xgboost models following this: "
                    "https://github.com/dmlc/xgboost/issues/1686. "
                    "This may negatively impact performance. To "
                    "disable, set `early_stopping=False`.",
                    category=UserWarning)
            if early_stopping is True:
                # Override the early_stopping variable so
                # that it is resolved appropriately in
                # the next block
                early_stopping = "AsyncHyperBandScheduler"
            # Resolve the early stopping object
            early_stopping = resolve_early_stopping(early_stopping, max_iters)

        self.early_stopping = early_stopping
        self.max_iters = max_iters

        self.cv = cv
        self.scoring = scoring
        self.n_jobs = int(n_jobs or -1)
        if os.environ.get("SKLEARN_N_JOBS") is not None:
            self.sk_n_jobs = int(os.environ.get("SKLEARN_N_JOBS"))
        else:
            self.sk_n_jobs = sk_n_jobs
        self.refit = refit
        self.verbose = verbose
        self.error_score = error_score
        self.return_train_score = return_train_score
        self.local_dir = local_dir
        self.use_gpu = use_gpu
        self.loggers = resolve_loggers(loggers)
        assert isinstance(self.n_jobs, int)
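
The `SKLEARN_N_JOBS` environment variable, when set, overrides the `sk_n_jobs` argument that is later forwarded to the underlying scikit-learn estimator. A minimal sketch of that resolution as a standalone function (the helper name is illustrative only):

import os

def resolve_sk_n_jobs(sk_n_jobs=-1):
    # The environment variable wins over the constructor argument, as above.
    env_value = os.environ.get("SKLEARN_N_JOBS")
    return int(env_value) if env_value is not None else sk_n_jobs

os.environ["SKLEARN_N_JOBS"] = "2"
print(resolve_sk_n_jobs(sk_n_jobs=-1))  # 2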
Example #7
    def _tune_run(self, config, resources_per_trial):
        """Wrapper to call ``tune.run``. Multiple estimators are generated when
        early stopping is possible, whereas a single estimator is
        generated when early stopping is not possible.

        Args:
            config (dict): Configurations such as hyperparameters to run
                ``tune.run`` on.
            resources_per_trial (dict): Resources to use per trial within Ray.
                Accepted keys are `cpu`, `gpu` and custom resources, and values
                are integers specifying the number of each resource to use.

        Returns:
            analysis (`ExperimentAnalysis`): Object returned by
                `tune.run`.

        """
        trainable = _Trainable
        if self.pipeline_auto_early_stop and check_is_pipeline(
                self.estimator) and self.early_stopping:
            trainable = _PipelineTrainable

        stop_condition = {"training_iteration": self.max_iters}
        if self.early_stopping is not None:
            config["estimator_list"] = [
                clone(self.estimator) for _ in range(self.n_splits)
            ]
            if hasattr(self.early_stopping, "_max_t_attr"):
                # we want to delegate stopping to schedulers which
                # support it, but we want it to stop eventually, just in case
                # the solution is to make the stop condition very big
                stop_condition = {"training_iteration": self.max_iters * 10}
        else:
            config["estimator_list"] = [self.estimator]

        if self.search_optimization == "random":
            run_args = dict(scheduler=self.early_stopping,
                            reuse_actors=True,
                            verbose=self.verbose,
                            stop=stop_condition,
                            num_samples=self.num_samples,
                            config=config,
                            fail_fast=True,
                            resources_per_trial=resources_per_trial,
                            local_dir=os.path.expanduser(self.local_dir),
                            loggers=self.loggers)

            if isinstance(self.param_distributions, list):
                run_args["search_alg"] = RandomListSearcher(
                    self.param_distributions)

            analysis = tune.run(trainable, **run_args)
            return analysis

        elif self.search_optimization == "bayesian":
            from skopt import Optimizer
            from ray.tune.suggest.skopt import SkOptSearch
            hyperparameter_names, spaces = self._get_skopt_params()
            search_algo = SkOptSearch(Optimizer(spaces),
                                      hyperparameter_names,
                                      metric=self._metric_name,
                                      mode="max",
                                      **self.search_kwargs)

        elif self.search_optimization == "bohb":
            from ray.tune.suggest.bohb import TuneBOHB
            config_space = self._get_bohb_config_space()
            search_algo = TuneBOHB(config_space,
                                   metric=self._metric_name,
                                   mode="max",
                                   **self.search_kwargs)

        elif self.search_optimization == "optuna":
            from ray.tune.suggest.optuna import OptunaSearch
            config_space = self._get_optuna_params()
            search_algo = OptunaSearch(config_space,
                                       metric=self._metric_name,
                                       mode="max",
                                       **self.search_kwargs)

        elif self.search_optimization == "hyperopt":
            from ray.tune.suggest.hyperopt import HyperOptSearch
            config_space = self._get_hyperopt_params()
            search_algo = HyperOptSearch(config_space,
                                         metric=self._metric_name,
                                         mode="max",
                                         **self.search_kwargs)

        if isinstance(self.n_jobs, int) and self.n_jobs > 0:
            search_algo = ConcurrencyLimiter(search_algo,
                                             max_concurrent=self.n_jobs)

        analysis = tune.run(trainable,
                            search_alg=search_algo,
                            scheduler=self.early_stopping,
                            reuse_actors=True,
                            verbose=self.verbose,
                            stop=stop_condition,
                            num_samples=self.num_samples,
                            config=config,
                            fail_fast=True,
                            resources_per_trial=resources_per_trial,
                            local_dir=os.path.expanduser(self.local_dir),
                            loggers=self.loggers)

        return analysis
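
When `n_jobs` is positive, the chosen search algorithm is wrapped in a `ConcurrencyLimiter` so that at most `n_jobs` trials request new configurations at once. A minimal sketch, assuming the same pre-2.0 `ray.tune.suggest` API these snippets import from and that `hyperopt` is installed:

from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.hyperopt import HyperOptSearch

# Hypothetical searcher; the metric name matches the "average_test_%s" pattern above.
search_algo = HyperOptSearch(metric="average_test_score", mode="max")

n_jobs = 4  # hypothetical positive n_jobs
if isinstance(n_jobs, int) and n_jobs > 0:
    search_algo = ConcurrencyLimiter(search_algo, max_concurrent=n_jobs)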