def testCallbackReordering(self):
        """SyncerCallback should come after ExperimentLogger callbacks"""
        def get_positions(callbacks):
            first_logger_pos = None
            last_logger_pos = None
            syncer_pos = None
            for i, callback in enumerate(callbacks):
                if isinstance(callback, ExperimentLogger):
                    if first_logger_pos is None:
                        first_logger_pos = i
                    last_logger_pos = i
                elif isinstance(callback, SyncerCallback):
                    syncer_pos = i
            return first_logger_pos, last_logger_pos, syncer_pos

        # Auto creation of loggers, no callbacks, no syncer
        callbacks = create_default_callbacks(None, SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with callbacks
        callbacks = create_default_callbacks([Callback()], SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with existing logger (but no CSV/JSON)
        callbacks = create_default_callbacks([ExperimentLogger()],
                                             SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # This should throw an error as the syncer comes before the logger
        with self.assertRaises(ValueError):
            callbacks = create_default_callbacks(
                [SyncerCallback(None),
                 ExperimentLogger()], SyncConfig(), None)

        # This should be reordered but preserve the regular callback order
        [mc1, mc2, mc3] = [Callback(), Callback(), Callback()]
        # Has to be legacy logger to avoid logger callback creation
        lc = LegacyExperimentLogger(logger_classes=DEFAULT_LOGGERS)
        callbacks = create_default_callbacks([mc1, mc2, lc, mc3], SyncConfig(),
                                             None)
        print(callbacks)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)
        self.assertLess(callbacks.index(mc1), callbacks.index(mc2))
        self.assertLess(callbacks.index(mc2), callbacks.index(mc3))
        self.assertLess(callbacks.index(lc), callbacks.index(mc3))
        # Syncer callback is appended
        self.assertLess(callbacks.index(mc3), syncer_pos)
Example #2
    def from_json(cls, name, spec):
        """Generates an Experiment object from JSON.

        Args:
            name (str): Name of Experiment.
            spec (dict): JSON configuration of experiment.
        """
        if "run" not in spec:
            raise TuneError("No trainable specified!")

        # Special case the `env` param for RLlib by automatically
        # moving it into the `config` section.
        if "env" in spec:
            spec["config"] = spec.get("config", {})
            spec["config"]["env"] = spec["env"]
            del spec["env"]

        if "sync_config" in spec and isinstance(spec["sync_config"], dict):
            spec["sync_config"] = SyncConfig(**spec["sync_config"])

        spec = copy.deepcopy(spec)

        run_value = spec.pop("run")
        try:
            exp = cls(name, run_value, **spec)
        except TypeError:
            raise TuneError("Improper argument from JSON: {}.".format(spec))
        return exp
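A hedged usage sketch for the classmethod above; the spec keys mirror what from_json handles (the required "run" key, the "env" special case, and the "sync_config" dict conversion), while the experiment name and the trainable name "my_trainable" are placeholder assumptions.

# "my_trainable" is assumed to have been registered via tune.register_trainable.
spec = {
    "run": "my_trainable",
    "stop": {"training_iteration": 10},
    "env": "CartPole-v1",  # moved into spec["config"]["env"] by from_json
    "sync_config": {"upload_dir": "s3://bucket/exp"},  # converted to SyncConfig
}
exp = Experiment.from_json("json_experiment", spec)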
Example #3
def test_syncer_callback_noop_on_trial_cloud_checkpointing():
    """Check that trial using cloud checkpointing disables sync to driver"""
    callbacks = create_default_callbacks(callbacks=[],
                                         sync_config=SyncConfig())
    syncer_callback = None
    for cb in callbacks:
        if isinstance(cb, SyncerCallback):
            syncer_callback = cb

    trial1 = MockTrial(trial_id="a", logdir=None)
    trial1.uses_cloud_checkpointing = True

    assert syncer_callback
    assert syncer_callback._enabled
    # Cloud checkpointing set, so no-op
    assert not syncer_callback._sync_trial_dir(trial1)

    # This should not raise any error for a non-existent directory
    syncer_callback.on_checkpoint(
        iteration=1,
        trials=[],
        trial=trial1,
        checkpoint=_TrackedCheckpoint(
            dir_or_data="/does/not/exist",
            storage_mode=CheckpointStorage.PERSISTENT),
    )
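Both syncer-callback tests rely on a MockTrial helper that is not shown here. A minimal sketch of what such a stub could look like, inferred purely from how it is used above (the attribute names are assumptions, and the real helper may carry more trial state):

class MockTrial:
    """Bare-bones trial stub for exercising SyncerCallback in isolation."""

    def __init__(self, trial_id: str, logdir: str = None):
        self.trial_id = trial_id
        self.logdir = logdir
        # Toggled by the tests to simulate cloud vs. driver-based checkpointing.
        self.uses_cloud_checkpointing = False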
Example #4
def mock_storage_client(path):
    """Mocks storage client that treats a local dir as durable storage."""
    os.makedirs(path, exist_ok=True)
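    # `_ray_114` is assumed to be a module-level flag for Ray >= 1.14,
    # where the node-to-storage syncer API replaced the sync client templates.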
    if _ray_114:
        syncer = get_node_to_storage_syncer(SyncConfig(upload_dir=path))
    else:
        syncer = get_sync_client(LOCAL_SYNC_TEMPLATE, LOCAL_DELETE_TEMPLATE)
    return syncer
Example #5
File: config.py  Project: parasj/ray
    def __post_init__(self):
        if not self.failure_config:
            self.failure_config = FailureConfig()

        if not self.sync_config:
            self.sync_config = SyncConfig()

        if not self.checkpoint_config:
            self.checkpoint_config = CheckpointConfig()
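The __post_init__ above (from config.py, presumably a RunConfig-style dataclass) only fills in default sub-configs when none were passed. A hedged usage sketch under that assumption:

# Assumes the surrounding dataclass is ray.air.RunConfig or similar.
run_config = RunConfig()
assert isinstance(run_config.failure_config, FailureConfig)
assert isinstance(run_config.sync_config, SyncConfig)
assert isinstance(run_config.checkpoint_config, CheckpointConfig)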
Example #6
def test_syncer_callback_op_on_no_cloud_checkpointing():
    """Check that without cloud checkpointing sync to driver is enabled"""
    callbacks = create_default_callbacks(callbacks=[],
                                         sync_config=SyncConfig())
    syncer_callback = None
    for cb in callbacks:
        if isinstance(cb, SyncerCallback):
            syncer_callback = cb

    trial1 = MockTrial(trial_id="a", logdir=None)
    trial1.uses_cloud_checkpointing = False

    assert syncer_callback
    assert syncer_callback._enabled
    assert syncer_callback._sync_trial_dir(trial1)
Example #7
    def testCheckpointAutoPeriod(self):
        ray.init(num_cpus=3)

        # This makes checkpointing take 2 seconds.
        def sync_up(source, target):
            time.sleep(2)
            return True

        runner = TrialRunner(
            local_checkpoint_dir=self.tmpdir,
            checkpoint_period="auto",
            sync_config=SyncConfig(upload_dir="fake", syncer=sync_up),
            remote_checkpoint_dir="fake")
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1}))

        runner.step()  # Run one step, this will trigger checkpointing

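        # With a 2 s sync_up and an assumed ~5% checkpointing-overhead target,
        # the auto period should come out to at least 19 * 2 = 38 s.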
        self.assertGreaterEqual(runner._checkpoint_manager._checkpoint_period,
                                38.)
Example #8
    def testCheckpointAutoPeriod(self):
        ray.init(num_cpus=3)

        # This makes checkpointing take 2 seconds.

        class CustomSyncer(Syncer):
            def __init__(self, sync_period: float = 300.0):
                super(CustomSyncer, self).__init__(sync_period=sync_period)
                self._sync_status = {}

            def sync_up(self,
                        local_dir: str,
                        remote_dir: str,
                        exclude: list = None) -> bool:
                time.sleep(2)
                return True

            def sync_down(self,
                          remote_dir: str,
                          local_dir: str,
                          exclude: list = None) -> bool:
                time.sleep(2)
                return True

            def delete(self, remote_dir: str) -> bool:
                pass

        runner = TrialRunner(
            local_checkpoint_dir=self.tmpdir,
            checkpoint_period="auto",
            sync_config=SyncConfig(upload_dir="fake",
                                   syncer=CustomSyncer(),
                                   sync_period=0),
            remote_checkpoint_dir="fake",
        )
        runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1}))

        runner.step()  # Run one step, this will trigger checkpointing

        self.assertGreaterEqual(runner._checkpoint_manager._checkpoint_period,
                                38.0)
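A hedged sketch of using such a custom syncer outside the test, passed through SyncConfig exactly as in the TrialRunner call above (the trainable and the bucket path are placeholder assumptions):

tune.run(
    my_trainable,  # assumed registered trainable
    sync_config=SyncConfig(
        upload_dir="s3://my-bucket/tune-results",
        syncer=CustomSyncer(sync_period=60),
    ),
)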
Example #9
def run(
        run_or_experiment: Union[str, Callable, Type],
        name: Optional[str] = None,
        metric: Optional[str] = None,
        mode: Optional[str] = None,
        stop: Union[None, Mapping, Stopper, Callable[[str, Mapping],
                                                     bool]] = None,
        time_budget_s: Union[None, int, float, datetime.timedelta] = None,
        config: Optional[Dict[str, Any]] = None,
        resources_per_trial: Union[None, Mapping[str, Union[
            float, int, Mapping]], PlacementGroupFactory] = None,
        num_samples: int = 1,
        local_dir: Optional[str] = None,
        search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
        scheduler: Optional[Union[TrialScheduler, str]] = None,
        keep_checkpoints_num: Optional[int] = None,
        checkpoint_score_attr: Optional[str] = None,
        checkpoint_freq: int = 0,
        checkpoint_at_end: bool = False,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        log_to_file: bool = False,
        trial_name_creator: Optional[Callable[[Trial], str]] = None,
        trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
        sync_config: Optional[SyncConfig] = None,
        export_formats: Optional[Sequence] = None,
        max_failures: int = 0,
        fail_fast: bool = False,
        restore: Optional[str] = None,
        server_port: Optional[int] = None,
        resume: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        callbacks: Optional[Sequence[Callback]] = None,
        max_concurrent_trials: Optional[int] = None,
        # Deprecated args
        queue_trials: Optional[bool] = None,
        loggers: Optional[Sequence[Type[Logger]]] = None,
        _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory can be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLlib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm|str): Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler (TrialScheduler|str): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that are exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' is provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. REMOTE restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there are failed
            trials (in ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials (int): Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`, and thus setting this argument
            will raise an exception if the ``search_alg`` is already a
            :class:`ConcurrencyLimiter`. Defaults to None.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """

    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is ran to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the server node.
        remote_run = force_on_current_node(remote_run)

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # callables, which will then be executed on the driver side.
        if isinstance(progress_reporter, JupyterNotebookReporter):
            execute_queue = Queue(actor_options={
                "num_cpus": 0,
                **force_on_current_node(None)
            })
            progress_reporter.set_output_queue(execute_queue)

            def get_next_queue_item():
                try:
                    return execute_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_execute_queue():
            execute_item = get_next_queue_item()
            while execute_item:
                if isinstance(execute_item, Callable):
                    execute_item()

                execute_item = get_next_queue_item()

        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_execute_queue()

        # Handle queue one last time
        _handle_execute_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if loggers:
        # Raise DeprecationWarning in 1.9, remove in 1.10/1.11
        warnings.warn(
            "The `loggers` argument is deprecated. Please pass the respective "
            "`LoggerCallback` classes to the `callbacks` argument instead. "
            "See https://docs.ray.io/en/latest/tune/api_docs/logging.html")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler
        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead.")
        result_buffer_length = 1

    if isinstance(scheduler,
                  (PopulationBasedTraining,
                   PopulationBasedTrainingReplay)) and not reuse_actors:
        warnings.warn(
            "Consider boosting PBT performance by enabling `reuse_actors` as "
            "well as implementing `reset_config` for Trainable.")

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        # importing at top level causes a recursive dependency
        from ray.tune.suggest import create_searcher
        search_alg = create_searcher(search_alg)

    # if local_mode=True is set during ray.init().
    is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(
            max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials:
        if isinstance(search_alg, ConcurrencyLimiter):
            if search_alg.max_concurrent != max_concurrent_trials:
                raise ValueError(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter` with `max_concurrent="
                    f"{search_alg.max_concurrent}. FIX THIS by setting "
                    "`max_concurrent_trials=None`.")
            else:
                logger.warning(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter`. `max_concurrent_trials` "
                    "will be ignored.")
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater or equal than 1, "
                    f"got {max_concurrent_trials}.")
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials)
            elif not is_local_mode:
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance`. `max_concurrent_trials` "
                    "will be ignored.")

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not set_search_properties_backwards_compatible(
            search_alg.set_search_properties, metric, mode, config, **
            experiments[0].public_spec):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir))

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment but "
                    "updating trial resources.")
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    progress_reporter = progress_reporter or detect_reporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # Calls setup on callbacks
    runner.setup_experiments(
        experiments=experiments, total_num_samples=search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    progress_reporter.set_start_time(tune_start)
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode,
        sync_config=sync_config)
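The implementation above forwards sync_config to the TrialRunner and disables driver-side checkpoint syncing whenever an upload_dir is set. A hedged end-to-end usage sketch (the bucket path and the toy objective are placeholder assumptions):

from ray import tune
from ray.tune import SyncConfig


def trainable(config):
    # Toy objective that reports one result per iteration.
    for step in range(10):
        tune.report(loss=config["lr"] * step)


analysis = tune.run(
    trainable,
    config={"lr": tune.uniform(0.001, 0.1)},
    num_samples=4,
    metric="loss",
    mode="min",
    sync_config=SyncConfig(upload_dir="s3://my-bucket/tune-results"),
)
print(analysis.best_config)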
Example #10
File: tune.py  Project: ipark-CS/ray
def run(
    run_or_experiment,
    name=None,
    metric=None,
    mode=None,
    stop=None,
    time_budget_s=None,
    config=None,
    resources_per_trial=None,
    num_samples=1,
    local_dir=None,
    search_alg=None,
    scheduler=None,
    keep_checkpoints_num=None,
    checkpoint_score_attr=None,
    checkpoint_freq=0,
    checkpoint_at_end=False,
    verbose=2,
    progress_reporter=None,
    loggers=None,
    log_to_file=False,
    trial_name_creator=None,
    trial_dirname_creator=None,
    sync_config=None,
    export_formats=None,
    max_failures=0,
    fail_fast=False,
    restore=None,
    server_port=None,
    resume=False,
    reuse_actors=False,
    trial_executor=None,
    raise_on_failed_trial=True,
    # Deprecated args
    ray_auto_init=None,
    run_errored_only=None,
    queue_trials=None,
    global_checkpoint_period=None,
    with_server=None,
    upload_dir=None,
    sync_to_cloud=None,
    sync_to_driver=None,
    sync_on_checkpoint=None,
):
    """Executes training.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLlib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher): Search algorithm for optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that are exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' is provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local_checkpoint_dir, determined
            by `name` and `local_dir`. REMOTE restores the checkpoint
            from remote_checkpoint_dir. PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there are failed
            trials (in ERROR state) when the experiments complete.


    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """
    if global_checkpoint_period:
        raise ValueError("global_checkpoint_period is deprecated. Set env var "
                         "'TUNE_GLOBAL_CHECKPOINT_S' instead.")
    if queue_trials:
        raise ValueError(
            "queue_trials is deprecated. "
            "Set env var 'TUNE_DISABLE_QUEUE_TRIALS=1' instead to "
            "disable queuing behavior.")
    if ray_auto_init:
        raise ValueError("ray_auto_init is deprecated. "
                         "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
                         "call 'ray.init' before calling 'tune.run'.")
    if with_server:
        raise ValueError(
            "with_server is deprecated. It is now enabled by default "
            "if 'server_port' is not None.")
    if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir:
        raise ValueError(
            "sync_on_checkpoint / sync_to_cloud / sync_to_driver / "
            "upload_dir must now be set via `tune.run("
            "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for "
            "more details.")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=sync_config.upload_dir,
                sync_to_driver=sync_config.sync_to_driver,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                loggers=loggers,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_config.sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_config.sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if issubclass(type(search_alg), Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    if config and not search_alg.set_search_properties(metric, mode, config):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    scheduler = scheduler or FIFOScheduler()
    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_config.sync_to_cloud,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        verbose=bool(verbose > 1),
        fail_fast=fail_fast,
        trial_executor=trial_executor)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(overwrite=verbose < 2)
        else:
            progress_reporter = CLIReporter()

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overriden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    while not runner.is_finished():
        runner.step()
        if verbose:
            _report_progress(runner, progress_reporter)

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if verbose:
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup_trials()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    trials = runner.get_trials()
    return ExperimentAnalysis(runner.checkpoint_file,
                              trials=trials,
                              default_metric=metric,
                              default_mode=mode)
Example #11
def run(
        run_or_experiment: Union[str, Callable, Type],
        name: Optional[str] = None,
        metric: Optional[str] = None,
        mode: Optional[str] = None,
        stop: Union[None, Mapping, Stopper, Callable[[str, Mapping],
                                                     bool]] = None,
        time_budget_s: Union[None, int, float, datetime.timedelta] = None,
        config: Optional[Dict[str, Any]] = None,
        resources_per_trial: Union[None, Mapping[str, Union[
            float, int, Mapping]], PlacementGroupFactory] = None,
        num_samples: int = 1,
        local_dir: Optional[str] = None,
        search_alg: Optional[Union[Searcher, SearchAlgorithm]] = None,
        scheduler: Optional[TrialScheduler] = None,
        keep_checkpoints_num: Optional[int] = None,
        checkpoint_score_attr: Optional[str] = None,
        checkpoint_freq: int = 0,
        checkpoint_at_end: bool = False,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        log_to_file: bool = False,
        trial_name_creator: Optional[Callable[[Trial], str]] = None,
        trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
        sync_config: Optional[SyncConfig] = None,
        export_formats: Optional[Sequence] = None,
        max_failures: int = 0,
        fail_fast: bool = False,
        restore: Optional[str] = None,
        server_port: Optional[int] = None,
        resume: bool = False,
        queue_trials: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        callbacks: Optional[Sequence[Callback]] = None,
        # Deprecated args
        loggers: Optional[Sequence[Type[Logger]]] = None,
        ray_auto_init: Optional = None,
        run_errored_only: Optional = None,
        global_checkpoint_period: Optional = None,
        with_server: Optional = None,
        upload_dir: Optional = None,
        sync_to_cloud: Optional = None,
        sync_to_driver: Optional = None,
        sync_on_checkpoint: Optional = None,
        _remote: bool = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLlib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm): Search algorithm for
            optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, you need
            to also provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Defaults to increasing order. If the attribute
            starts with `min-`, it is ranked in decreasing order, e.g.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that are exported at the end
            of the experiment. Defaults to None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' is provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local_checkpoint_dir, determined
            by `name` and `local_dir`. REMOTE restores the checkpoint
            from remote_checkpoint_dir. PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched. If resume is set but the checkpoint does not
            exist, a ValueError will be raised.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there are failed
            trials (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: If any trials failed and `raise_on_failed_trial` is True.
    """

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        return ray.get(
            ray.remote(num_cpus=0)(run).remote(
                run_or_experiment,
                name,
                metric,
                mode,
                stop,
                time_budget_s,
                config,
                resources_per_trial,
                num_samples,
                local_dir,
                search_alg,
                scheduler,
                keep_checkpoints_num,
                checkpoint_score_attr,
                checkpoint_freq,
                checkpoint_at_end,
                verbose,
                progress_reporter,
                log_to_file,
                trial_name_creator,
                trial_dirname_creator,
                sync_config,
                export_formats,
                max_failures,
                fail_fast,
                restore,
                server_port,
                resume,
                queue_trials,
                reuse_actors,
                trial_executor,
                raise_on_failed_trial,
                callbacks,
                # Deprecated args
                loggers,
                ray_auto_init,
                run_errored_only,
                global_checkpoint_period,
                with_server,
                upload_dir,
                sync_to_cloud,
                sync_to_driver,
                sync_on_checkpoint,
                _remote=False))

    all_start = time.time()
    if global_checkpoint_period:
        raise ValueError("global_checkpoint_period is deprecated. Set env var "
                         "'TUNE_GLOBAL_CHECKPOINT_S' instead.")
    if ray_auto_init:
        raise ValueError("ray_auto_init is deprecated. "
                         "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
                         "call 'ray.init' before calling 'tune.run'.")
    if with_server:
        raise ValueError(
            "with_server is deprecated. It is now enabled by default "
            "if 'server_port' is not None.")
    if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir:
        raise ValueError(
            "sync_on_checkpoint / sync_to_cloud / sync_to_driver / "
            "upload_dir must now be set via `tune.run("
            "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for "
            "more details.")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, queue_trials=queue_trials)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=sync_config.upload_dir,
                sync_to_driver=sync_config.sync_to_driver,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_config.sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
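    # Note: the `else` below belongs to the `for` loop above and always runs
    # once the loop completes, since the loop body contains no `break`.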
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_config.sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if issubclass(type(search_alg), Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    if config and not search_alg.set_search_properties(metric, mode, config):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    scheduler = scheduler or FIFOScheduler()
    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create default callbacks (loggers and syncer)
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_config.sync_to_cloud,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(
                overwrite=not has_verbosity(Verbosity.V2_TRIAL_NORM))
        else:
            progress_reporter = CLIReporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup_trials()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode)
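A minimal usage sketch for the `run()` entry point above, assuming a placeholder trainable and bucket URI; it replaces the deprecated sync arguments with a `SyncConfig`:

from ray import tune

def my_trainable(config):
    tune.report(score=config["lr"])

analysis = tune.run(
    my_trainable,
    metric="score",
    mode="max",
    config={"lr": tune.uniform(0, 1)},
    num_samples=4,
    sync_config=tune.SyncConfig(
        upload_dir="s3://my-bucket/tune-results",  # replaces the removed `upload_dir` arg
        sync_to_driver=False,  # replaces the removed `sync_to_driver` arg
    ),
)
print(analysis.best_config)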
Example #12
    def __init__(
        self,
        name,
        run,
        stop=None,
        time_budget_s=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        _experiment_checkpoint_dir: Optional[str] = None,
        sync_config=None,
        trial_name_creator=None,
        trial_dirname_creator=None,
        log_to_file=False,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        keep_checkpoints_num=None,
        checkpoint_score_attr=None,
        export_formats=None,
        max_failures=0,
        restore=None,
    ):

        local_dir = _get_local_dir_with_expand_user(local_dir)
        # `_experiment_checkpoint_dir` is for internal use only for better
        # support of Tuner API.
        # If set, it should be a subpath under `local_dir`. Also deduce `dir_name`.
        self._experiment_checkpoint_dir = _experiment_checkpoint_dir
        if _experiment_checkpoint_dir:
            experiment_checkpoint_dir_path = Path(_experiment_checkpoint_dir)
            local_dir_path = Path(local_dir)
            assert local_dir_path in experiment_checkpoint_dir_path.parents
            # `dir_name` is set by `_experiment_checkpoint_dir` indirectly.
            self.dir_name = os.path.relpath(_experiment_checkpoint_dir, local_dir)

        config = config or {}
        sync_config = sync_config or SyncConfig()
        if (
            callable(run)
            and not inspect.isclass(run)
            and detect_checkpoint_function(run)
        ):
            if checkpoint_at_end:
                raise ValueError(
                    "'checkpoint_at_end' cannot be used with a "
                    "checkpointable function. You can specify "
                    "and register checkpoints within "
                    "your trainable function."
                )
            if checkpoint_freq:
                raise ValueError(
                    "'checkpoint_freq' cannot be used with a "
                    "checkpointable function. You can specify checkpoints "
                    "within your trainable function."
                )
        try:
            self._run_identifier = Experiment.register_if_needed(run)
        except grpc.RpcError as e:
            if e.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
                raise TuneError(
                    f"The Trainable/training function is too large for grpc resource "
                    f"limit. Check that its definition is not implicitly capturing a "
                    f"large array or other object in scope. "
                    f"Tip: use tune.with_parameters() to put large objects "
                    f"in the Ray object store. \n"
                    f"Original exception: {traceback.format_exc()}"
                )
            else:
                raise e

        self.name = name or self._run_identifier

        if not _experiment_checkpoint_dir:
            self.dir_name = _get_dir_name(run, name, self.name)

        assert self.dir_name

        if sync_config.upload_dir:
            self.remote_checkpoint_dir = os.path.join(
                sync_config.upload_dir, self.dir_name
            )
        else:
            self.remote_checkpoint_dir = None

        self._stopper = None
        stopping_criteria = {}
        if not stop:
            pass
        elif isinstance(stop, list):
            bad_stoppers = [s for s in stop if not isinstance(s, Stopper)]
            if bad_stoppers:
                stopper_types = [type(s) for s in stop]
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    f"`tune.stopper.Stopper`. Got {stopper_types}."
                )
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif isinstance(stop, Stopper):
                self._stopper = stop
            else:
                raise ValueError(
                    "Provided stop object must be either a dict, "
                    "a function, or a subclass of "
                    f"`ray.tune.Stopper`. Got {type(stop)}."
                )
        else:
            raise ValueError(
                f"Invalid stop criteria: {stop}. Must be a "
                f"callable or dict. Got {type(stop)}."
            )

        if time_budget_s:
            if self._stopper:
                self._stopper = CombinedStopper(
                    self._stopper, TimeoutStopper(time_budget_s)
                )
            else:
                self._stopper = TimeoutStopper(time_budget_s)

        stdout_file, stderr_file = _validate_log_to_file(log_to_file)

        spec = {
            "run": self._run_identifier,
            "stop": stopping_criteria,
            "time_budget_s": time_budget_s,
            "config": config,
            "resources_per_trial": resources_per_trial,
            "num_samples": num_samples,
            "local_dir": local_dir,
            "sync_config": sync_config,
            "remote_checkpoint_dir": self.remote_checkpoint_dir,
            "trial_name_creator": trial_name_creator,
            "trial_dirname_creator": trial_dirname_creator,
            "log_to_file": (stdout_file, stderr_file),
            "checkpoint_freq": checkpoint_freq,
            "checkpoint_at_end": checkpoint_at_end,
            "keep_checkpoints_num": keep_checkpoints_num,
            "checkpoint_score_attr": checkpoint_score_attr,
            "export_formats": export_formats or [],
            "max_failures": max_failures,
            "restore": os.path.abspath(os.path.expanduser(restore))
            if restore
            else None,
        }
        self.spec = spec
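A short sketch (with a placeholder trainable identifier and bucket URI) of how this constructor derives the remote checkpoint directory from `SyncConfig.upload_dir`:

exp = Experiment(
    name="my_exp",
    run="my_registered_trainable",  # placeholder; must be a registered trainable
    sync_config=SyncConfig(upload_dir="s3://my-bucket"),
)
# With an upload_dir set, remote_checkpoint_dir is
# os.path.join("s3://my-bucket", exp.dir_name); without one it stays None.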
Example #13
def create_trial_from_spec(spec, output_path, parser, **trial_kwargs):
    """Creates a Trial object from parsing the spec.

    Args:
        spec (dict): A resolved experiment specification. The args here
            should correspond to the command line flags in
            ray.tune.config_parser.
        output_path (str): A specific output path within the local_dir.
            Typically the name of the experiment.
        parser (ArgumentParser): An argument parser object from
            make_parser.
        trial_kwargs: Extra keyword arguments used in instantiating the Trial.

    Returns:
        A trial object with corresponding parameters to the specification.
    """
    global _cached_pgf

    spec = spec.copy()
    resources = spec.pop("resources_per_trial", None)

    try:
        args, _ = parser.parse_known_args(to_argv(spec))
    except SystemExit:
        raise TuneError("Error parsing args, see above message", spec)

    if resources:
        if isinstance(resources, PlacementGroupFactory):
            trial_kwargs["placement_group_factory"] = resources
        else:
            # This will be converted to a placement group factory in the
            # Trial object constructor
            try:
                trial_kwargs["resources"] = json_to_resources(resources)
            except (TuneError, ValueError) as exc:
                raise TuneError("Error parsing resources_per_trial",
                                resources) from exc

    remote_checkpoint_dir = spec.get("remote_checkpoint_dir")

    sync_config = spec.get("sync_config", SyncConfig())
    if sync_config.syncer is None or isinstance(sync_config.syncer, str):
        sync_function_tpl = sync_config.syncer
    elif not isinstance(sync_config.syncer, str):
        # If a syncer was specified, but not a template, it is a function.
        # Functions cannot be used for trial checkpointing on remote nodes,
        # so we set the remote checkpoint dir to None to disable this.
        sync_function_tpl = None
        remote_checkpoint_dir = None
    else:
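        # Note: this branch is unreachable - a syncer that is neither None nor
        # a string is always handled by the `elif` above.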
        sync_function_tpl = None  # Auto-detect

    return Trial(
        # Submitting trial via server in py2.7 creates Unicode, which does not
        # convert to string in a straightforward manner.
        trainable_name=spec["run"],
        # json.load leads to str -> unicode in py2.7
        config=spec.get("config", {}),
        local_dir=os.path.join(spec["local_dir"], output_path),
        # json.load leads to str -> unicode in py2.7
        stopping_criterion=spec.get("stop", {}),
        remote_checkpoint_dir=remote_checkpoint_dir,
        sync_function_tpl=sync_function_tpl,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_at_end=args.checkpoint_at_end,
        sync_on_checkpoint=sync_config.sync_on_checkpoint,
        keep_checkpoints_num=args.keep_checkpoints_num,
        checkpoint_score_attr=args.checkpoint_score_attr,
        export_formats=spec.get("export_formats", []),
        # str(None) doesn't create None
        restore_path=spec.get("restore"),
        trial_name_creator=spec.get("trial_name_creator"),
        trial_dirname_creator=spec.get("trial_dirname_creator"),
        log_to_file=spec.get("log_to_file"),
        # str(None) doesn't create None
        max_failures=args.max_failures,
        **trial_kwargs)
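A brief sketch of the two `SyncConfig.syncer` forms handled above (values are placeholders):

sync_config = SyncConfig(syncer="aws s3 sync {source} {target}")
# -> string template: passed through as sync_function_tpl; remote trial
#    checkpointing stays enabled.

sync_config = SyncConfig(syncer=lambda source, target: None)
# -> callable: sync_function_tpl and remote_checkpoint_dir are set to None,
#    disabling trial checkpoint syncing to remote nodes.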
Example #14
    def __init__(
        self,
        name,
        run,
        stop=None,
        time_budget_s=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        sync_config=None,
        trial_name_creator=None,
        trial_dirname_creator=None,
        log_to_file=False,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        keep_checkpoints_num=None,
        checkpoint_score_attr=None,
        export_formats=None,
        max_failures=0,
        restore=None,
    ):

        config = config or {}
        sync_config = sync_config or SyncConfig()
        if (callable(run) and not inspect.isclass(run)
                and detect_checkpoint_function(run)):
            if checkpoint_at_end:
                raise ValueError("'checkpoint_at_end' cannot be used with a "
                                 "checkpointable function. You can specify "
                                 "and register checkpoints within "
                                 "your trainable function.")
            if checkpoint_freq:
                raise ValueError(
                    "'checkpoint_freq' cannot be used with a "
                    "checkpointable function. You can specify checkpoints "
                    "within your trainable function.")
        self._run_identifier = Experiment.register_if_needed(run)
        self.name = name or self._run_identifier

        # If the name has been set explicitly, we don't want to create
        # dated directories. The same is true for string run identifiers.
        if (int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name
                or isinstance(run, str)):
            self.dir_name = self.name
        else:
            self.dir_name = "{}_{}".format(self.name, date_str())

        if sync_config.upload_dir:
            self.remote_checkpoint_dir = os.path.join(sync_config.upload_dir,
                                                      self.dir_name)
        else:
            self.remote_checkpoint_dir = None

        self._stopper = None
        stopping_criteria = {}
        if not stop:
            pass
        elif isinstance(stop, list):
            bad_stoppers = [s for s in stop if not isinstance(s, Stopper)]
            if bad_stoppers:
                stopper_types = [type(s) for s in stop]
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    f"`tune.stopper.Stopper`. Got {stopper_types}.")
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif isinstance(stop, Stopper):
                self._stopper = stop
            else:
                raise ValueError("Provided stop object must be either a dict, "
                                 "a function, or a subclass of "
                                 f"`ray.tune.Stopper`. Got {type(stop)}.")
        else:
            raise ValueError(f"Invalid stop criteria: {stop}. Must be a "
                             f"callable or dict. Got {type(stop)}.")

        if time_budget_s:
            if self._stopper:
                self._stopper = CombinedStopper(self._stopper,
                                                TimeoutStopper(time_budget_s))
            else:
                self._stopper = TimeoutStopper(time_budget_s)

        stdout_file, stderr_file = _validate_log_to_file(log_to_file)

        spec = {
            "run": self._run_identifier,
            "stop": stopping_criteria,
            "config": config,
            "resources_per_trial": resources_per_trial,
            "num_samples": num_samples,
            "local_dir": os.path.abspath(
                os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
            "sync_config": sync_config,
            "remote_checkpoint_dir": self.remote_checkpoint_dir,
            "trial_name_creator": trial_name_creator,
            "trial_dirname_creator": trial_dirname_creator,
            "log_to_file": (stdout_file, stderr_file),
            "checkpoint_freq": checkpoint_freq,
            "checkpoint_at_end": checkpoint_at_end,
            "keep_checkpoints_num": keep_checkpoints_num,
            "checkpoint_score_attr": checkpoint_score_attr,
            "export_formats": export_formats or [],
            "max_failures": max_failures,
            "restore": os.path.abspath(os.path.expanduser(restore))
            if restore else None,
        }
        self.spec = spec
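A short sketch of the directory-naming rule in this constructor (the trainable is a placeholder):

import os

def my_trainable(config):  # placeholder trainable function
    pass

# Without an explicit name, a dated suffix is normally appended to the
# directory name; setting TUNE_DISABLE_DATED_SUBDIR=1 keeps it plain.
os.environ["TUNE_DISABLE_DATED_SUBDIR"] = "1"
exp = Experiment(name=None, run=my_trainable)
# exp.dir_name == exp.name == the registered identifier of my_trainable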
Example #15
def create_trial_from_spec(spec: dict, output_path: str,
                           parser: argparse.ArgumentParser, **trial_kwargs):
    """Creates a Trial object from parsing the spec.

    Args:
        spec: A resolved experiment specification. The args here should
            correspond to the command line flags in
            ray.tune.experiment.config_parser.
        output_path: A specific output path within the local_dir.
            Typically the name of the experiment.
        parser: An argument parser object from
            make_parser.
        trial_kwargs: Extra keyword arguments used in instantiating the Trial.

    Returns:
        A trial object with corresponding parameters to the specification.
    """
    global _cached_pgf

    spec = spec.copy()
    resources = spec.pop("resources_per_trial", None)

    try:
        args, _ = parser.parse_known_args(to_argv(spec))
    except SystemExit:
        raise TuneError("Error parsing args, see above message", spec)

    if resources:
        if isinstance(resources, PlacementGroupFactory):
            trial_kwargs["placement_group_factory"] = resources
        else:
            # This will be converted to a placement group factory in the
            # Trial object constructor
            try:
                trial_kwargs["resources"] = json_to_resources(resources)
            except (TuneError, ValueError) as exc:
                raise TuneError("Error parsing resources_per_trial",
                                resources) from exc

    remote_checkpoint_dir = spec.get("remote_checkpoint_dir")

    sync_config = spec.get("sync_config", SyncConfig())
    if (sync_config.syncer is None or sync_config.syncer == "auto"
            or isinstance(sync_config.syncer, Syncer)):
        custom_syncer = sync_config.syncer
    else:
        raise ValueError(
            f"Unknown syncer type passed in SyncConfig: {type(sync_config.syncer)}. "
            f"Note that custom sync functions and templates have been deprecated. "
            f"Instead you can implement your own `Syncer` class. "
            f"Please leave a comment on GitHub if you run into any issues with this: "
            f"https://github.com/ray-project/ray/issues")

    return Trial(
        # Submitting trial via server in py2.7 creates Unicode, which does not
        # convert to string in a straightforward manner.
        trainable_name=spec["run"],
        # json.load leads to str -> unicode in py2.7
        config=spec.get("config", {}),
        local_dir=os.path.join(spec["local_dir"], output_path),
        # json.load leads to str -> unicode in py2.7
        stopping_criterion=spec.get("stop", {}),
        remote_checkpoint_dir=remote_checkpoint_dir,
        custom_syncer=custom_syncer,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_at_end=args.checkpoint_at_end,
        sync_on_checkpoint=sync_config.sync_on_checkpoint,
        keep_checkpoints_num=args.keep_checkpoints_num,
        checkpoint_score_attr=args.checkpoint_score_attr,
        export_formats=spec.get("export_formats", []),
        # str(None) doesn't create None
        restore_path=spec.get("restore"),
        trial_name_creator=spec.get("trial_name_creator"),
        trial_dirname_creator=spec.get("trial_dirname_creator"),
        log_to_file=spec.get("log_to_file"),
        # str(None) doesn't create None
        max_failures=args.max_failures,
        **trial_kwargs,
    )
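A brief sketch of the `SyncConfig.syncer` values accepted by the validation above:

SyncConfig(syncer=None)    # no syncer configured
SyncConfig(syncer="auto")  # auto-detect an appropriate syncer
# Any `Syncer` instance is also accepted; any other string (e.g. an old
# command template) raises the ValueError shown above.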
Example #16
    def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        resume=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        hyperopt_log_verbosity=3,
        features_eligible_for_shared_params=None,
        **kwargs,
    ) -> RayTuneResults:
        if isinstance(dataset, str) and not has_remote_protocol(
                dataset) and not os.path.isabs(dataset):
            dataset = os.path.abspath(dataset)

        if isinstance(backend, str):
            backend = initialize_backend(backend)

        if gpus is not None:
            raise ValueError(
                "Parameter `gpus` is not supported when using Ray Tune. "
                "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
                "hyperopt config.")

        if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
            # Enforce fractional GPU utilization
            gpu_memory_limit = self.gpu_resources_per_trial

        hyperopt_dict = dict(
            config=config,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            experiment_name=experiment_name,
            model_name=model_name,
            eval_split=self.split,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            skip_save_predictions=skip_save_predictions,
            skip_save_eval_stats=skip_save_eval_stats,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
            backend=backend,
            random_seed=random_seed,
            debug=debug,
        )

        mode = "min" if self.goal != MAXIMIZE else "max"
        metric = "metric_score"
        # if random seed not set, use Ludwig seed
        self.search_algorithm.check_for_random_seed(random_seed)
        if self.search_algorithm.search_alg_dict is not None:
            if TYPE not in self.search_algorithm.search_alg_dict:
                candidate_search_algs = list(SEARCH_ALG_IMPORT.keys())
                logger.warning(
                    "WARNING: search_alg type parameter missing, using 'variant_generator' as default. "
                    f"These are possible values for the type parameter: {candidate_search_algs}."
                )
                search_alg = None
            else:
                search_alg_type = self.search_algorithm.search_alg_dict[TYPE]
                search_alg = tune.create_searcher(
                    search_alg_type,
                    metric=metric,
                    mode=mode,
                    **self.search_algorithm.search_alg_dict)
        else:
            search_alg = None

        if self.max_concurrent_trials:
            assert (
                self.max_concurrent_trials > 0
            ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
            if isinstance(search_alg,
                          BasicVariantGenerator) or search_alg is None:
                search_alg = BasicVariantGenerator(
                    max_concurrent=self.max_concurrent_trials)
            elif isinstance(search_alg, ConcurrencyLimiter):
                raise ValueError(
                    "You have specified `max_concurrent_trials`, but the search "
                    "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                    "by setting `max_concurrent_trials=None`.")
            else:
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=self.max_concurrent_trials)

        resources_per_trial = {
            "cpu": self._cpu_resources_per_trial_non_none,
            "gpu": self._gpu_resources_per_trial_non_none,
        }

        def run_experiment_trial(config,
                                 local_hyperopt_dict,
                                 checkpoint_dir=None):
            return self._run_experiment(
                config,
                checkpoint_dir,
                local_hyperopt_dict,
                self.decode_ctx,
                features_eligible_for_shared_params,
                _is_ray_backend(backend),
            )

        tune_config = {}
        tune_callbacks = []
        for callback in callbacks or []:
            run_experiment_trial, tune_config = callback.prepare_ray_tune(
                run_experiment_trial,
                tune_config,
                tune_callbacks,
            )

        if _is_ray_backend(backend):
            # for now, we do not do distributed training on cpu (until spread scheduling is implemented for Ray Train)
            # but we do want to enable it when GPUs are specified
            if self._gpu_resources_per_trial_non_none:
                resources_per_trial = PlacementGroupFactory(
                    [{}] + [{"CPU": 0, "GPU": 1}]
                    * self._gpu_resources_per_trial_non_none)
            else:
                resources_per_trial = PlacementGroupFactory(
                    [{}] + [{"CPU": self._cpu_resources_per_trial_non_none}])

        if has_remote_protocol(output_directory):
            run_experiment_trial = tune.durable(run_experiment_trial)
            self.sync_config = tune.SyncConfig(sync_to_driver=False,
                                               upload_dir=output_directory)
            if _ray_114:
                self.sync_client = get_node_to_storage_syncer(
                    SyncConfig(upload_dir=output_directory))
            else:
                self.sync_client = get_cloud_sync_client(output_directory)
            output_directory = None
        elif self.kubernetes_namespace:
            from ray.tune.integration.kubernetes import KubernetesSyncClient, NamespacedKubernetesSyncer

            self.sync_config = tune.SyncConfig(
                sync_to_driver=NamespacedKubernetesSyncer(
                    self.kubernetes_namespace))
            self.sync_client = KubernetesSyncClient(self.kubernetes_namespace)

        run_experiment_trial_params = tune.with_parameters(
            run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
        register_trainable(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            run_experiment_trial_params)

        # Note that resume="AUTO" will attempt to resume the experiment if possible, and
        # otherwise will start a new experiment:
        # https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
        should_resume = "AUTO" if resume is None else resume

        try:
            analysis = tune.run(
                f"trainable_func_f{hash_dict(config).decode('ascii')}",
                name=experiment_name,
                config={
                    **self.search_space,
                    **tune_config,
                },
                scheduler=self.scheduler,
                search_alg=search_alg,
                num_samples=self.num_samples,
                keep_checkpoints_num=1,
                max_failures=1,  # retry a trial failure once
                resources_per_trial=resources_per_trial,
                time_budget_s=self.time_budget_s,
                sync_config=self.sync_config,
                local_dir=output_directory,
                metric=metric,
                mode=mode,
                trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
                trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
                callbacks=tune_callbacks,
                stop=CallbackStopper(callbacks),
                verbose=hyperopt_log_verbosity,
                resume=should_resume,
                log_to_file=True,
            )
        except Exception as e:
            # Explicitly raise a RuntimeError if an error is encountered during a Ray trial.
            # NOTE: Cascading the exception with "raise _ from e" still results in hanging.
            raise RuntimeError(f"Encountered Ray Tune error: {e}")

        if "metric_score" in analysis.results_df.columns:
            ordered_trials = analysis.results_df.sort_values(
                "metric_score", ascending=self.goal != MAXIMIZE)

            # Catch nans in edge case where the trial doesn't complete
            temp_ordered_trials = []
            for kwargs in ordered_trials.to_dict(orient="records"):
                for key in ["parameters", "training_stats", "eval_stats"]:
                    if isinstance(kwargs[key], float):
                        kwargs[key] = {}
                temp_ordered_trials.append(kwargs)

            # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
            # tune.report call(s) but were terminated before reporting eval_stats from post-train
            # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
            # For any such trials, run model evaluation for the best model in that trial & record
            # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
            for trial in temp_ordered_trials:
                if trial["eval_stats"] == "{}" and trial[
                        "training_stats"] != "{}":
                    # Evaluate the best model on the eval_split, which is validation_set
                    if validation_set is not None and validation_set.size > 0:
                        trial_path = trial["trial_dir"]
                        best_model_path = self._get_best_model_path(
                            trial_path, analysis)
                        if best_model_path is not None:
                            self._evaluate_best_model(
                                trial,
                                trial_path,
                                best_model_path,
                                validation_set,
                                data_format,
                                skip_save_unprocessed_output,
                                skip_save_predictions,
                                skip_save_eval_stats,
                                gpus,
                                gpu_memory_limit,
                                allow_parallel_threads,
                                backend,
                                debug,
                            )
                        else:
                            logger.warning(
                                "Skipping evaluation as no model checkpoints were available"
                            )
                    else:
                        logger.warning(
                            "Skipping evaluation as no validation set was provided"
                        )

            ordered_trials = [
                TrialResults.from_dict(load_json_values(kwargs))
                for kwargs in temp_ordered_trials
            ]
        else:
            logger.warning(
                "No trials reported results; check whether the time budget is "
                "lower than the epoch latency"
            )
            ordered_trials = []

        return RayTuneResults(ordered_trials=ordered_trials,
                              experiment_analysis=analysis)
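A rough usage sketch of this execute() method with a remote output directory, which exercises the cloud SyncConfig branch above (the executor instance, config, dataset path, and bucket URI are all placeholders):

results = executor.execute(
    config=hyperopt_config,                      # Ludwig config dict (placeholder)
    dataset="data.csv",                          # placeholder dataset path
    output_directory="s3://my-bucket/hyperopt",  # remote path -> SyncConfig(upload_dir=...)
    backend="ray",
)
print(len(results.ordered_trials))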