def testCallbackReordering(self):
    """SyncerCallback should come after ExperimentLogger callbacks.

    Exercises ``create_default_callbacks`` with several user-provided
    callback lists and checks the relative positions of logger callbacks,
    the syncer callback, and regular callbacks in the returned list.
    """

    def get_positions(callbacks):
        """Return (first logger index, last logger index, syncer index).

        Each entry is ``None`` if no matching callback was found. A callback
        that is an ``ExperimentLogger`` is never counted as a syncer.
        """
        indexed = list(enumerate(callbacks))
        logger_indices = [
            idx for idx, cb in indexed if isinstance(cb, ExperimentLogger)
        ]
        syncer_indices = [
            idx for idx, cb in indexed
            if not isinstance(cb, ExperimentLogger)
            and isinstance(cb, SyncerCallback)
        ]
        first_logger = logger_indices[0] if logger_indices else None
        last_logger = logger_indices[-1] if logger_indices else None
        syncer = syncer_indices[-1] if syncer_indices else None
        return first_logger, last_logger, syncer

    # Loggers are created automatically in all three cases:
    #   1. no callbacks, no syncer
    #   2. with a regular callback
    #   3. with an existing logger (but no CSV/JSON)
    for user_callbacks in (None, [Callback()], [ExperimentLogger()]):
        callbacks = create_default_callbacks(user_callbacks, SyncConfig(),
                                             None)
        _, last_logger_pos, syncer_pos = get_positions(callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

    # This should throw an error as the syncer comes before the logger
    with self.assertRaises(ValueError):
        create_default_callbacks(
            [SyncerCallback(None), ExperimentLogger()], SyncConfig(), None)

    # This should be reordered but preserve the regular callback order
    mc1, mc2, mc3 = Callback(), Callback(), Callback()
    # Has to be legacy logger to avoid logger callback creation
    lc = LegacyExperimentLogger(logger_classes=DEFAULT_LOGGERS)
    callbacks = create_default_callbacks([mc1, mc2, lc, mc3], SyncConfig(),
                                         None)
    print(callbacks)
    _, last_logger_pos, syncer_pos = get_positions(callbacks)
    self.assertLess(last_logger_pos, syncer_pos)

    # Regular callbacks and the legacy logger keep their relative order
    for earlier, later in ((mc1, mc2), (mc2, mc3), (lc, mc3)):
        self.assertLess(callbacks.index(earlier), callbacks.index(later))

    # Syncer callback is appended
    self.assertLess(callbacks.index(mc3), syncer_pos)
def from_json(cls, name, spec):
    """Generates an Experiment object from JSON.

    The passed ``spec`` is left untouched: all normalization (moving ``env``
    into ``config``, converting a ``sync_config`` dict) is applied to a deep
    copy.

    Args:
        name (str): Name of Experiment.
        spec (dict): JSON configuration of experiment. Must contain a
            ``run`` entry.

    Returns:
        The object returned by ``cls(name, run_value, **remaining_spec)``.

    Raises:
        TuneError: If ``spec`` has no ``run`` entry, or if constructing
            ``cls`` from the remaining entries raises a ``TypeError``.
    """
    if "run" not in spec:
        raise TuneError("No trainable specified!")

    # Copy first so that none of the rewrites below leak back into the
    # caller's dict.
    spec = copy.deepcopy(spec)

    # Special case the `env` param for RLlib by automatically
    # moving it into the `config` section.
    if "env" in spec:
        # `or {}` also covers an explicit `config: null` in the JSON.
        config = spec.get("config") or {}
        config["env"] = spec.pop("env")
        spec["config"] = config

    sync_config = spec.get("sync_config")
    if isinstance(sync_config, dict):
        spec["sync_config"] = SyncConfig(**sync_config)

    run_value = spec.pop("run")
    try:
        return cls(name, run_value, **spec)
    except TypeError as err:
        raise TuneError(
            "Improper argument from JSON: {}.".format(spec)) from err
def test_syncer_callback_noop_on_trial_cloud_checkpointing():
    """Check that trial using cloud checkpointing disables sync to driver.

    Also checks that ``on_checkpoint`` does not raise for a checkpoint
    directory that does not exist.
    """
    default_callbacks = create_default_callbacks(
        callbacks=[], sync_config=SyncConfig())

    # Pick the last SyncerCallback in the list (None if there is none).
    syncer_matches = [
        cb for cb in default_callbacks if isinstance(cb, SyncerCallback)
    ]
    syncer_callback = syncer_matches[-1] if syncer_matches else None

    trial1 = MockTrial(trial_id="a", logdir=None)
    trial1.uses_cloud_checkpointing = True

    assert syncer_callback
    assert syncer_callback._enabled
    # Cloud checkpointing set, so no-op
    assert not syncer_callback._sync_trial_dir(trial1)

    # This should not raise any error for not existing directory
    missing_checkpoint = _TrackedCheckpoint(
        dir_or_data="/does/not/exist",
        storage_mode=CheckpointStorage.PERSISTENT)
    syncer_callback.on_checkpoint(
        iteration=1,
        trials=[],
        trial=trial1,
        checkpoint=missing_checkpoint,
    )
def mock_storage_client(path):
    """Mocks storage client that treats a local dir as durable storage.

    Creates ``path`` if needed and returns a syncer object; which factory is
    used depends on the ``_ray_114`` flag.
    """
    os.makedirs(path, exist_ok=True)

    if not _ray_114:
        # Template-based sync client.
        return get_sync_client(LOCAL_SYNC_TEMPLATE, LOCAL_DELETE_TEMPLATE)

    # Syncer built from a SyncConfig that uploads to `path`.
    return get_node_to_storage_syncer(SyncConfig(upload_dir=path))
def __post_init__(self):
    """Replace any unset (falsy) sub-config with a default instance."""
    default_factories = (
        ("failure_config", FailureConfig),
        ("sync_config", SyncConfig),
        ("checkpoint_config", CheckpointConfig),
    )
    for attr_name, factory in default_factories:
        # Only assign when the current value is falsy, so user-provided
        # configs are kept as they are.
        if not getattr(self, attr_name):
            setattr(self, attr_name, factory())
def test_syncer_callback_op_on_no_cloud_checkpointing():
    """Check that without cloud checkpointing sync to driver is enabled."""
    default_callbacks = create_default_callbacks(
        callbacks=[], sync_config=SyncConfig())

    # Pick the last SyncerCallback in the list (None if there is none).
    syncer_matches = [
        cb for cb in default_callbacks if isinstance(cb, SyncerCallback)
    ]
    syncer_callback = syncer_matches[-1] if syncer_matches else None

    trial1 = MockTrial(trial_id="a", logdir=None)
    trial1.uses_cloud_checkpointing = False

    assert syncer_callback
    assert syncer_callback._enabled
    # No cloud checkpointing, so the trial dir sync reports a truthy result
    assert syncer_callback._sync_trial_dir(trial1)
def testCheckpointAutoPeriod(self):
    """Check the "auto" checkpoint period after one slow checkpoint.

    Uses a sync function that sleeps for two seconds and asserts that the
    checkpoint manager's period is at least 38 after a single runner step.
    """
    ray.init(num_cpus=3)

    # This makes checkpointing take 2 seconds.
    def sync_up(source, target):
        time.sleep(2)
        return True

    slow_sync_config = SyncConfig(upload_dir="fake", syncer=sync_up)
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period="auto",
        sync_config=slow_sync_config,
        remote_checkpoint_dir="fake")

    trial = Trial("__fake", config={"user_checkpoint_freq": 1})
    runner.add_trial(trial)

    # Run one step, this will trigger checkpointing
    runner.step()

    # NOTE(review): 38 is presumably derived from the 2 second sync time;
    # confirm against the checkpoint manager's "auto" period logic.
    observed_period = runner._checkpoint_manager._checkpoint_period
    self.assertGreaterEqual(observed_period, 38.0)
def testCheckpointAutoPeriod(self):
    """Check the "auto" checkpoint period after one slow checkpoint.

    Uses a custom ``Syncer`` whose transfers sleep for two seconds and
    asserts that the checkpoint manager's period is at least 38 after a
    single runner step.
    """
    ray.init(num_cpus=3)

    # This makes checkpointing take 2 seconds.
    class CustomSyncer(Syncer):
        """Syncer stub whose up/down syncs block for two seconds."""

        def __init__(self, sync_period: float = 300.0):
            super().__init__(sync_period=sync_period)
            self._sync_status = {}

        def sync_up(self,
                    local_dir: str,
                    remote_dir: str,
                    exclude: list = None) -> bool:
            time.sleep(2)
            return True

        def sync_down(self,
                      remote_dir: str,
                      local_dir: str,
                      exclude: list = None) -> bool:
            time.sleep(2)
            return True

        def delete(self, remote_dir: str) -> bool:
            # Intentionally a no-op.
            return None

    slow_sync_config = SyncConfig(
        upload_dir="fake", syncer=CustomSyncer(), sync_period=0)
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period="auto",
        sync_config=slow_sync_config,
        remote_checkpoint_dir="fake",
    )

    trial = Trial("__fake", config={"user_checkpoint_freq": 1})
    runner.add_trial(trial)

    # Run one step, this will trigger checkpointing
    runner.step()

    # NOTE(review): 38 is presumably derived from the 2 second sync time;
    # confirm against the checkpoint manager's "auto" period logic.
    observed_period = runner._checkpoint_manager._checkpoint_period
    self.assertGreaterEqual(observed_period, 38.0)
def run(
        run_or_experiment: Union[str, Callable, Type],
        name: Optional[str] = None,
        metric: Optional[str] = None,
        mode: Optional[str] = None,
        stop: Union[None, Mapping, Stopper, Callable[[str, Mapping],
                                                     bool]] = None,
        time_budget_s: Union[None, int, float, datetime.timedelta] = None,
        config: Optional[Dict[str, Any]] = None,
        resources_per_trial: Union[None, Mapping[str, Union[
            float, int, Mapping]], PlacementGroupFactory] = None,
        num_samples: int = 1,
        local_dir: Optional[str] = None,
        search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
        scheduler: Optional[Union[TrialScheduler, str]] = None,
        keep_checkpoints_num: Optional[int] = None,
        checkpoint_score_attr: Optional[str] = None,
        checkpoint_freq: int = 0,
        checkpoint_at_end: bool = False,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        log_to_file: Union[bool, str, Sequence] = False,
        trial_name_creator: Optional[Callable[[Trial], str]] = None,
        trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
        sync_config: Optional[SyncConfig] = None,
        export_formats: Optional[Sequence] = None,
        max_failures: int = 0,
        fail_fast: Union[bool, str] = False,
        restore: Optional[str] = None,
        server_port: Optional[int] = None,
        resume: Union[bool, str] = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        callbacks: Optional[Sequence[Callback]] = None,
        max_concurrent_trials: Optional[int] = None,
        # Deprecated args
        queue_trials: Optional[bool] = None,
        loggers: Optional[Sequence[Type[Logger]]] = None,
        _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory can be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm|str): Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler (TrialScheduler|str): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig. A default ``SyncConfig()`` is used if not set.
        export_formats (list): List of formats to be exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`). Requires
            ``max_failures`` to be 0.
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. REMOTE restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            PROMPT provides CLI feedback.
            False forces a new experiment.
            ERRORED_ONLY resets and reruns ERRORED trials upon resume -
            previous trial artifacts will be left untouched.  If resume is
            set but checkpoint does not exist, ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
            Defaults to a ``RayTrialExecutor``.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials (int): Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`. If the ``search_alg`` is already
            a :class:`ConcurrencyLimiter`, a ValueError is raised when its
            ``max_concurrent`` differs from this value; otherwise a warning
            is logged and this argument is ignored. Forced to 1 when Ray
            runs in local mode. Defaults to None.
        queue_trials (bool): Deprecated. Passing any value other than None
            raises a DeprecationWarning.
        loggers (list): Deprecated. A warning is emitted if set; the value
            is still forwarded to the default callback creation. Pass
            `LoggerCallback` instances via ``callbacks`` instead.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        DeprecationWarning: ``queue_trials`` was passed.
        ValueError: ``_remote`` is True together with a custom trial
            executor; ``mode`` is not one of "min"/"max"; ``fail_fast`` is
            set while ``max_failures`` is not 0; ``max_concurrent_trials``
            is below 1 or conflicts with an existing ConcurrencyLimiter;
            ``config`` has unresolved values but the search algorithm did
            not accept the search properties; or the scheduler or progress
            reporter did not accept ``metric``/``mode``.
        TuneError: Any trials failed and `raise_on_failed_trial` is True
            (not raised if the run was interrupted via SIGINT).
    """
    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is run to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    # `_remote` is passed explicitly (as False) to the remote call below.
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        # Default to remote execution when connected via Ray client.
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the server node.
        remote_run = force_on_current_node(remote_run)

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # callables, which will then be executed on the driver side.
        if isinstance(progress_reporter, JupyterNotebookReporter):
            execute_queue = Queue(actor_options={
                "num_cpus": 0,
                **force_on_current_node(None)
            })
            progress_reporter.set_output_queue(execute_queue)

            def get_next_queue_item():
                try:
                    return execute_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_execute_queue():
            # Drain the queue, running every callable item that was queued.
            execute_item = get_next_queue_item()
            while execute_item:
                if isinstance(execute_item, Callable):
                    execute_item()

                execute_item = get_next_queue_item()

        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready, yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_execute_queue()

        # Handle queue one last time
        _handle_execute_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if loggers:
        # Raise DeprecationWarning in 1.9, remove in 1.10/1.11
        warnings.warn(
            "The `loggers` argument is deprecated. Please pass the respective "
            "`LoggerCallback` classes to the `callbacks` argument instead. "
            "See https://docs.ray.io/en/latest/tune/api_docs/logging.html")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        # -1 means "(virtually) infinite" samples.
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler
        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead.")
        result_buffer_length = 1

    if isinstance(scheduler,
                  (PopulationBasedTraining,
                   PopulationBasedTrainingReplay)) and not reuse_actors:
        warnings.warn(
            "Consider boosting PBT performance by enabling `reuse_actors` as "
            "well as implementing `reset_config` for Trainable.")

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    # Wrap every entry that is not already an Experiment. Note that a list
    # passed as `run_or_experiment` is updated in place.
    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        # NOTE(review): this `else` belongs to the `for` loop, so the message
        # is logged whenever the loop finishes - confirm whether it was meant
        # to pair with the `if` above instead.
        logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        # importing at top level causes a recursive dependency
        from ray.tune.suggest import create_searcher
        search_alg = create_searcher(search_alg)

    # if local_mode=True is set during ray.init().
    is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(
            max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials:
        if isinstance(search_alg, ConcurrencyLimiter):
            if search_alg.max_concurrent != max_concurrent_trials:
                raise ValueError(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter` with `max_concurrent="
                    f"{search_alg.max_concurrent}. FIX THIS by setting "
                    "`max_concurrent_trials=None`.")
            else:
                logger.warning(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter`. `max_concurrent_trials` "
                    "will be ignored.")
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater or equal than 1, "
                    f"got {max_concurrent_trials}.")
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials)
            elif not is_local_mode:
                # In local mode the limit was set implicitly above, so the
                # warning is only shown for user-provided limits.
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance`. `max_concurrent_trials` "
                    "will be ignored.")

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not set_search_properties_backwards_compatible(
            search_alg.set_search_properties, metric, mode, config,
            **experiments[0].public_spec):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    # Checkpoint dirs and stopper are taken from the first experiment only.
    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir))

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment but "
                    "updating trial resources.")
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    progress_reporter = progress_reporter or detect_reporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # Calls setup on callbacks
    runner.setup_experiments(
        experiments=experiments, total_num_samples=search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    # Mutable flag shared with the signal handler closure below.
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    # The custom handler can be turned off via the environment variable.
    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    progress_reporter.set_start_time(tune_start)
    # Main tuning loop: step until finished or interrupted by SIGINT.
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    # Final checkpoint is best-effort: a failure is logged, not raised.
    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup()

    # Every trial that did not reach TERMINATED counts as incomplete.
    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        # Do not raise after an interrupt; incomplete trials are expected.
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode,
        sync_config=sync_config)
def run(
        run_or_experiment,
        name=None,
        metric=None,
        mode=None,
        stop=None,
        time_budget_s=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        search_alg=None,
        scheduler=None,
        keep_checkpoints_num=None,
        checkpoint_score_attr=None,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        verbose=2,
        progress_reporter=None,
        loggers=None,
        log_to_file=False,
        trial_name_creator=None,
        trial_dirname_creator=None,
        sync_config=None,
        export_formats=None,
        max_failures=0,
        fail_fast=False,
        restore=None,
        server_port=None,
        resume=False,
        reuse_actors=False,
        trial_executor=None,
        raise_on_failed_trial=True,
        # Deprecated args
        ray_auto_init=None,
        run_errored_only=None,
        queue_trials=None,
        global_checkpoint_period=None,
        with_server=None,
        upload_dir=None,
        sync_to_cloud=None,
        sync_to_driver=None,
        sync_on_checkpoint=None,
):
    """Executes training.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``. A list of any of the above
            is also accepted; the list itself is not modified.
        name (str): Name of experiment.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher): Search algorithm for optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local_checkpoint_dir, determined
            by `name` and `local_dir`. REMOTE restores the checkpoint
            from remote_checkpoint_dir. PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not
            exist, ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that
            start and stop actors often (e.g., PBT in time-multiplexing mode).
            This requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        ray_auto_init, queue_trials, global_checkpoint_period, with_server,
            upload_dir, sync_to_cloud, sync_to_driver, sync_on_checkpoint:
            Deprecated. Passing a truthy value raises ``ValueError`` with a
            pointer to the replacement setting.
        run_errored_only: Deprecated. Accepted for backwards compatibility
            but not read by this function.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
        ValueError: A deprecated argument was passed, `mode` is invalid,
            `fail_fast` is combined with a non-zero `max_failures`, or
            `metric`/`mode`/`config` conflict with an already configured
            search algorithm or scheduler.
    """
    # Deprecated arguments fail loudly and name their replacement.
    if global_checkpoint_period:
        raise ValueError("global_checkpoint_period is deprecated. Set env var "
                         "'TUNE_GLOBAL_CHECKPOINT_S' instead.")
    if queue_trials:
        raise ValueError(
            "queue_trials is deprecated. "
            "Set env var 'TUNE_DISABLE_QUEUE_TRIALS=1' instead to "
            "disable queuing behavior.")
    if ray_auto_init:
        raise ValueError("ray_auto_init is deprecated. "
                         "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
                         "call 'ray.init' before calling 'tune.run'.")
    if with_server:
        raise ValueError(
            "with_server is deprecated. It is now enabled by default "
            "if 'server_port' is not None.")
    if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir:
        raise ValueError(
            "sync_on_checkpoint / sync_to_cloud / sync_to_driver / "
            "upload_dir must now be set via `tune.run("
            "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for "
            "more details.")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors)

    if isinstance(run_or_experiment, list):
        # Work on a shallow copy: entries are replaced below and the
        # caller's list must not be modified as a side effect.
        experiments = list(run_or_experiment)
    else:
        experiments = [run_or_experiment]

    # Wrap everything that is not yet an Experiment using the arguments
    # passed to this call. Ready-made Experiments keep their own settings.
    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=sync_config.upload_dir,
                sync_to_driver=sync_config.sync_to_driver,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                loggers=loggers,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_config.sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
        else:
            logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_config.sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    # Plain Searchers are wrapped so that they can generate trials.
    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    if config and not search_alg.set_search_properties(metric, mode, config):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    scheduler = scheduler or FIFOScheduler()
    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Checkpoint locations and the stopper come from the first experiment.
    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_config.sync_to_cloud,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        verbose=bool(verbose > 1),
        fail_fast=fail_fast,
        trial_executor=trial_executor)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(overwrite=verbose < 2)
        else:
            progress_reporter = CLIReporter()

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    while not runner.is_finished():
        runner.step()
        if verbose:
            _report_progress(runner, progress_reporter)

    # A failing final checkpoint is logged but does not abort the run.
    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if verbose:
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup_trials()

    incomplete_trials = [
        trial for trial in runner.get_trials()
        if trial.status != Trial.TERMINATED
    ]

    if incomplete_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode)
def run(
        run_or_experiment: Union[str, Callable, Type, Experiment],
        name: Optional[str] = None,
        metric: Optional[str] = None,
        mode: Optional[str] = None,
        stop: Union[None, Mapping, Stopper, Callable[[str, Mapping],
                                                     bool]] = None,
        time_budget_s: Union[None, int, float, datetime.timedelta] = None,
        config: Optional[Dict[str, Any]] = None,
        resources_per_trial: Union[None, Mapping[str, Union[
            float, int, Mapping]], PlacementGroupFactory] = None,
        num_samples: int = 1,
        local_dir: Optional[str] = None,
        search_alg: Optional[Union[Searcher, SearchAlgorithm]] = None,
        scheduler: Optional[TrialScheduler] = None,
        keep_checkpoints_num: Optional[int] = None,
        checkpoint_score_attr: Optional[str] = None,
        checkpoint_freq: int = 0,
        checkpoint_at_end: bool = False,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        log_to_file: Union[bool, str, Sequence] = False,
        trial_name_creator: Optional[Callable[[Trial], str]] = None,
        trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
        sync_config: Optional[SyncConfig] = None,
        export_formats: Optional[Sequence] = None,
        max_failures: int = 0,
        fail_fast: Union[bool, str] = False,
        restore: Optional[str] = None,
        server_port: Optional[int] = None,
        resume: Union[bool, str] = False,
        queue_trials: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        callbacks: Optional[Sequence[Callback]] = None,
        # Deprecated args
        loggers: Optional[Sequence[Type[Logger]]] = None,
        ray_auto_init: Optional[Any] = None,
        run_errored_only: Optional[Any] = None,
        global_checkpoint_period: Optional[Any] = None,
        with_server: Optional[Any] = None,
        upload_dir: Optional[Any] = None,
        sync_to_cloud: Optional[Any] = None,
        sync_to_driver: Optional[Any] = None,
        sync_on_checkpoint: Optional[Any] = None,
        _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.
    The SIGINT handler that was active before the call is put back in place
    when this function returns or raises.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``. A list of any of the above
            is also accepted; the list itself is not modified.
        name (str): Name of experiment.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm): Search algorithm for
            optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local_checkpoint_dir, determined
            by `name` and `local_dir`. REMOTE restores the checkpoint
            from remote_checkpoint_dir. PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not
            exist, ValueError will be thrown.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that
            start and stop actors often (e.g., PBT in time-multiplexing mode).
            This requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        loggers (list): Deprecated. Forwarded to the default callback
            creation together with ``callbacks``.
        ray_auto_init, global_checkpoint_period, with_server, upload_dir,
            sync_to_cloud, sync_to_driver, sync_on_checkpoint: Deprecated.
            Passing a truthy value raises ``ValueError`` with a pointer to
            the replacement setting.
        run_errored_only: Deprecated. Accepted for backwards compatibility
            and only forwarded to the remote invocation.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
        ValueError: A deprecated argument was passed, `mode` is invalid,
            `_remote` is combined with a custom trial executor, `fail_fast`
            is combined with a non-zero `max_failures`, or `metric`/`mode`/
            `config` conflict with an already configured search algorithm,
            scheduler or progress reporter.
    """
    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        # Re-invoke this function inside a Ray task. Arguments are passed
        # positionally and must stay in signature order; `_remote=False`
        # prevents the task from recursing.
        return ray.get(
            ray.remote(num_cpus=0)(run).remote(
                run_or_experiment,
                name,
                metric,
                mode,
                stop,
                time_budget_s,
                config,
                resources_per_trial,
                num_samples,
                local_dir,
                search_alg,
                scheduler,
                keep_checkpoints_num,
                checkpoint_score_attr,
                checkpoint_freq,
                checkpoint_at_end,
                verbose,
                progress_reporter,
                log_to_file,
                trial_name_creator,
                trial_dirname_creator,
                sync_config,
                export_formats,
                max_failures,
                fail_fast,
                restore,
                server_port,
                resume,
                queue_trials,
                reuse_actors,
                trial_executor,
                raise_on_failed_trial,
                callbacks,
                # Deprecated args
                loggers,
                ray_auto_init,
                run_errored_only,
                global_checkpoint_period,
                with_server,
                upload_dir,
                sync_to_cloud,
                sync_to_driver,
                sync_on_checkpoint,
                _remote=False))

    all_start = time.time()

    # Deprecated arguments fail loudly and name their replacement.
    if global_checkpoint_period:
        raise ValueError("global_checkpoint_period is deprecated. Set env var "
                         "'TUNE_GLOBAL_CHECKPOINT_S' instead.")
    if ray_auto_init:
        raise ValueError("ray_auto_init is deprecated. "
                         "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
                         "call 'ray.init' before calling 'tune.run'.")
    if with_server:
        raise ValueError(
            "with_server is deprecated. It is now enabled by default "
            "if 'server_port' is not None.")
    if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir:
        raise ValueError(
            "sync_on_checkpoint / sync_to_cloud / sync_to_driver / "
            "upload_dir must now be set via `tune.run("
            "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for "
            "more details.")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    # -1 requests (virtually) unlimited samples.
    if num_samples == -1:
        num_samples = sys.maxsize

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, queue_trials=queue_trials)

    if isinstance(run_or_experiment, list):
        # Work on a shallow copy: entries are replaced below and the
        # caller's list must not be modified as a side effect.
        experiments = list(run_or_experiment)
    else:
        experiments = [run_or_experiment]

    # Wrap everything that is not yet an Experiment using the arguments
    # passed to this call. Ready-made Experiments keep their own settings.
    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=sync_config.upload_dir,
                sync_to_driver=sync_config.sync_to_driver,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_config.sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
        else:
            logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_config.sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    # Plain Searchers are wrapped so that they can generate trials.
    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    if config and not search_alg.set_search_properties(metric, mode, config):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    scheduler = scheduler or FIFOScheduler()
    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    # Checkpoint locations and the stopper come from the first experiment.
    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_config.sync_to_cloud,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(
                overwrite=not has_verbosity(Verbosity.V2_TRIAL_NORM))
        else:
            progress_reporter = CLIReporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")

    progress_reporter.set_total_samples(search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    handler_installed = False
    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)
        handler_installed = True

    try:
        tune_start = time.time()
        while not runner.is_finished() and not state[signal.SIGINT]:
            runner.step()
            if has_verbosity(Verbosity.V1_EXPERIMENT):
                _report_progress(runner, progress_reporter)
        tune_taken = time.time() - tune_start

        # A failing final checkpoint is logged but does not abort the run.
        try:
            runner.checkpoint(force=True)
        except Exception as e:
            logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter, done=True)

        wait_for_sync()
        runner.cleanup_trials()

        incomplete_trials = [
            trial for trial in runner.get_trials()
            if trial.status != Trial.TERMINATED
        ]

        if incomplete_trials:
            # An interrupted run is expected to leave unfinished trials, so
            # it only logs them instead of raising.
            if raise_on_failed_trial and not state[signal.SIGINT]:
                raise TuneError("Trials did not complete", incomplete_trials)
            else:
                logger.error("Trials did not complete: %s", incomplete_trials)

        all_taken = time.time() - all_start
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            logger.info(f"Total run time: {all_taken:.2f} seconds "
                        f"({tune_taken:.2f} seconds for the tuning loop).")

        if state[signal.SIGINT]:
            logger.warning(
                "Experiment has been interrupted, but the most recent state "
                "was saved. You can continue running this experiment by "
                "passing `resume=True` to `tune.run()`")

        trials = runner.get_trials()
        return ExperimentAnalysis(
            runner.checkpoint_file,
            trials=trials,
            default_metric=metric,
            default_mode=mode)
    finally:
        # Without this, a run that was never interrupted would leave
        # `sigint_handler` installed and the next Ctrl+C after `tune.run()`
        # returned would only be logged. `getsignal` may have returned None
        # (handler not installed from Python), which cannot be re-installed.
        if (handler_installed and original_handler is not None
                and signal.getsignal(signal.SIGINT) is sigint_handler):
            signal.signal(signal.SIGINT, original_handler)
def __init__(
    self,
    name,
    run,
    stop=None,
    time_budget_s=None,
    config=None,
    resources_per_trial=None,
    num_samples=1,
    local_dir=None,
    _experiment_checkpoint_dir: Optional[str] = None,
    sync_config=None,
    trial_name_creator=None,
    trial_dirname_creator=None,
    log_to_file=False,
    checkpoint_freq=0,
    checkpoint_at_end=False,
    keep_checkpoints_num=None,
    checkpoint_score_attr=None,
    export_formats=None,
    max_failures=0,
    restore=None,
):
    """Validate the experiment arguments and assemble ``self.spec``.

    Args:
        name: Experiment name. When falsy, the identifier returned by
            ``Experiment.register_if_needed(run)`` is used instead.
        run: The trainable; it is passed to
            ``Experiment.register_if_needed``.
        stop: Stopping condition. Accepted forms are a dict (stored
            under ``spec["stop"]``), a callable accepted by
            ``FunctionStopper.is_valid_function``, a ``Stopper``
            instance, or a list of ``Stopper`` instances.
        time_budget_s: When truthy, a ``TimeoutStopper`` is added
            (combined with any stopper derived from ``stop``).
        config: Stored under ``spec["config"]``; defaults to ``{}``.
        resources_per_trial: Stored unchanged in the spec.
        num_samples: Stored unchanged in the spec.
        local_dir: Resolved through
            ``_get_local_dir_with_expand_user`` before use.
        _experiment_checkpoint_dir: Internal use only. When set, it must
            be a sub-path of ``local_dir``; ``dir_name`` is derived from
            it.
        sync_config: Defaults to ``SyncConfig()`` when falsy.
        trial_name_creator: Stored unchanged in the spec.
        trial_dirname_creator: Stored unchanged in the spec.
        log_to_file: Validated by ``_validate_log_to_file``; the
            resulting ``(stdout_file, stderr_file)`` pair is stored.
        checkpoint_freq: Stored in the spec; must be falsy for a
            checkpointable function trainable.
        checkpoint_at_end: Stored in the spec; must be falsy for a
            checkpointable function trainable.
        keep_checkpoints_num: Stored unchanged in the spec.
        checkpoint_score_attr: Stored unchanged in the spec.
        export_formats: Stored in the spec; defaults to ``[]``.
        max_failures: Stored unchanged in the spec.
        restore: When truthy, stored as an absolute, user-expanded path.

    Raises:
        ValueError: If ``checkpoint_at_end``/``checkpoint_freq`` is used
            with a checkpointable function, or ``stop`` has an
            unsupported type or content.
        TuneError: If registering the trainable fails with a grpc
            ``RESOURCE_EXHAUSTED`` error.
        AssertionError: If ``_experiment_checkpoint_dir`` is not located
            under ``local_dir`` or the derived ``dir_name`` is empty.
    """
    local_dir = _get_local_dir_with_expand_user(local_dir)
    # `_experiment_checkpoint_dir` is for internal use only for better
    # support of Tuner API.
    # If set, it should be a subpath under `local_dir`. Also deduce `dir_name`.
    self._experiment_checkpoint_dir = _experiment_checkpoint_dir
    if _experiment_checkpoint_dir:
        experiment_checkpoint_dir_path = Path(_experiment_checkpoint_dir)
        local_dir_path = Path(local_dir)
        assert local_dir_path in experiment_checkpoint_dir_path.parents
        # `dir_name` is set by `_experiment_checkpoint_dir` indirectly.
        self.dir_name = os.path.relpath(_experiment_checkpoint_dir, local_dir)

    config = config or {}
    sync_config = sync_config or SyncConfig()

    if (
        callable(run)
        and not inspect.isclass(run)
        and detect_checkpoint_function(run)
    ):
        if checkpoint_at_end:
            raise ValueError(
                "'checkpoint_at_end' cannot be used with a "
                "checkpointable function. You can specify "
                "and register checkpoints within "
                "your trainable function."
            )
        if checkpoint_freq:
            raise ValueError(
                "'checkpoint_freq' cannot be used with a "
                "checkpointable function. You can specify checkpoints "
                "within your trainable function."
            )

    try:
        self._run_identifier = Experiment.register_if_needed(run)
    except grpc.RpcError as e:
        if e.code() != grpc.StatusCode.RESOURCE_EXHAUSTED:
            # Unrelated RPC failure: re-raise it untouched.
            raise
        # Chain explicitly so the RPC error is recorded as the cause.
        raise TuneError(
            f"The Trainable/training function is too large for grpc resource "
            f"limit. Check that its definition is not implicitly capturing a "
            f"large array or other object in scope. "
            f"Tip: use tune.with_parameters() to put large objects "
            f"in the Ray object store. \n"
            f"Original exception: {traceback.format_exc()}"
        ) from e

    self.name = name or self._run_identifier

    if not _experiment_checkpoint_dir:
        self.dir_name = _get_dir_name(run, name, self.name)

    assert self.dir_name

    if sync_config.upload_dir:
        self.remote_checkpoint_dir = os.path.join(
            sync_config.upload_dir, self.dir_name
        )
    else:
        self.remote_checkpoint_dir = None

    # Translate `stop` into either a Stopper object or a criteria dict.
    self._stopper = None
    stopping_criteria = {}
    if not stop:
        pass
    elif isinstance(stop, list):
        bad_stoppers = [s for s in stop if not isinstance(s, Stopper)]
        if bad_stoppers:
            stopper_types = [type(s) for s in stop]
            raise ValueError(
                "If you pass a list as the `stop` argument to "
                "`tune.run()`, each element must be an instance of "
                f"`tune.stopper.Stopper`. Got {stopper_types}."
            )
        self._stopper = CombinedStopper(*stop)
    elif isinstance(stop, dict):
        stopping_criteria = stop
    elif callable(stop):
        if FunctionStopper.is_valid_function(stop):
            self._stopper = FunctionStopper(stop)
        elif isinstance(stop, Stopper):
            self._stopper = stop
        else:
            raise ValueError(
                "Provided stop object must be either a dict, "
                "a function, or a subclass of "
                f"`ray.tune.Stopper`. Got {type(stop)}."
            )
    else:
        raise ValueError(
            f"Invalid stop criteria: {stop}. Must be a "
            f"callable or dict. Got {type(stop)}."
        )

    # A time budget is enforced through an additional TimeoutStopper.
    if time_budget_s:
        if self._stopper:
            self._stopper = CombinedStopper(
                self._stopper, TimeoutStopper(time_budget_s)
            )
        else:
            self._stopper = TimeoutStopper(time_budget_s)

    stdout_file, stderr_file = _validate_log_to_file(log_to_file)

    spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "time_budget_s": time_budget_s,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": local_dir,
        "sync_config": sync_config,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "log_to_file": (stdout_file, stderr_file),
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": os.path.abspath(os.path.expanduser(restore))
        if restore
        else None,
    }
    self.spec = spec
def create_trial_from_spec(spec, output_path, parser, **trial_kwargs):
    """Creates a Trial object from parsing the spec.

    Args:
        spec (dict): A resolved experiment specification. The entries
            should correspond to the command line flags in
            ray.tune.config_parser.
        output_path (str): A specific output path within the local_dir.
            Typically the name of the experiment.
        parser (ArgumentParser): An argument parser object from
            make_parser.
        trial_kwargs: Extra keyword arguments used in instantiating the
            Trial.

    Returns:
        A trial object with corresponding parameters to the specification.

    Raises:
        TuneError: If ``parser`` exits while parsing the spec, or if
            ``resources_per_trial`` cannot be converted by
            ``json_to_resources``.
    """
    # Work on a copy so popping `resources_per_trial` leaves the caller's
    # spec untouched.
    spec = spec.copy()
    resources = spec.pop("resources_per_trial", None)

    try:
        args, _ = parser.parse_known_args(to_argv(spec))
    except SystemExit as exc:
        raise TuneError("Error parsing args, see above message", spec) from exc

    if resources:
        if isinstance(resources, PlacementGroupFactory):
            trial_kwargs["placement_group_factory"] = resources
        else:
            # This will be converted to a placement group factory in the
            # Trial object constructor
            try:
                trial_kwargs["resources"] = json_to_resources(resources)
            except (TuneError, ValueError) as exc:
                raise TuneError("Error parsing resources_per_trial",
                                resources) from exc

    remote_checkpoint_dir = spec.get("remote_checkpoint_dir")

    sync_config = spec.get("sync_config", SyncConfig())
    syncer = sync_config.syncer
    if syncer is None or isinstance(syncer, str):
        # None, or a string template: pass it through unchanged.
        sync_function_tpl = syncer
    else:
        # If a syncer was specified, but not a template, it is a function.
        # Functions cannot be used for trial checkpointing on remote nodes,
        # so we set the remote checkpoint dir to None to disable this.
        sync_function_tpl = None
        remote_checkpoint_dir = None

    return Trial(
        trainable_name=spec["run"],
        config=spec.get("config", {}),
        local_dir=os.path.join(spec["local_dir"], output_path),
        stopping_criterion=spec.get("stop", {}),
        remote_checkpoint_dir=remote_checkpoint_dir,
        sync_function_tpl=sync_function_tpl,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_at_end=args.checkpoint_at_end,
        sync_on_checkpoint=sync_config.sync_on_checkpoint,
        keep_checkpoints_num=args.keep_checkpoints_num,
        checkpoint_score_attr=args.checkpoint_score_attr,
        export_formats=spec.get("export_formats", []),
        # Read from the spec directly: str(None) doesn't create None
        restore_path=spec.get("restore"),
        trial_name_creator=spec.get("trial_name_creator"),
        trial_dirname_creator=spec.get("trial_dirname_creator"),
        log_to_file=spec.get("log_to_file"),
        max_failures=args.max_failures,
        **trial_kwargs)
def __init__(
    self,
    name,
    run,
    stop=None,
    time_budget_s=None,
    config=None,
    resources_per_trial=None,
    num_samples=1,
    local_dir=None,
    sync_config=None,
    trial_name_creator=None,
    trial_dirname_creator=None,
    log_to_file=False,
    checkpoint_freq=0,
    checkpoint_at_end=False,
    keep_checkpoints_num=None,
    checkpoint_score_attr=None,
    export_formats=None,
    max_failures=0,
    restore=None,
):
    """Check the experiment arguments and build ``self.spec``.

    The trainable ``run`` is registered via
    ``Experiment.register_if_needed``; ``stop`` and ``time_budget_s`` are
    turned into a stopper object and/or a criteria dict; everything else
    is collected into the ``spec`` dictionary.

    Raises:
        ValueError: For checkpoint options that conflict with a
            checkpointable function, or for an unsupported ``stop`` value.
    """
    if not config:
        config = {}
    if not sync_config:
        sync_config = SyncConfig()

    # Only plain callables (not classes) can be checkpointable functions.
    is_plain_callable = callable(run) and not inspect.isclass(run)
    if is_plain_callable and detect_checkpoint_function(run):
        if checkpoint_at_end:
            raise ValueError("'checkpoint_at_end' cannot be used with a "
                             "checkpointable function. You can specify "
                             "and register checkpoints within "
                             "your trainable function.")
        if checkpoint_freq:
            raise ValueError(
                "'checkpoint_freq' cannot be used with a "
                "checkpointable function. You can specify checkpoints "
                "within your trainable function.")

    self._run_identifier = Experiment.register_if_needed(run)
    self.name = name or self._run_identifier

    # An explicit name, a string run identifier, or the env switch all
    # suppress the dated directory suffix.
    dated_subdir_disabled = (
        int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1)
    if dated_subdir_disabled or name or isinstance(run, str):
        self.dir_name = self.name
    else:
        self.dir_name = "{}_{}".format(self.name, date_str())

    self.remote_checkpoint_dir = (
        os.path.join(sync_config.upload_dir, self.dir_name)
        if sync_config.upload_dir else None)

    self._stopper = None
    stopping_criteria = {}
    if stop:
        if isinstance(stop, list):
            if any(not isinstance(s, Stopper) for s in stop):
                stopper_types = [type(s) for s in stop]
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    f"`tune.stopper.Stopper`. Got {stopper_types}.")
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif not callable(stop):
            raise ValueError(f"Invalid stop criteria: {stop}. Must be a "
                             f"callable or dict. Got {type(stop)}.")
        elif FunctionStopper.is_valid_function(stop):
            self._stopper = FunctionStopper(stop)
        elif isinstance(stop, Stopper):
            self._stopper = stop
        else:
            raise ValueError("Provided stop object must be either a dict, "
                             "a function, or a subclass of "
                             f"`ray.tune.Stopper`. Got {type(stop)}.")

    # A time budget adds a TimeoutStopper on top of any existing stopper.
    if time_budget_s:
        self._stopper = (
            CombinedStopper(self._stopper, TimeoutStopper(time_budget_s))
            if self._stopper else TimeoutStopper(time_budget_s))

    log_files = _validate_log_to_file(log_to_file)
    stdout_file, stderr_file = log_files

    self.spec = {
        "run": self._run_identifier,
        "stop": stopping_criteria,
        "config": config,
        "resources_per_trial": resources_per_trial,
        "num_samples": num_samples,
        "local_dir": os.path.abspath(
            os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
        "sync_config": sync_config,
        "remote_checkpoint_dir": self.remote_checkpoint_dir,
        "trial_name_creator": trial_name_creator,
        "trial_dirname_creator": trial_dirname_creator,
        "log_to_file": (stdout_file, stderr_file),
        "checkpoint_freq": checkpoint_freq,
        "checkpoint_at_end": checkpoint_at_end,
        "keep_checkpoints_num": keep_checkpoints_num,
        "checkpoint_score_attr": checkpoint_score_attr,
        "export_formats": export_formats or [],
        "max_failures": max_failures,
        "restore": (os.path.abspath(os.path.expanduser(restore))
                    if restore else None),
    }
def create_trial_from_spec(spec: dict, output_path: str,
                           parser: argparse.ArgumentParser, **trial_kwargs):
    """Build a ``Trial`` from a resolved experiment specification.

    Args:
        spec: A resolved experiment specification. Its entries should
            correspond to the command line flags in
            ray.tune.experiment.config_parser.
        output_path: A specific output path within the local_dir.
            Typically the name of the experiment.
        parser: An argument parser object from make_parser.
        trial_kwargs: Extra keyword arguments used in instantiating the
            Trial.

    Returns:
        A trial object with corresponding parameters to the specification.

    Raises:
        TuneError: If argument parsing exits or ``resources_per_trial``
            cannot be converted.
        ValueError: If ``sync_config.syncer`` is neither ``None``,
            ``"auto"`` nor a ``Syncer`` instance.
    """
    # Retained from the original; nothing in this function assigns to it.
    global _cached_pgf

    spec = spec.copy()
    resources = spec.pop("resources_per_trial", None)

    try:
        parsed = parser.parse_known_args(to_argv(spec))
    except SystemExit:
        raise TuneError("Error parsing args, see above message", spec)
    args, _ = parsed

    if resources:
        if not isinstance(resources, PlacementGroupFactory):
            # This will be converted to a placement group factory in the
            # Trial object constructor
            try:
                trial_kwargs["resources"] = json_to_resources(resources)
            except (TuneError, ValueError) as exc:
                raise TuneError("Error parsing resources_per_trial",
                                resources) from exc
        else:
            trial_kwargs["placement_group_factory"] = resources

    remote_checkpoint_dir = spec.get("remote_checkpoint_dir")

    sync_config = spec.get("sync_config", SyncConfig())
    syncer = sync_config.syncer
    syncer_is_supported = (syncer is None or syncer == "auto"
                           or isinstance(syncer, Syncer))
    if not syncer_is_supported:
        raise ValueError(
            f"Unknown syncer type passed in SyncConfig: {type(sync_config.syncer)}. "
            f"Note that custom sync functions and templates have been deprecated. "
            f"Instead you can implement you own `Syncer` class. "
            f"Please leave a comment on GitHub if you run into any issues with this: "
            f"https://github.com/ray-project/ray/issues")
    custom_syncer = syncer

    trainable_name = spec["run"]
    trial_local_dir = os.path.join(spec["local_dir"], output_path)

    return Trial(
        trainable_name=trainable_name,
        config=spec.get("config", {}),
        local_dir=trial_local_dir,
        stopping_criterion=spec.get("stop", {}),
        remote_checkpoint_dir=remote_checkpoint_dir,
        custom_syncer=custom_syncer,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_at_end=args.checkpoint_at_end,
        sync_on_checkpoint=sync_config.sync_on_checkpoint,
        keep_checkpoints_num=args.keep_checkpoints_num,
        checkpoint_score_attr=args.checkpoint_score_attr,
        export_formats=spec.get("export_formats", []),
        # Taken from the spec, not `args`: str(None) doesn't create None
        restore_path=spec.get("restore"),
        trial_name_creator=spec.get("trial_name_creator"),
        trial_dirname_creator=spec.get("trial_dirname_creator"),
        log_to_file=spec.get("log_to_file"),
        max_failures=args.max_failures,
        **trial_kwargs,
    )
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    resume=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    hyperopt_log_verbosity=3,
    features_eligible_for_shared_params=None,
    **kwargs,
) -> RayTuneResults:
    """Run a hyperparameter search with ``tune.run`` and collect the results.

    The method bundles the experiment arguments into a dict, builds the
    search algorithm (optionally concurrency-limited), registers a
    trainable wrapping ``self._run_experiment``, launches ``tune.run``
    and finally converts ``analysis.results_df`` into an ordered list of
    ``TrialResults``.

    Args:
        config: Passed to each trial through the experiment dict and
            hashed (``hash_dict``) to name the registered trainable.
        dataset: If a relative, non-remote string path, it is made
            absolute before use.
        validation_set: Also used after the search to evaluate trials
            that reported training stats but no eval stats.
        experiment_name: Passed to ``tune.run`` as ``name``.
        resume: Passed to ``tune.run``; ``None`` is replaced by ``"AUTO"``.
        output_directory: Passed to ``tune.run`` as ``local_dir``; when it
            has a remote protocol it becomes the sync ``upload_dir`` and
            ``local_dir`` is set to ``None``.
        gpus: Must be ``None``.
        gpu_memory_limit: When ``None`` and the per-trial GPU resource is
            strictly between 0 and 1, it is set from
            ``self.gpu_resources_per_trial``.
        callbacks: Each callback's ``prepare_ray_tune`` may wrap the trial
            function and extend the tune config; the list is also given
            to ``CallbackStopper``.
        backend: A string is resolved with ``initialize_backend``.
        random_seed: Passed to ``search_algorithm.check_for_random_seed``
            and into the experiment dict.
        hyperopt_log_verbosity: Passed to ``tune.run`` as ``verbose``.
        features_eligible_for_shared_params: Forwarded to
            ``self._run_experiment``.
        **kwargs: Not read by this method (the name is reused below as a
            loop variable).

        The remaining parameters are forwarded unchanged inside the
        experiment dict handed to each trial.

    Returns:
        A ``RayTuneResults`` holding the ordered trial results and the
        analysis object returned by ``tune.run``.

    Raises:
        ValueError: If ``gpus`` is given, or if ``max_concurrent_trials``
            is set while the search algorithm already is a
            ``ConcurrencyLimiter``.
        RuntimeError: If ``tune.run`` raises any exception.
    """
    if isinstance(dataset, str) and not has_remote_protocol(
            dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config.")

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    # Everything a single trial needs; handed to the trainable through
    # `tune.with_parameters` further down.
    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    # if random seed not set, use Ludwig seed
    self.search_algorithm.check_for_random_seed(random_seed)
    if self.search_algorithm.search_alg_dict is not None:
        if TYPE not in self.search_algorithm.search_alg_dict:
            candiate_search_algs = [
                search_alg for search_alg in SEARCH_ALG_IMPORT.keys()
            ]
            logger.warning(
                "WARNING: search_alg type parameter missing, using 'variant_generator' as default. "
                f"These are possible values for the type parameter: {candiate_search_algs}."
            )
            search_alg = None
        else:
            search_alg_type = self.search_algorithm.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(
                search_alg_type,
                metric=metric,
                mode=mode,
                **self.search_algorithm.search_alg_dict)
    else:
        search_alg = None

    # Apply the concurrency limit: directly on a BasicVariantGenerator,
    # otherwise by wrapping the searcher in a ConcurrencyLimiter.
    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(
                max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`.")
        else:
            search_alg = ConcurrencyLimiter(
                search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    # Trial entry point; callbacks below may replace it with a wrapped
    # version.
    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config,
            checkpoint_dir,
            local_hyperopt_dict,
            self.decode_ctx,
            features_eligible_for_shared_params,
            _is_ray_backend(backend),
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # for now, we do not do distributed training on cpu (until spread scheduling is implemented for Ray Train)
        # but we do want to enable it when GPUs are specified
        # NOTE: the conditional expression spans the whole argument: with
        # GPUs the bundles are [{}] plus one GPU bundle per GPU, otherwise
        # [{}] plus a single CPU bundle.
        resources_per_trial = PlacementGroupFactory(
            [{}] + ([{
                "CPU": 0,
                "GPU": 1
            }] * self._gpu_resources_per_trial_non_none) if self._gpu_resources_per_trial_non_none else [{}] + [{
                "CPU": self._cpu_resources_per_trial_non_none
            }])

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False,
                                           upload_dir=output_directory)
        if _ray_114:
            self.sync_client = get_node_to_storage_syncer(
                SyncConfig(upload_dir=output_directory))
        else:
            self.sync_client = get_cloud_sync_client(output_directory)
        # The remote location is now carried by `upload_dir`, so no
        # `local_dir` is passed to `tune.run`.
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import KubernetesSyncClient, NamespacedKubernetesSyncer
        self.sync_config = tune.SyncConfig(
            sync_to_driver=NamespacedKubernetesSyncer(
                self.kubernetes_namespace))
        self.sync_client = KubernetesSyncClient(self.kubernetes_namespace)

    run_experiment_trial_params = tune.with_parameters(
        run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    # Registered under a name derived from the config hash; `tune.run`
    # below refers to the trainable by that same name.
    register_trainable(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        run_experiment_trial_params)

    # Note that resume="AUTO" will attempt to resume the experiment if possible, and
    # otherwise will start a new experiment:
    # https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
    should_resume = "AUTO" if resume is None else resume

    try:
        analysis = tune.run(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            name=experiment_name,
            config={
                **self.search_space,
                **tune_config,
            },
            scheduler=self.scheduler,
            search_alg=search_alg,
            num_samples=self.num_samples,
            keep_checkpoints_num=1,
            max_failures=1,  # retry a trial failure once
            resources_per_trial=resources_per_trial,
            time_budget_s=self.time_budget_s,
            sync_config=self.sync_config,
            local_dir=output_directory,
            metric=metric,
            mode=mode,
            trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
            trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
            callbacks=tune_callbacks,
            stop=CallbackStopper(callbacks),
            verbose=hyperopt_log_verbosity,
            resume=should_resume,
            log_to_file=True,
        )
    except Exception as e:
        # Explicitly raise a RuntimeError if an error is encountered during a Ray trial.
        # NOTE: Cascading the exception with "raise _ from e" still results in hanging.
        raise RuntimeError(f"Encountered Ray Tune error: {e}")

    if "metric_score" in analysis.results_df.columns:
        # Best trial first: ascending order unless the goal is MAXIMIZE.
        ordered_trials = analysis.results_df.sort_values(
            "metric_score", ascending=self.goal != MAXIMIZE)

        # Catch nans in edge case where the trial doesn't complete
        # NOTE: the loop variable reuses the name `kwargs`, shadowing the
        # `**kwargs` parameter from here on.
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
        # For any such trials, run model evaluation for the best model in that trial & record
        # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            # NOTE(review): the stats are compared against the *string*
            # "{}" (presumably still JSON-encoded here; they go through
            # `load_json_values` below). Entries replaced by the dict `{}`
            # above do not equal that string - confirm this is intended.
            if trial["eval_stats"] == "{}" and trial[
                    "training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(
                        trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning(
                            "Skipping evaluation as no model checkpoints were available"
                        )
                else:
                    logger.warning(
                        "Skipping evaluation as no validation set was provided"
                    )

        ordered_trials = [
            TrialResults.from_dict(load_json_values(kwargs))
            for kwargs in temp_ordered_trials
        ]
    else:
        logger.warning(
            "No trials reported results; check if time budget lower than epoch latency"
        )
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials,
                          experiment_analysis=analysis)