def testHasResourcesForTrialWithCaching(self):
    pgm = PlacementGroupManager()
    pgf1 = PlacementGroupFactory([{"CPU": self.head_cpus}])
    pgf2 = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

    executor = RayTrialExecutor(reuse_actors=True)
    executor._pg_manager = pgm
    executor.set_max_pending_trials(1)

    def train(config):
        yield 1
        yield 2
        yield 3
        yield 4

    register_trainable("resettable", train)

    trial1 = Trial("resettable", placement_group_factory=pgf1)
    trial2 = Trial("resettable", placement_group_factory=pgf1)
    trial3 = Trial("resettable", placement_group_factory=pgf2)

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert executor.has_resources_for_trial(trial3)

    executor._stage_and_update_status([trial1, trial2, trial3])

    while not pgm.has_ready(trial1):
        time.sleep(1)
        executor._stage_and_update_status([trial1, trial2, trial3])

    # Fill staging
    executor._stage_and_update_status([trial1, trial2, trial3])

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)

    executor._start_trial(trial1)
    executor._stage_and_update_status([trial1, trial2, trial3])
    executor.pause_trial(trial1)  # Caches the PG and removes a PG from staging

    assert len(pgm._staging_futures) == 0

    # This will re-schedule a placement group
    pgm.reconcile_placement_groups([trial1, trial2])

    assert len(pgm._staging_futures) == 1
    assert not pgm.can_stage()

    # We should still have resources for this trial as it has a cached PG
    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)
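# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test above; assumptions flagged):
# a `PlacementGroupFactory` wraps a list of resource bundles that Tune turns
# into a Ray placement group request. The bundle layout below is an example
# only, and the import path may vary by Ray version - the test itself relies
# solely on the total CPU count per factory.
#
#   from ray.tune.utils.placement_groups import PlacementGroupFactory
#
#   # One bundle for the trainable actor, one for a hypothetical remote
#   # worker spawned by the trainable.
#   pgf = PlacementGroupFactory([{"CPU": 2}, {"GPU": 1}])
#
# Trials built from the same factory (like trial1/trial2 above) can share a
# cached placement group when `reuse_actors=True`, which is what the final
# assertions in the test exercise.
# ---------------------------------------------------------------------------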
class RayTrialExecutor(TrialExecutor):
    """An implementation of TrialExecutor based on Ray."""

    def __init__(
        self,
        reuse_actors: bool = False,
        result_buffer_length: Optional[int] = None,
        refresh_period: Optional[float] = None,
    ):
        super(RayTrialExecutor, self).__init__()

        # future --> (type, trial/pg)
        self._futures = {}

        force_trial_cleanup = int(
            os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
        self._get_next_event_wait = int(
            os.environ.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5"))
        if force_trial_cleanup:
            self._trial_cleanup = _TrialCleanup(force_trial_cleanup)
        else:
            self._trial_cleanup = None

        self._resource_updater = ResourceUpdater(refresh_period)

        self._has_cleaned_up_pgs = False
        self._reuse_actors = reuse_actors
        # The maxlen will be updated when `set_max_pending_trials()` is called
        self._cached_actor_pg = deque(maxlen=1)

        self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
        self._staged_trials = set()

        self._trial_just_finished = False
        self._trial_just_finished_before = False

        self.last_pg_recon = 0
        self.pg_recon_interval = float(
            os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

        self._buffer_length = result_buffer_length or int(
            os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1))
        self._buffer_min_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))

    def set_max_pending_trials(self, max_pending: int) -> None:
        if len(self._cached_actor_pg) > 0:
            logger.warning(
                "Cannot update maximum number of queued actors for reuse "
                "during a run.")
        else:
            self._cached_actor_pg = deque(maxlen=max_pending)
        self._pg_manager.set_max_staging(max_pending)

    def _stage_and_update_status(self, trials: Iterable[Trial]):
        """Check and update statuses of scheduled placement groups.

        Stages placement groups of all trials.
        """
        if not self._has_cleaned_up_pgs:
            # Clean up existing placement groups after triggering the
            # tuning run's step() method for the first time
            self._pg_manager.cleanup_existing_pg()
            self._has_cleaned_up_pgs = True

        for trial in trials:
            if trial.status not in (Trial.PENDING, Trial.PAUSED):
                continue
            if trial in self._staged_trials:
                continue
            if self._pg_manager.trial_in_use(trial):
                continue

            if not self._pg_manager.stage_trial_pg(trial):
                # Break if we reached the limit of pending placement groups.
                break

            self._staged_trials.add(trial)

        self._pg_manager.update_status()

    def get_staged_trial(self):
        """Get a trial whose placement group was successfully staged.

        Can also return None if no trial is available.

        Returns:
            Trial object or None.
        """
        # TODO(xwjiang): This method should consider `self._cached_actor_pg`.
        for trial in self._staged_trials:
            if self._pg_manager.has_ready(trial):
                return trial

        return None

    def _setup_remote_runner(self, trial):
        trial.init_logdir()
        # We checkpoint metadata here to try mitigating logdir duplication
        self._trials_to_cache.add(trial)
        logger_creator = partial(noop_logger_creator, logdir=trial.logdir)

        if len(self._cached_actor_pg) > 0:
            assert self._reuse_actors
            existing_runner, pg = self._cached_actor_pg.popleft()
            logger.debug(f"Trial {trial}: Reusing cached runner "
                         f"{existing_runner}")

            trial.set_runner(existing_runner)
            if pg:
                self._pg_manager.assign_cached_pg(pg, trial)

            if not self.reset_trial(trial, trial.config, trial.experiment_tag,
                                    logger_creator):
                raise AbortTrialExecution(
                    "Trainable runner reuse requires reset_config() to be "
                    "implemented and return True.")
            return existing_runner

        trainable_cls = trial.get_trainable_cls()
        if not trainable_cls:
            raise AbortTrialExecution(
                f"Invalid trainable: {trial.trainable_name}. If you passed "
                f"a string, make sure the trainable was registered before.")
        _actor_cls = _class_cache.get(trainable_cls)

        if not self._pg_manager.has_ready(trial):
            return None

        full_actor_class = self._pg_manager.get_full_actor_cls(
            trial, _actor_cls)
        # Clear the Trial's location (to be updated later on result)
        # since we don't know where the remote runner is placed.
        trial.set_location(Location())
        logger.debug("Trial %s: Setting up new remote runner.", trial)
        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        trial_config = copy.deepcopy(trial.config)
        trial_config[TRIAL_INFO] = TrialInfo(trial)
        stdout_file, stderr_file = trial.log_to_file
        trial_config[STDOUT_FILE] = stdout_file
        trial_config[STDERR_FILE] = stderr_file
        kwargs = {
            "config": trial_config,
            "logger_creator": logger_creator,
        }

        if trial.uses_cloud_checkpointing:
            # We keep these kwargs separate for backwards compatibility
            # with trainables that don't provide these keyword arguments
            kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir
            kwargs["sync_function_tpl"] = trial.sync_function_tpl

            # Throw a meaningful error if the trainable does not use the
            # new API
            sig = inspect.signature(trial.get_trainable_cls())
            try:
                sig.bind_partial(**kwargs)
            except Exception as e:
                raise RuntimeError(
                    "Your trainable class does not accept a "
                    "`remote_checkpoint_dir` or `sync_function_tpl` argument "
                    "in its constructor, but you've passed an "
                    "`upload_dir` to your SyncConfig. Without accepting "
                    "these parameters and passing them to the base trainable "
                    "constructor in the init call, cloud checkpointing is "
                    "effectively disabled. To resolve this issue, add the "
                    "parameters to your trainable class constructor or "
                    "disable cloud checkpointing by setting `upload_dir=None`."
                ) from e

        with self._change_working_directory(trial):
            return full_actor_class.remote(**kwargs)

    def _train(self, trial):
        """Start one iteration of training and save remote id."""

        if self._find_future(trial):
            logging.debug(
                "Trial {} already has a queued future. Skipping this "
                "`train` call. This may occur if a trial has "
                "been unpaused within a scheduler callback.".format(
                    str(trial)))
            return

        assert trial.status == Trial.RUNNING, trial.status
        buffer_time_s = max(
            self._buffer_min_time_s,
            min(self._buffer_max_time_s, len(self._futures) // 10),
        )
        with self._change_working_directory(trial):
            buffer_length = self._buffer_length
            if buffer_length > 1 and trial.checkpoint_at_end:
                # If a trial checkpoint can be triggered externally,
                # it is not safe to buffer results.
if log_once("trial_executor_buffer_checkpoint"): logger.warning("Disabling buffered training as you passed " "`checkpoint_at_end` to `tune.run()`.") buffer_length = 1 if buffer_length > 1: if trial.checkpoint_freq > 0: buffer_length = min(buffer_length, trial.checkpoint_freq) remote = trial.runner.train_buffered.remote( buffer_time_s, buffer_length) else: remote = trial.runner.train.remote() # Local Mode if isinstance(remote, dict): remote = _LocalWrapper(remote) self._futures[remote] = (ExecutorEventType.TRAINING_RESULT, trial) trial_item = self._find_future(trial) assert len(trial_item) < 2, trial_item def _start_trial(self, trial: Trial) -> bool: """Starts trial and restores last result if trial was paused. Args: trial: The trial to start. Returns: True if trial was started successfully, False otherwise. See `RayTrialExecutor.restore` for possible errors raised. """ self.set_status(trial, Trial.PENDING) runner = self._setup_remote_runner(trial) if not runner: return False trial.set_runner(runner) self.restore(trial) self.set_status(trial, Trial.RUNNING) if trial in self._staged_trials: self._staged_trials.remove(trial) if not trial.is_restoring: self._train(trial) return True def _stop_trial( self, trial: Trial, error: bool = False, exc: Optional[Union[TuneError, RayTaskError]] = None, ): """Stops this trial. Stops this trial, releasing all allocating resources. If stopping the trial fails, the run will be marked as terminated in error, but no exception will be thrown. Args: error: Whether to mark this trial as terminated in error. exc: Optional exception. """ self.set_status(trial, Trial.ERROR if error or exc else Trial.TERMINATED) self._trial_just_finished = True trial.set_location(Location()) try: trial.write_error_log(exc=exc) if hasattr(trial, "runner") and trial.runner: if (not error and self._reuse_actors and (len(self._cached_actor_pg) < (self._cached_actor_pg.maxlen or float("inf")))): logger.debug("Reusing actor for %s", trial.runner) # Move PG into cache (disassociate from trial) pg = self._pg_manager.cache_trial_pg(trial) if pg: # True if a placement group was replaced self._cached_actor_pg.append((trial.runner, pg)) should_destroy_actor = False else: # False if no placement group was replaced. This should # only be the case if there are no more trials with # this placement group factory to run logger.debug( "Could not cache of trial {trial} actor for " "reuse, as there are no pending trials " "requiring its resources.") should_destroy_actor = True else: should_destroy_actor = True if should_destroy_actor: logger.debug("Trial %s: Destroying actor.", trial) with self._change_working_directory(trial): future = trial.runner.stop.remote() pg = self._pg_manager.remove_from_in_use(trial) self._futures[future] = (ExecutorEventType.STOP_RESULT, pg) if self._trial_cleanup: # force trial cleanup within a deadline self._trial_cleanup.add(future) if trial in self._staged_trials: self._staged_trials.remove(trial) except Exception: logger.exception("Trial %s: Error stopping runner.", trial) self.set_status(trial, Trial.ERROR) finally: trial.set_runner(None) def start_trial(self, trial: Trial) -> bool: """Starts the trial. Will not return resources if trial repeatedly fails on start. Args: trial: Trial to be started. Returns: True if the remote runner has been started. False if trial was not started (e.g. because of lacking resources/pending PG). 
""" try: return self._start_trial(trial) except AbortTrialExecution as e: logger.exception("Trial %s: Error starting runner, aborting!", trial) time.sleep(2) self._stop_trial(trial, exc=e) return False except Exception as e: logger.exception("Trial %s: Unexpected error starting runner.", trial) time.sleep(2) if isinstance(e, TuneError): self._stop_trial(trial, exc=e) else: self._stop_trial(trial, exc=TuneStartTrialError( traceback.format_exc())) # Note that we don't return the resources, since they may # have been lost. TODO(ujvl): is this the right thing to do? return False def _find_future(self, trial): out = [rid for rid, t in self._futures.items() if t[1] is trial] assert ( len(out) <= 1), "Expecting one future for any given trial at any given time." return out def stop_trial( self, trial: Trial, error: bool = False, exc: Optional[Union[TuneError, RayTaskError]] = None, ) -> None: prior_status = trial.status self._stop_trial(trial, error=error or exc, exc=exc) if prior_status == Trial.RUNNING: logger.debug("Trial %s: Returning resources.", trial) out = self._find_future(trial) for result_id in out: self._futures.pop(result_id) def continue_training(self, trial: Trial) -> None: """Continues the training of this trial.""" self._train(trial) def reset_trial( self, trial: Trial, new_config: Dict, new_experiment_tag: str, logger_creator: Optional[Callable[[Dict], "ray.tune.Logger"]] = None, ) -> bool: """Tries to invoke `Trainable.reset()` to reset trial. Args: trial: Trial to be reset. new_config: New configuration for Trial trainable. new_experiment_tag: New experiment name for trial. logger_creator: Function that instantiates a logger on the actor process. Returns: True if `reset_config` is successful else False. """ trial.set_experiment_tag(new_experiment_tag) trial.set_config(new_config) trainable = trial.runner # Pass magic variables extra_config = copy.deepcopy(new_config) extra_config[TRIAL_INFO] = TrialInfo(trial) stdout_file, stderr_file = trial.log_to_file extra_config[STDOUT_FILE] = stdout_file extra_config[STDERR_FILE] = stderr_file with self._change_working_directory(trial): with warn_if_slow("reset"): try: reset_val = ray.get( trainable.reset.remote(extra_config, logger_creator), timeout=DEFAULT_GET_TIMEOUT, ) except GetTimeoutError: logger.exception("Trial %s: reset timed out.", trial) return False return reset_val def has_resources_for_trial(self, trial: Trial) -> bool: """Returns whether there are resources available for this trial. This will return True as long as we didn't reach the maximum number of pending trials. It will also return True if the trial placement group is already staged. Args: trial: Trial object which should be scheduled. 

        Returns:
            boolean
        """
        return (trial in self._staged_trials or self._pg_manager.can_stage()
                or self._pg_manager.has_ready(trial, update=True))

    def debug_string(self) -> str:
        """Returns a human readable message for printing to the console."""
        total_resources = self._pg_manager.occupied_resources()
        return self._resource_updater.debug_string(total_resources)

    def on_step_begin(self, trials: List[Trial]) -> None:
        """Before step() is called, update the available resources."""
        self._resource_updater.update_avail_resources()
        self._trial_just_finished_before = self._trial_just_finished
        self._trial_just_finished = False

    def on_step_end(self, trials: List[Trial]) -> None:
        self._do_force_trial_cleanup()
        if time.time() > self.last_pg_recon + self.pg_recon_interval:
            # Only do this every now and then - usually the placement groups
            # should not get out of sync, and calling this often is inefficient
            self._pg_manager.reconcile_placement_groups(trials)
            self.last_pg_recon = time.time()

        self._pg_manager.cleanup()

    def _do_force_trial_cleanup(self) -> None:
        if self._trial_cleanup:
            while True:
                next_future_to_clean = self._trial_cleanup.get_next()
                if not next_future_to_clean:
                    break
                if next_future_to_clean in self._futures.keys():
                    _, pg = self._futures.pop(next_future_to_clean)
                    post_stop_cleanup(next_future_to_clean, pg)
                else:
                    # This just means the future was already cleaned up
                    # before the deadline was reached.
                    pass

    def force_reconcilation_on_next_step_end(self) -> None:
        self.last_pg_recon = -float("inf")

    def save(
        self,
        trial: Trial,
        storage: str = _TuneCheckpoint.PERSISTENT,
        result: Optional[Dict] = None,
    ) -> _TuneCheckpoint:
        """Saves the trial's state to a checkpoint asynchronously.

        Args:
            trial: The trial to be saved.
            storage: Where to store the checkpoint. Defaults to PERSISTENT.
            result: The state of this trial as a dictionary to be saved.
                If result is None, the trial's last result will be used.

        Returns:
            Checkpoint object, or None if an Exception occurs.
        """
        logger.debug(f"saving trial {trial}")
        result = result or trial.last_result
        with self._change_working_directory(trial):
            if storage == _TuneCheckpoint.MEMORY:
                value = trial.runner.save_to_object.remote()
                checkpoint = _TuneCheckpoint(storage, value, result)
                trial.on_checkpoint(checkpoint)
            else:
                value = trial.runner.save.remote()
                checkpoint = _TuneCheckpoint(storage, value, result)
                trial.saving_to = checkpoint
                self._futures[value] = (ExecutorEventType.SAVING_RESULT, trial)
        return checkpoint

    def restore(self, trial: Trial) -> None:
        """Restores training state from a given model checkpoint.

        Args:
            trial: The trial to be restored.

        Raises:
            RuntimeError: This error is raised if no runner is found.
            AbortTrialExecution: This error is raised if the trial is
                ineligible for restoration, given the Tune input arguments.
        """
        checkpoint = trial.checkpoint

        if checkpoint.value is None:
            return
        if trial.runner is None:
            raise RuntimeError(
                "Trial {}: Unable to restore - no runner found.".format(trial))

        value = checkpoint.value
        node_ip = checkpoint.node_ip
        if checkpoint.storage == _TuneCheckpoint.MEMORY:
            logger.debug("Trial %s: Attempting restore from object", trial)
            # Note that we don't store the remote since in-memory checkpoints
            # don't guarantee fault tolerance and don't need to be waited on.
            with self._change_working_directory(trial):
                trial.runner.restore_from_object.remote(value)
        else:
            logger.debug("Trial %s: Attempting restore from %s", trial, value)
            if trial.uses_cloud_checkpointing or not trial.sync_on_checkpoint:
                # If using cloud checkpointing, trial will get cp from cloud.
                # If not syncing to driver, assume it has access to the cp
                # on the local fs.
                with self._change_working_directory(trial):
                    remote = trial.runner.restore.remote(value, node_ip)
            elif trial.sync_on_checkpoint:
                # This provides FT backwards compatibility in the
                # case where no cloud checkpoints are provided.
                logger.debug("Trial %s: Reading checkpoint into memory", trial)
                obj = TrainableUtil.checkpoint_to_object(value)
                with self._change_working_directory(trial):
                    remote = trial.runner.restore_from_object.remote(obj)
            else:
                raise AbortTrialExecution(
                    "Pass in `sync_on_checkpoint=True` for driver-based trial "
                    "restoration. Pass in an `upload_dir` for remote "
                    "storage-based restoration")

            self._futures[remote] = (ExecutorEventType.RESTORING_RESULT, trial)
            trial.restoring_from = checkpoint

    def export_trial_if_needed(self, trial: Trial) -> Dict:
        """Exports model of this trial based on trial.export_formats.

        Return:
            A dict that maps ExportFormats to successfully exported models.
        """
        if trial.export_formats and len(trial.export_formats) > 0:
            with self._change_working_directory(trial):
                return ray.get(
                    trial.runner.export_model.remote(trial.export_formats),
                    timeout=DEFAULT_GET_TIMEOUT,
                )
        return {}

    def has_gpus(self) -> bool:
        return self._resource_updater.get_num_gpus() > 0

    def cleanup(self, trials: List[Trial]) -> None:
        while True:
            if self._trial_cleanup and self._trial_cleanup.is_empty():
                break
            elif not self._trial_cleanup and len(self._futures) == 0:
                break

            self._do_force_trial_cleanup()

            ready, _ = ray.wait(list(self._futures.keys()), timeout=0)
            if not ready:
                continue

            event_type, trial_or_pg = self._futures.pop(ready[0])
            if event_type == ExecutorEventType.STOP_RESULT:
                post_stop_cleanup(ready[0], trial_or_pg)

        self._pg_manager.reconcile_placement_groups(trials)
        self._pg_manager.cleanup(force=True)
        self._pg_manager.cleanup_existing_pg(block=True)

    @contextmanager
    def _change_working_directory(self, trial):
        """Context manager changing working directory to trial logdir.

        Used in local mode.

        For non-local mode it is no-op.
        """
        if ray.worker._mode() == ray.worker.LOCAL_MODE:
            old_dir = os.getcwd()
            try:
                os.chdir(trial.logdir)
                yield
            finally:
                os.chdir(old_dir)
        else:
            yield

    def get_next_executor_event(self, live_trials: Set[Trial],
                                next_trial_exists: bool) -> ExecutorEvent:
        """Get the next executor event to be processed in TrialRunner.

        In case there are multiple events available for handling, the next
        event is determined by the following priority:
        1. if `next_trial_exists` is True and there are cached resources to
           use, PG_READY is emitted.
        2. if `next_trial_exists` is True and there are no cached resources
           to use, wait on the pg future and randomized other futures. If
           multiple futures are ready, the pg future takes priority and is
           handled first.
        3. if `next_trial_exists` is False, wait on just randomized other
           futures.

        An example of #3 would be synchronous hyperband. Although there are
        pgs ready, the scheduler is holding back scheduling new trials since
        the whole band of trials is waiting for the slowest trial to finish.
        In this case, we prioritize handling training results to avoid a
        deadlock situation. This is a blocking wait with a timeout (specified
        via an env var).

        The reason for the timeout is that we still want to print status info
        periodically in TrialRunner for a better user experience.

        The handling of `ExecutorEvent.STOP_RESULT` is purely internal to
        RayTrialExecutor itself. All the other future results are handled by
        TrialRunner.

        In the future we may want to do most of the handling of
        `ExecutorEvent.RESTORE_RESULT` and `SAVING_RESULT` in
        RayTrialExecutor itself and only notify TrialRunner to invoke the
        corresponding callbacks. This view is more consistent with our goal
        of making TrialRunner responsible for external facing Trial state
        transitions, while RayTrialExecutor is responsible for internal
        facing transitions, namely `is_saving`, `is_restoring`, etc.

        Also you may notice that the boundary between RayTrialExecutor and
        PlacementGroupManager right now is really blurry. This will be
        improved once we move to an ActorPool abstraction.

        `next_trial_exists` means that there is a trial to run - prioritize
        returning PG_READY in this case.
        """
        # First update status of staged placement groups
        self._stage_and_update_status(live_trials)
        while True:
            ###################################################################
            # when next_trial_exists and there are cached resources
            ###################################################################
            # There could be existing PGs from either `self._cached_actor_pg`
            # or from `self._pg_manager._ready`. If so and if there is indeed
            # a next trial to run, we return a `PG_READY` future for the trial
            # runner. The next trial can then be scheduled on this PG.
            if next_trial_exists:
                if len(self._cached_actor_pg) > 0:
                    return ExecutorEvent(ExecutorEventType.PG_READY)
                # TODO(xwjiang): Expose proper API when we decide to do
                #  ActorPool abstraction.
                if any(len(r) > 0 for r in self._pg_manager._ready.values()):
                    return ExecutorEvent(ExecutorEventType.PG_READY)

            ###################################################################
            # Prepare for futures to wait
            ###################################################################
            futures_to_wait = list(self._futures.keys())
            random.shuffle(futures_to_wait)
            if next_trial_exists:
                # Only wait for pg explicitly if there is a next trial to run.
                # In that case, handling PG_READY trumps handling other
                # events, since we want to place the pending trial ASAP.
                futures_to_wait = (self._pg_manager.get_staging_future_list() +
                                   futures_to_wait)
            logger.debug(f"get_next_executor_event before wait with futures "
                         f"{futures_to_wait} and "
                         f"next_trial_exists={next_trial_exists}")
            ready_futures, _ = ray.wait(
                futures_to_wait,
                num_returns=1,
                timeout=self._get_next_event_wait)

            ###################################################################
            # Dealing with no future returned case.
            ###################################################################
            if len(ready_futures) == 0:
                if len(self._futures) == 0:
                    # No running trials and the wait timed out. The cluster
                    # may have insufficient resources, which would make the
                    # Tune run infeasible.
                    # TODO: Move InsufficientResourceManager's logic
                    #  to TrialExecutor. It is not Runner's responsibility!
                    return ExecutorEvent(
                        ExecutorEventType.NO_RUNNING_TRIAL_TIMEOUT)
                else:
                    # Training is simply taking a long time; yield control
                    # back to the main event loop to print progress info etc.
                    return ExecutorEvent(ExecutorEventType.YIELD)

            ###################################################################
            # If a future was returned.
            ###################################################################
            assert len(ready_futures) == 1
            ready_future = ready_futures[0]

            ###################################################################
            # If it is a PG_READY event.
            ###################################################################
            if ready_future not in self._futures.keys():
                self._pg_manager.handle_ready_future(ready_future)
                return ExecutorEvent(ExecutorEventType.PG_READY)

            ###################################################################
            # non PG_READY event
            ###################################################################
            result_type, trial_or_pg = self._futures.pop(ready_future)
            if result_type == ExecutorEventType.STOP_RESULT:
                pg = trial_or_pg
                post_stop_cleanup(ready_future, pg)
            else:
                trial = trial_or_pg
                assert isinstance(trial, Trial)
                try:
                    future_result = ray.get(ready_future)
                    # For local mode
                    if isinstance(future_result, _LocalWrapper):
                        future_result = future_result.unwrap()
                    if result_type in (
                            ExecutorEventType.TRAINING_RESULT,
                            ExecutorEventType.SAVING_RESULT,
                            ExecutorEventType.RESTORING_RESULT,
                    ):
                        logger.debug(
                            f"Returning [{result_type}] for trial {trial}")
                        return ExecutorEvent(
                            result_type,
                            trial,
                            result={
                                ExecutorEvent.KEY_FUTURE_RESULT: future_result
                            },
                        )
                    else:
                        raise TuneError(
                            f"Unexpected future type - [{result_type}]")
                except RayTaskError as e:
                    return ExecutorEvent(
                        ExecutorEventType.ERROR,
                        trial,
                        result={
                            ExecutorEvent.KEY_EXCEPTION: e.as_instanceof_cause()
                        },
                    )
                except Exception:
                    return ExecutorEvent(
                        ExecutorEventType.ERROR,
                        trial,
                        result={
                            ExecutorEvent.KEY_EXCEPTION:
                            TuneGetNextExecutorEventError(
                                traceback.format_exc())
                        },
                    )
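# ---------------------------------------------------------------------------
# Hedged sketch (not from the source): one way a caller such as TrialRunner
# could consume the events produced by `get_next_executor_event` above. The
# attribute names `event.type`, `event.trial` and `event.result` are
# assumptions; only the event types and the payload keys
# `ExecutorEvent.KEY_FUTURE_RESULT` / `ExecutorEvent.KEY_EXCEPTION` are taken
# from the code above.
#
#   def _handle_next_event(executor, live_trials, next_trial_exists):
#       event = executor.get_next_executor_event(live_trials,
#                                                next_trial_exists)
#       if event.type == ExecutorEventType.PG_READY:
#           pass  # start the next pending trial on the ready placement group
#       elif event.type == ExecutorEventType.NO_RUNNING_TRIAL_TIMEOUT:
#           pass  # warn: cluster may not have enough resources for any trial
#       elif event.type == ExecutorEventType.YIELD:
#           pass  # nothing ready yet; let the main loop print progress
#       elif event.type == ExecutorEventType.ERROR:
#           exc = event.result[ExecutorEvent.KEY_EXCEPTION]
#           pass  # process trial failure with `exc`
#       else:  # TRAINING_RESULT, SAVING_RESULT or RESTORING_RESULT
#           result = event.result[ExecutorEvent.KEY_FUTURE_RESULT]
#           pass  # process the fetched result for `event.trial`
# ---------------------------------------------------------------------------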
class RayTrialExecutor(TrialExecutor): """An implementation of TrialExecutor based on Ray.""" def __init__(self, queue_trials: bool = False, reuse_actors: bool = False, refresh_period: Optional[float] = None, wait_for_placement_group: Optional[float] = None): super(RayTrialExecutor, self).__init__(queue_trials) # Check for if we are launching a trial without resources in kick off # autoscaler. self._trial_queued = False self._running = {} # Since trial resume after paused should not run # trial.train.remote(), thus no more new remote object ref generated. # We use self._paused to store paused trials here. self._paused = {} self._trial_cleanup = _TrialCleanup() self._has_cleaned_up_pgs = False self._reuse_actors = reuse_actors self._cached_actor_pg = (None, None) self._avail_resources = Resources(cpu=0, gpu=0) self._committed_resources = Resources(cpu=0, gpu=0) self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix()) self._staged_trials = set() self._just_staged_trials = set() self._trial_just_finished = False self._trial_just_finished_before = False self._resources_initialized = False if refresh_period is None: refresh_period = float( os.environ.get("TUNE_STATE_REFRESH_PERIOD", TUNE_STATE_REFRESH_PERIOD)) self._refresh_period = refresh_period self._wait_for_pg = wait_for_placement_group or float( os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S", "-1")) if self._wait_for_pg < 0: self._wait_for_pg = None self._buffer_length = int(os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1000)) self._buffer_min_time_s = float( os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.)) self._buffer_max_time_s = float( os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.)) self._last_resource_refresh = float("-inf") self._last_ip_refresh = float("-inf") self._last_ip_addresses = set() self._last_nontrivial_wait = time.time() if ray.is_initialized(): self._update_avail_resources() def in_staging_grace_period(self) -> bool: """Returns True if trials have recently been staged.""" return self._pg_manager.in_staging_grace_period() def set_max_pending_trials(self, max_pending: int): self._pg_manager.set_max_staging(max_pending) def stage_and_update_status(self, trials: List[Trial]): """Check and update statuses of scheduled placement groups. Stages placement groups of all trials. """ if not self._has_cleaned_up_pgs: # Clean up existing placement groups after trigger the tuning # run step() method for the first time self._pg_manager.cleanup_existing_pg() self._has_cleaned_up_pgs = True for trial in trials: if trial.status != Trial.PENDING: continue if not trial.uses_placement_groups: continue if trial in self._staged_trials: continue if self._pg_manager.trial_in_use(trial): continue if not self._pg_manager.stage_trial_pg(trial): # Break if we reached the limit of pending placement groups. break self._staged_trials.add(trial) self._just_staged_trials.add(trial) self._pg_manager.update_status() def get_staged_trial(self): """Get a trial whose placement group was successfully staged. Can also return None if no trial is available. Returns: Trial object or None. 
""" for trial in self._staged_trials: if self._pg_manager.has_ready(trial): return trial return None def _setup_remote_runner(self, trial): trial.init_logdir() # We checkpoint metadata here to try mitigating logdir duplication self.try_checkpoint_metadata(trial) logger_creator = partial(noop_logger_creator, logdir=trial.logdir) if self._reuse_actors and self._cached_actor_pg[0] is not None: logger.debug(f"Trial {trial}: Reusing cached runner " f"{self._cached_actor_pg[0]}") existing_runner, pg = self._cached_actor_pg self._cached_actor_pg = (None, None) trial.set_runner(existing_runner) if pg and trial.uses_placement_groups: self._pg_manager.assign_cached_pg(pg, trial) if not self.reset_trial(trial, trial.config, trial.experiment_tag, logger_creator): raise AbortTrialExecution( "Trainable runner reuse requires reset_config() to be " "implemented and return True.") return existing_runner if self._cached_actor_pg[0]: logger.debug("Cannot reuse cached runner {} for new trial".format( self._cached_actor_pg[0])) existing_runner, pg = self._cached_actor_pg if pg: self._pg_manager.return_or_clean_cached_pg(pg) with self._change_working_directory(trial): self._trial_cleanup.add(trial, actor=existing_runner) self._cached_actor_pg = (None, None) trainable_cls = trial.get_trainable_cls() if not trainable_cls: raise AbortTrialExecution( f"Invalid trainable: {trial.trainable_name}. If you passed " f"a string, make sure the trainable was registered before.") _actor_cls = _class_cache.get(trainable_cls) if trial.uses_placement_groups: if not self._pg_manager.has_ready(trial, update=True): if trial not in self._staged_trials: if self._pg_manager.stage_trial_pg(trial): self._staged_trials.add(trial) self._just_staged_trials.add(trial) just_staged = trial in self._just_staged_trials # This part of the code is mostly here for testing # purposes. If self._wait_for_pg is set, we will wait here # for that many seconds until the placement group is ready. # This ensures that the trial can be started right away and # not just in the next step() of the trial runner. # We only do this if we have reason to believe that resources # will be ready, soon, i.e. when a) we just staged the PG, # b) another trial just exited, freeing resources, or c) # when there are no currently running trials. if self._wait_for_pg is not None and ( just_staged or self._trial_just_finished_before or not self.get_running_trials()): logger.debug( f"Waiting up to {self._wait_for_pg} seconds for " f"placement group of trial {trial} to become ready.") wait_end = time.monotonic() + self._wait_for_pg while time.monotonic() < wait_end: self._pg_manager.update_status() if self._pg_manager.has_ready(trial): break time.sleep(0.1) else: return None if not self._pg_manager.has_ready(trial): # PG may have become ready during waiting period return None full_actor_class = self._pg_manager.get_full_actor_cls( trial, _actor_cls) else: full_actor_class = _actor_cls.options( num_cpus=trial.resources.cpu, num_gpus=trial.resources.gpu, memory=trial.resources.memory or None, object_store_memory=trial.resources.object_store_memory or None, resources=trial.resources.custom_resources) # Clear the Trial's location (to be updated later on result) # since we don't know where the remote runner is placed. trial.set_location(Location()) logger.debug("Trial %s: Setting up new remote runner.", trial) # Logging for trials is handled centrally by TrialRunner, so # configure the remote runner to use a noop-logger. 
trial_config = copy.deepcopy(trial.config) trial_config[TRIAL_INFO] = TrialInfo(trial) stdout_file, stderr_file = trial.log_to_file trial_config[STDOUT_FILE] = stdout_file trial_config[STDERR_FILE] = stderr_file kwargs = { "config": trial_config, "logger_creator": logger_creator, } if issubclass(trial.get_trainable_cls(), DurableTrainable): kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir with self._change_working_directory(trial): return full_actor_class.remote(**kwargs) def _train(self, trial): """Start one iteration of training and save remote id.""" if self._find_item(self._paused, trial): raise TuneError( "Should not call `train` on PAUSED trial {}. " "This is an internal error - please file an issue " "on https://github.com/ray-project/ray/issues/.".format( str(trial))) if self._find_item(self._running, trial): logging.debug( "Trial {} already has a queued future. Skipping this " "`train` call. This may occur if a trial has " "been unpaused within a scheduler callback.".format( str(trial))) return assert trial.status == Trial.RUNNING, trial.status buffer_time_s = max( self._buffer_min_time_s, min(self._buffer_max_time_s, len(self._running) // 10)) with self._change_working_directory(trial): if self._buffer_length > 1: buffer_length = self._buffer_length if trial.checkpoint_freq > 0: buffer_length = min(buffer_length, trial.checkpoint_freq) remote = trial.runner.train_buffered.remote( buffer_time_s, buffer_length) else: remote = trial.runner.train.remote() # Local Mode if isinstance(remote, dict): remote = _LocalWrapper(remote) self._running[remote] = trial trial_item = self._find_item(self._running, trial) assert len(trial_item) < 2, trial_item def _start_trial(self, trial, checkpoint=None, runner=None, train=True) -> bool: """Starts trial and restores last result if trial was paused. Args: trial (Trial): The trial to start. checkpoint (Optional[Checkpoint]): The checkpoint to restore from. If None, and no trial checkpoint exists, the trial is started from the beginning. runner (Trainable): The remote runner to use. This can be the cached actor. If None, a new runner is created. train (bool): Whether or not to start training. Returns: True if trial was started successfully, False otherwise. See `RayTrialExecutor.restore` for possible errors raised. """ prior_status = trial.status self.set_status(trial, Trial.PENDING) if runner is None: runner = self._setup_remote_runner(trial) if not runner: return False trial.set_runner(runner) self.restore(trial, checkpoint) self.set_status(trial, Trial.RUNNING) if trial in self._staged_trials: self._staged_trials.remove(trial) previous_run = self._find_item(self._paused, trial) if prior_status == Trial.PAUSED and previous_run: # If Trial was in flight when paused, self._paused stores result. self._paused.pop(previous_run[0]) self._running[previous_run[0]] = trial elif train and not trial.is_restoring: self._train(trial) return True def _stop_trial(self, trial, error=False, error_msg=None): """Stops this trial. Stops this trial, releasing all allocating resources. If stopping the trial fails, the run will be marked as terminated in error, but no exception will be thrown. If the trial should be paused (``pause=True``), we do not remove its placement group (or a surrogate placement group). Args: error (bool): Whether to mark this trial as terminated in error. error_msg (str): Optional error message. 
""" self.set_status(trial, Trial.ERROR if error else Trial.TERMINATED) self._trial_just_finished = True trial.set_location(Location()) try: trial.write_error_log(error_msg) if hasattr(trial, "runner") and trial.runner: if (not error and self._reuse_actors and self._cached_actor_pg[0] is None): logger.debug("Reusing actor for %s", trial.runner) # Move PG into cache (disassociate from trial) pg = self._pg_manager.cache_trial_pg(trial) if pg or not trial.uses_placement_groups: # True if a placement group was replaced self._cached_actor_pg = (trial.runner, pg) should_destroy_actor = False else: # False if no placement group was replaced. This should # only be the case if there are no more trials with # this placement group factory to run logger.debug( "Could not cache of trial {trial} actor for " "reuse, as there are no pending trials " "requiring its resources.") should_destroy_actor = True else: should_destroy_actor = True if should_destroy_actor: logger.debug("Trial %s: Destroying actor.", trial) # Try to return the placement group for other trials to use self._pg_manager.return_pg(trial) with self._change_working_directory(trial): self._trial_cleanup.add(trial, actor=trial.runner) if trial in self._staged_trials: self._staged_trials.remove(trial) except Exception: logger.exception("Trial %s: Error stopping runner.", trial) self.set_status(trial, Trial.ERROR) finally: trial.set_runner(None) def start_trial(self, trial, checkpoint=None, train=True) -> bool: """Starts the trial. Will not return resources if trial repeatedly fails on start. Args: trial (Trial): Trial to be started. checkpoint (Checkpoint): A Python object or path storing the state of trial. train (bool): Whether or not to start training. Returns: True if the remote runner has been started. False if trial was not started (e.g. because of lacking resources/pending PG). """ if not trial.uses_placement_groups: self._commit_resources(trial.resources) try: return self._start_trial(trial, checkpoint, train=train) except AbortTrialExecution: logger.exception("Trial %s: Error starting runner, aborting!", trial) time.sleep(2) error_msg = traceback.format_exc() self._stop_trial(trial, error=True, error_msg=error_msg) return False except Exception: logger.exception("Trial %s: Unexpected error starting runner.", trial) time.sleep(2) error_msg = traceback.format_exc() self._stop_trial(trial, error=True, error_msg=error_msg) # Note that we don't return the resources, since they may # have been lost. TODO(ujvl): is this the right thing to do? return False def _find_item(self, dictionary, item): out = [rid for rid, t in dictionary.items() if t is item] return out def stop_trial(self, trial, error=False, error_msg=None): """Only returns resources if resources allocated.""" prior_status = trial.status self._stop_trial(trial, error=error, error_msg=error_msg) if prior_status == Trial.RUNNING: logger.debug("Trial %s: Returning resources.", trial) if not trial.uses_placement_groups: self._return_resources(trial.resources) out = self._find_item(self._running, trial) for result_id in out: self._running.pop(result_id) def continue_training(self, trial): """Continues the training of this trial.""" self._train(trial) def pause_trial(self, trial): """Pauses the trial. If trial is in-flight, preserves return value in separate queue before pausing, which is restored when Trial is resumed. 
""" trial_future = self._find_item(self._running, trial) if trial_future: self._paused[trial_future[0]] = trial super(RayTrialExecutor, self).pause_trial(trial) def reset_trial(self, trial, new_config, new_experiment_tag, logger_creator=None): """Tries to invoke `Trainable.reset()` to reset trial. Args: trial (Trial): Trial to be reset. new_config (dict): New configuration for Trial trainable. new_experiment_tag (str): New experiment name for trial. logger_creator (Optional[Callable[[Dict], Logger]]): Function that instantiates a logger on the actor process. Returns: True if `reset_config` is successful else False. """ trial.set_experiment_tag(new_experiment_tag) trial.set_config(new_config) trainable = trial.runner # Pass magic variables extra_config = copy.deepcopy(new_config) extra_config[TRIAL_INFO] = TrialInfo(trial) stdout_file, stderr_file = trial.log_to_file extra_config[STDOUT_FILE] = stdout_file extra_config[STDERR_FILE] = stderr_file with self._change_working_directory(trial): with warn_if_slow("reset"): try: reset_val = ray.get(trainable.reset.remote( extra_config, logger_creator), timeout=DEFAULT_GET_TIMEOUT) except GetTimeoutError: logger.exception("Trial %s: reset timed out.", trial) return False return reset_val def get_running_trials(self): """Returns the running trials.""" return list(self._running.values()) def get_alive_node_ips(self): now = time.time() if now - self._last_ip_refresh < self._refresh_period: return self._last_ip_addresses logger.debug("Checking ips from Ray state.") self._last_ip_refresh = now nodes = ray.state.nodes() ip_addresses = set() for node in nodes: if node["alive"]: ip_addresses.add(node["NodeManagerAddress"]) self._last_ip_addresses = ip_addresses return ip_addresses def get_current_trial_ips(self): return {t.node_ip for t in self.get_running_trials()} def get_next_failed_trial(self): """Gets the first trial found to be running on a node presumed dead. Returns: A Trial object that is ready for failure processing. None if no failure detected. """ if ray.worker._mode() != ray.worker.LOCAL_MODE: live_cluster_ips = self.get_alive_node_ips() if live_cluster_ips - self.get_current_trial_ips(): for trial in self.get_running_trials(): if trial.node_ip and trial.node_ip not in live_cluster_ips: return trial return None def get_next_available_trial(self, timeout: Optional[float] = None): if not self._running: return None shuffled_results = list(self._running.keys()) random.shuffle(shuffled_results) # Note: We shuffle the results because `ray.wait` by default returns # the first available result, and we want to guarantee that slower # trials (i.e. trials that run remotely) also get fairly reported. # See https://github.com/ray-project/ray/issues/4211 for details. start = time.time() ready, _ = ray.wait(shuffled_results, timeout=timeout) if not ready: return None result_id = ready[0] wait_time = time.time() - start if wait_time > NONTRIVIAL_WAIT_TIME_THRESHOLD_S: self._last_nontrivial_wait = time.time() if time.time() - self._last_nontrivial_wait > BOTTLENECK_WARN_PERIOD_S: logger.warning( "Over the last {} seconds, the Tune event loop has been " "backlogged processing new results. Consider increasing your " "period of result reporting to improve performance.".format( BOTTLENECK_WARN_PERIOD_S)) self._last_nontrivial_wait = time.time() return self._running[result_id] def fetch_result(self, trial): """Fetches result list of the running trials. Returns: Result of the most recent trial training run. 
""" trial_future = self._find_item(self._running, trial) if not trial_future: raise ValueError("Trial was not running.") self._running.pop(trial_future[0]) with warn_if_slow("fetch_result"): result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT) # For local mode if isinstance(result, _LocalWrapper): result = result.unwrap() if not isinstance(result, list): return [result] return result def _commit_resources(self, resources): committed = self._committed_resources all_keys = set(resources.custom_resources).union( set(committed.custom_resources)) custom_resources = { k: committed.get(k) + resources.get_res_total(k) for k in all_keys } self._committed_resources = Resources( committed.cpu + resources.cpu_total(), committed.gpu + resources.gpu_total(), committed.memory + resources.memory_total(), committed.object_store_memory + resources.object_store_memory_total(), custom_resources=custom_resources) def _return_resources(self, resources): committed = self._committed_resources all_keys = set(resources.custom_resources).union( set(committed.custom_resources)) custom_resources = { k: committed.get(k) - resources.get_res_total(k) for k in all_keys } self._committed_resources = Resources( committed.cpu - resources.cpu_total(), committed.gpu - resources.gpu_total(), custom_resources=custom_resources) assert self._committed_resources.is_nonnegative(), ( "Resource invalid: {}".format(resources)) def _update_avail_resources(self, num_retries=5): if time.time() - self._last_resource_refresh < self._refresh_period: return logger.debug("Checking Ray cluster resources.") resources = None for i in range(num_retries): if i > 0: logger.warning( "Cluster resources not detected or are 0. Attempt #" "%s...", i + 1) time.sleep(0.5) try: resources = ray.cluster_resources() except Exception as exc: # TODO(rliaw): Remove this when local mode is fixed. # https://github.com/ray-project/ray/issues/4147 logger.debug(f"{exc}: Using resources for local machine.") resources = ResourceSpec().resolve(True).to_resource_dict() if resources: break if not resources: # NOTE: This hides the possibility that Ray may be waiting for # clients to connect. resources.setdefault("CPU", 0) resources.setdefault("GPU", 0) logger.warning("Cluster resources cannot be detected or are 0. " "You can resume this experiment by passing in " "`resume=True` to `run`.") resources = resources.copy() num_cpus = resources.pop("CPU", 0) num_gpus = resources.pop("GPU", 0) memory = ray_constants.from_memory_units(resources.pop("memory", 0)) object_store_memory = ray_constants.from_memory_units( resources.pop("object_store_memory", 0)) custom_resources = resources self._avail_resources = Resources( int(num_cpus), int(num_gpus), memory=int(memory), object_store_memory=int(object_store_memory), custom_resources=custom_resources) self._last_resource_refresh = time.time() self._resources_initialized = True def has_resources_for_trial(self, trial: Trial): """Returns whether this runner has resources available for this trial. If using placement groups, this will return True as long as we didn't reach the maximum number of pending trials. It will also return True if the trial placement group is already staged. Args: trial: Trial object which should be scheduled. 
Returns: boolean """ if trial.uses_placement_groups: return trial in self._staged_trials or self._pg_manager.can_stage( ) or self._pg_manager.has_ready(trial, update=True) return self.has_resources(trial.resources) def has_resources(self, resources): """Returns whether this runner has at least the specified resources. This refreshes the Ray cluster resources if the time since last update has exceeded self._refresh_period. This also assumes that the cluster is not resizing very frequently. """ if resources.has_placement_group: return self._pg_manager.can_stage() self._update_avail_resources() currently_available = Resources.subtract(self._avail_resources, self._committed_resources) have_space = ( resources.cpu_total() <= currently_available.cpu and resources.gpu_total() <= currently_available.gpu and resources.memory_total() <= currently_available.memory and resources.object_store_memory_total() <= currently_available.object_store_memory and all( resources.get_res_total(res) <= currently_available.get(res) for res in resources.custom_resources)) if have_space: # The assumption right now is that we block all trials if one # trial is queued. self._trial_queued = False return True can_overcommit = self._queue_trials and not self._trial_queued if can_overcommit: self._trial_queued = True logger.warning( "Allowing trial to start even though the " "cluster does not have enough free resources. Trial actors " "may appear to hang until enough resources are added to the " "cluster (e.g., via autoscaling). You can disable this " "behavior by specifying `queue_trials=False` in " "ray.tune.run().") return True return False def debug_string(self): """Returns a human readable message for printing to the console.""" total_resources = self._pg_manager.total_used_resources( self._committed_resources) if self._resources_initialized: status = ("Resources requested: {}/{} CPUs, {}/{} GPUs, " "{}/{} GiB heap, {}/{} GiB objects".format( total_resources.pop("CPU", 0), self._avail_resources.cpu, total_resources.pop("GPU", 0), self._avail_resources.gpu, _to_gb(total_resources.pop("memory", 0.)), _to_gb(self._avail_resources.memory), _to_gb(total_resources.pop("object_store_memory", 0.)), _to_gb(self._avail_resources.object_store_memory))) customs = ", ".join([ "{}/{} {}".format(total_resources.get(name, 0.), self._avail_resources.get_res_total(name), name) for name in self._avail_resources.custom_resources if not name.startswith(ray.resource_spec.NODE_ID_PREFIX) ]) if customs: status += " ({})".format(customs) return status else: return "Resources requested: ?" def resource_string(self): """Returns a string describing the total resources available.""" if self._resources_initialized: res_str = ("{} CPUs, {} GPUs, " "{} GiB heap, {} GiB objects".format( self._avail_resources.cpu, self._avail_resources.gpu, _to_gb(self._avail_resources.memory), _to_gb(self._avail_resources.object_store_memory))) if self._avail_resources.custom_resources: custom = ", ".join( "{} {}".format(self._avail_resources.get_res_total(name), name) for name in self._avail_resources.custom_resources) res_str += " ({})".format(custom) return res_str else: return "? CPUs, ? 
GPUs" def on_step_begin(self, trial_runner): """Before step() called, update the available resources.""" self._update_avail_resources() self._trial_just_finished_before = self._trial_just_finished self._trial_just_finished = False def on_step_end(self, trial_runner): self._just_staged_trials.clear() self._pg_manager.reconcile_placement_groups(trial_runner.get_trials()) self._pg_manager.cleanup() def save(self, trial, storage=Checkpoint.PERSISTENT, result=None): """Saves the trial's state to a checkpoint asynchronously. Args: trial (Trial): The trial to be saved. storage (str): Where to store the checkpoint. Defaults to PERSISTENT. result (dict): The state of this trial as a dictionary to be saved. If result is None, the trial's last result will be used. Returns: Checkpoint object, or None if an Exception occurs. """ result = result or trial.last_result with self._change_working_directory(trial): if storage == Checkpoint.MEMORY: value = trial.runner.save_to_object.remote() checkpoint = Checkpoint(storage, value, result) trial.on_checkpoint(checkpoint) else: value = trial.runner.save.remote() checkpoint = Checkpoint(storage, value, result) trial.saving_to = checkpoint self._running[value] = trial return checkpoint def restore(self, trial, checkpoint=None, block=False): """Restores training state from a given model checkpoint. Args: trial (Trial): The trial to be restored. checkpoint (Checkpoint): The checkpoint to restore from. If None, the most recent PERSISTENT checkpoint is used. Defaults to None. block (bool): Whether or not to block on restore before returning. Raises: RuntimeError: This error is raised if no runner is found. AbortTrialExecution: This error is raised if the trial is ineligible for restoration, given the Tune input arguments. """ if checkpoint is None or checkpoint.value is None: checkpoint = trial.checkpoint if checkpoint.value is None: return if trial.runner is None: raise RuntimeError( "Trial {}: Unable to restore - no runner found.".format(trial)) value = checkpoint.value if checkpoint.storage == Checkpoint.MEMORY: logger.debug("Trial %s: Attempting restore from object", trial) # Note that we don't store the remote since in-memory checkpoints # don't guarantee fault tolerance and don't need to be waited on. with self._change_working_directory(trial): trial.runner.restore_from_object.remote(value) else: logger.debug("Trial %s: Attempting restore from %s", trial, value) if issubclass(trial.get_trainable_cls(), DurableTrainable) or not trial.sync_on_checkpoint: with self._change_working_directory(trial): remote = trial.runner.restore.remote(value) elif trial.sync_on_checkpoint: # This provides FT backwards compatibility in the # case where a DurableTrainable is not provided. logger.debug("Trial %s: Reading checkpoint into memory", trial) obj = TrainableUtil.checkpoint_to_object(value) with self._change_working_directory(trial): remote = trial.runner.restore_from_object.remote(obj) else: raise AbortTrialExecution( "Pass in `sync_on_checkpoint=True` for driver-based trial" "restoration. Pass in an `upload_dir` and a Trainable " "extending `DurableTrainable` for remote storage-based " "restoration") if block: ray.get(remote) else: self._running[remote] = trial trial.restoring_from = checkpoint def export_trial_if_needed(self, trial): """Exports model of this trial based on trial.export_formats. Return: A dict that maps ExportFormats to successfully exported models. 
""" if trial.export_formats and len(trial.export_formats) > 0: with self._change_working_directory(trial): return ray.get(trial.runner.export_model.remote( trial.export_formats), timeout=DEFAULT_GET_TIMEOUT) return {} def has_gpus(self): if self._resources_initialized: self._update_avail_resources() return self._avail_resources.gpu > 0 def cleanup(self): self._trial_cleanup.cleanup(partial=False) self._pg_manager.cleanup(force=True) self._pg_manager.cleanup_existing_pg(block=True) @contextmanager def _change_working_directory(self, trial): """Context manager changing working directory to trial logdir. Used in local mode. For non-local mode it is no-op. """ if ray.worker._mode() == ray.worker.LOCAL_MODE: old_dir = os.getcwd() try: os.chdir(trial.logdir) yield finally: os.chdir(old_dir) else: yield
class RayTrialExecutor(TrialExecutor): """An implementation of TrialExecutor based on Ray.""" def __init__(self, reuse_actors: bool = False, result_buffer_length: Optional[int] = None, refresh_period: Optional[float] = None, wait_for_placement_group: Optional[float] = None): super(RayTrialExecutor, self).__init__() self._running = {} force_trial_cleanup = int( os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0")) self._trial_cleanup = _TrialCleanup(force_cleanup=force_trial_cleanup) self._has_cleaned_up_pgs = False self._reuse_actors = reuse_actors # The maxlen will be updated when `set_max_pending_trials()` is called self._cached_actor_pg = deque(maxlen=1) self._avail_resources = Resources(cpu=0, gpu=0) self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix()) self._staged_trials = set() self._just_staged_trials = set() self._trial_just_finished = False self._trial_just_finished_before = False self._resources_initialized = False if refresh_period is None: refresh_period = float( os.environ.get("TUNE_STATE_REFRESH_PERIOD", TUNE_STATE_REFRESH_PERIOD)) self._refresh_period = refresh_period self._wait_for_pg = wait_for_placement_group or float( os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S", "-1")) if self._wait_for_pg < 0: self._wait_for_pg = None self.last_pg_recon = 0 self.pg_recon_interval = float( os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5")) self._buffer_length = result_buffer_length or int( os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1)) self._buffer_min_time_s = float( os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.)) self._buffer_max_time_s = float( os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.)) self._last_resource_refresh = float("-inf") self._last_ip_refresh = float("-inf") self._last_ip_addresses = set() self._last_nontrivial_wait = time.time() if ray.is_initialized(): self._update_avail_resources() def in_staging_grace_period(self) -> bool: """Returns True if trials have recently been staged.""" return self._pg_manager.in_staging_grace_period() def set_max_pending_trials(self, max_pending: int) -> None: if len(self._cached_actor_pg) > 0: logger.warning( "Cannot update maximum number of queued actors for reuse " "during a run.") else: self._cached_actor_pg = deque(maxlen=max_pending) self._pg_manager.set_max_staging(max_pending) def stage_and_update_status(self, trials: Iterable[Trial]): """Check and update statuses of scheduled placement groups. Stages placement groups of all trials. """ if not self._has_cleaned_up_pgs: # Clean up existing placement groups after trigger the tuning # run step() method for the first time self._pg_manager.cleanup_existing_pg() self._has_cleaned_up_pgs = True for trial in trials: if trial.status != Trial.PENDING: continue if trial in self._staged_trials: continue if self._pg_manager.trial_in_use(trial): continue if not self._pg_manager.stage_trial_pg(trial): # Break if we reached the limit of pending placement groups. break self._staged_trials.add(trial) self._just_staged_trials.add(trial) self._pg_manager.update_status() def get_staged_trial(self): """Get a trial whose placement group was successfully staged. Can also return None if no trial is available. Returns: Trial object or None. 
""" for trial in self._staged_trials: if self._pg_manager.has_ready(trial): return trial return None def _setup_remote_runner(self, trial): trial.init_logdir() # We checkpoint metadata here to try mitigating logdir duplication self._trials_to_cache.add(trial) logger_creator = partial(noop_logger_creator, logdir=trial.logdir) if len(self._cached_actor_pg) > 0: assert self._reuse_actors existing_runner, pg = self._cached_actor_pg.popleft() logger.debug(f"Trial {trial}: Reusing cached runner " f"{existing_runner}") trial.set_runner(existing_runner) if pg: self._pg_manager.assign_cached_pg(pg, trial) if not self.reset_trial(trial, trial.config, trial.experiment_tag, logger_creator): raise AbortTrialExecution( "Trainable runner reuse requires reset_config() to be " "implemented and return True.") return existing_runner trainable_cls = trial.get_trainable_cls() if not trainable_cls: raise AbortTrialExecution( f"Invalid trainable: {trial.trainable_name}. If you passed " f"a string, make sure the trainable was registered before.") _actor_cls = _class_cache.get(trainable_cls) if not self._pg_manager.has_ready(trial, update=True): if trial not in self._staged_trials: if self._pg_manager.stage_trial_pg(trial): self._staged_trials.add(trial) self._just_staged_trials.add(trial) just_staged = trial in self._just_staged_trials # This part of the code is mostly here for testing # purposes. If self._wait_for_pg is set, we will wait here # for that many seconds until the placement group is ready. # This ensures that the trial can be started right away and # not just in the next step() of the trial runner. # We only do this if we have reason to believe that resources # will be ready, soon, i.e. when a) we just staged the PG, # b) another trial just exited, freeing resources, or c) # when there are no currently running trials. if self._wait_for_pg is not None and ( just_staged or self._trial_just_finished_before or not self.get_running_trials()): logger.debug( f"Waiting up to {self._wait_for_pg} seconds for " f"placement group of trial {trial} to become ready.") wait_end = time.monotonic() + self._wait_for_pg while time.monotonic() < wait_end: self._pg_manager.update_status() if self._pg_manager.has_ready(trial): break time.sleep(0.1) else: return None if not self._pg_manager.has_ready(trial): # PG may have become ready during waiting period return None full_actor_class = self._pg_manager.get_full_actor_cls( trial, _actor_cls) # Clear the Trial's location (to be updated later on result) # since we don't know where the remote runner is placed. trial.set_location(Location()) logger.debug("Trial %s: Setting up new remote runner.", trial) # Logging for trials is handled centrally by TrialRunner, so # configure the remote runner to use a noop-logger. 
        trial_config = copy.deepcopy(trial.config)
        trial_config[TRIAL_INFO] = TrialInfo(trial)

        stdout_file, stderr_file = trial.log_to_file
        trial_config[STDOUT_FILE] = stdout_file
        trial_config[STDERR_FILE] = stderr_file
        kwargs = {
            "config": trial_config,
            "logger_creator": logger_creator,
        }
        if trial.uses_cloud_checkpointing:
            # We keep these kwargs separate for backwards compatibility
            # with trainables that don't provide these keyword arguments
            kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir
            kwargs["sync_function_tpl"] = trial.sync_function_tpl

            # Throw a meaningful error if trainable does not use the
            # new API
            sig = inspect.signature(trial.get_trainable_cls())
            try:
                sig.bind_partial(**kwargs)
            except Exception as e:
                raise RuntimeError(
                    "Your trainable class does not accept a "
                    "`remote_checkpoint_dir` or `sync_function_tpl` argument "
                    "in its constructor, but you've passed an "
                    "`upload_dir` to your SyncConfig. Without accepting "
                    "these parameters and passing them to the base trainable "
                    "constructor in the init call, cloud checkpointing is "
                    "effectively disabled. To resolve this issue, add the "
                    "parameters to your trainable class constructor or "
                    "disable cloud checkpointing by setting "
                    "`upload_dir=None`.") from e

        with self._change_working_directory(trial):
            return full_actor_class.remote(**kwargs)

    def _train(self, trial):
        """Start one iteration of training and save remote id."""

        if self._find_item(self._running, trial):
            logger.debug(
                "Trial {} already has a queued future. Skipping this "
                "`train` call. This may occur if a trial has "
                "been unpaused within a scheduler callback.".format(
                    str(trial)))
            return

        assert trial.status == Trial.RUNNING, trial.status
        buffer_time_s = max(
            self._buffer_min_time_s,
            min(self._buffer_max_time_s,
                len(self._running) // 10))
        with self._change_working_directory(trial):
            buffer_length = self._buffer_length
            if buffer_length > 1 and trial.checkpoint_at_end:
                # If a trial checkpoint can be triggered externally,
                # it is not safe to buffer results.
                if log_once("trial_executor_buffer_checkpoint"):
                    logger.warning(
                        "Disabling buffered training as you passed "
                        "`checkpoint_at_end` to `tune.run()`.")
                buffer_length = 1

            if buffer_length > 1:
                if trial.checkpoint_freq > 0:
                    buffer_length = min(buffer_length, trial.checkpoint_freq)
                remote = trial.runner.train_buffered.remote(
                    buffer_time_s, buffer_length)
            else:
                remote = trial.runner.train.remote()

        # Local Mode
        if isinstance(remote, dict):
            remote = _LocalWrapper(remote)

        self._running[remote] = trial
        trial_item = self._find_item(self._running, trial)
        assert len(trial_item) < 2, trial_item

    def _start_trial(self, trial) -> bool:
        """Starts trial and restores last result if trial was paused.

        Args:
            trial (Trial): The trial to start.

        Returns:
            True if trial was started successfully, False otherwise.

        See `RayTrialExecutor.restore` for possible errors raised.
""" self.set_status(trial, Trial.PENDING) runner = self._setup_remote_runner(trial) if not runner: return False trial.set_runner(runner) self._notify_trainable_of_new_resources_if_needed(trial) self.restore(trial) self.set_status(trial, Trial.RUNNING) if trial in self._staged_trials: self._staged_trials.remove(trial) if not trial.is_restoring: self._train(trial) return True def _notify_trainable_of_new_resources_if_needed(self, trial: Trial): if trial.has_new_resources: trainable = trial.runner trial.has_new_resources = False with self._change_working_directory(trial): with warn_if_slow("update_resources"): try: ray.get( trainable._update_resources.remote( trial.placement_group_factory), timeout=DEFAULT_GET_TIMEOUT) except GetTimeoutError: logger.exception( "Trial %s: updating resources timed out.", trial) def _stop_trial(self, trial: Trial, error=False, error_msg=None): """Stops this trial. Stops this trial, releasing all allocating resources. If stopping the trial fails, the run will be marked as terminated in error, but no exception will be thrown. Args: error (bool): Whether to mark this trial as terminated in error. error_msg (str): Optional error message. """ self.set_status(trial, Trial.ERROR if error else Trial.TERMINATED) self._trial_just_finished = True trial.set_location(Location()) try: trial.write_error_log(error_msg) if hasattr(trial, "runner") and trial.runner: if (not error and self._reuse_actors and (len(self._cached_actor_pg) < (self._cached_actor_pg.maxlen or float("inf")))): logger.debug("Reusing actor for %s", trial.runner) # Move PG into cache (disassociate from trial) pg = self._pg_manager.cache_trial_pg(trial) if pg: # True if a placement group was replaced self._cached_actor_pg.append((trial.runner, pg)) should_destroy_actor = False else: # False if no placement group was replaced. This should # only be the case if there are no more trials with # this placement group factory to run logger.debug( "Could not cache of trial {trial} actor for " "reuse, as there are no pending trials " "requiring its resources.") should_destroy_actor = True else: should_destroy_actor = True if should_destroy_actor: logger.debug("Trial %s: Destroying actor.", trial) # Try to return the placement group for other trials to use self._pg_manager.return_pg(trial) with self._change_working_directory(trial): self._trial_cleanup.add(trial, actor=trial.runner) if trial in self._staged_trials: self._staged_trials.remove(trial) except Exception: logger.exception("Trial %s: Error stopping runner.", trial) self.set_status(trial, Trial.ERROR) finally: trial.set_runner(None) def start_trial(self, trial: Trial) -> bool: """Starts the trial. Will not return resources if trial repeatedly fails on start. Args: trial (Trial): Trial to be started. Returns: True if the remote runner has been started. False if trial was not started (e.g. because of lacking resources/pending PG). """ try: return self._start_trial(trial) except AbortTrialExecution: logger.exception("Trial %s: Error starting runner, aborting!", trial) time.sleep(2) error_msg = traceback.format_exc() self._stop_trial(trial, error=True, error_msg=error_msg) return False except Exception: logger.exception("Trial %s: Unexpected error starting runner.", trial) time.sleep(2) error_msg = traceback.format_exc() self._stop_trial(trial, error=True, error_msg=error_msg) # Note that we don't return the resources, since they may # have been lost. TODO(ujvl): is this the right thing to do? 
            return False

    def _find_item(self, dictionary, item):
        out = [rid for rid, t in dictionary.items() if t is item]
        assert len(out) <= 1, \
            "Expected at most one future per trial at any given time."
        return out

    def stop_trial(self,
                   trial: Trial,
                   error: bool = False,
                   error_msg: Optional[str] = None) -> None:
        prior_status = trial.status
        self._stop_trial(trial, error=error, error_msg=error_msg)
        if prior_status == Trial.RUNNING:
            logger.debug("Trial %s: Returning resources.", trial)
            out = self._find_item(self._running, trial)
            for result_id in out:
                self._running.pop(result_id)

    def continue_training(self, trial: Trial) -> None:
        """Continues the training of this trial."""
        self._train(trial)

    def reset_trial(self,
                    trial: Trial,
                    new_config: Dict,
                    new_experiment_tag: str,
                    logger_creator: Optional[Callable[
                        [Dict], "ray.tune.Logger"]] = None) -> bool:
        """Tries to invoke `Trainable.reset()` to reset trial.

        Args:
            trial (Trial): Trial to be reset.
            new_config (dict): New configuration for Trial trainable.
            new_experiment_tag (str): New experiment name for trial.
            logger_creator (Optional[Callable[[Dict], Logger]]): Function
                that instantiates a logger on the actor process.

        Returns:
            True if `reset_config` is successful else False.
        """
        trial.set_experiment_tag(new_experiment_tag)
        trial.set_config(new_config)
        trainable = trial.runner

        # Pass magic variables
        extra_config = copy.deepcopy(new_config)
        extra_config[TRIAL_INFO] = TrialInfo(trial)

        stdout_file, stderr_file = trial.log_to_file
        extra_config[STDOUT_FILE] = stdout_file
        extra_config[STDERR_FILE] = stderr_file

        with self._change_working_directory(trial):
            with warn_if_slow("reset"):
                try:
                    reset_val = ray.get(
                        trainable.reset.remote(extra_config, logger_creator),
                        timeout=DEFAULT_GET_TIMEOUT)
                except GetTimeoutError:
                    logger.exception("Trial %s: reset timed out.", trial)
                    return False
        return reset_val

    def get_running_trials(self) -> List[Trial]:
        """Returns the running trials."""
        return list(self._running.values())

    def get_next_available_trial(
            self, timeout: Optional[float] = None) -> Optional[Trial]:
        if not self._running:
            return None
        shuffled_results = list(self._running.keys())
        random.shuffle(shuffled_results)

        # Note: We shuffle the results because `ray.wait` by default returns
        # the first available result, and we want to guarantee that slower
        # trials (i.e. trials that run remotely) also get fairly reported.
        # See https://github.com/ray-project/ray/issues/4211 for details.
        start = time.time()
        ready, _ = ray.wait(shuffled_results, timeout=timeout)
        if not ready:
            return None
        result_id = ready[0]
        wait_time = time.time() - start
        if wait_time > NONTRIVIAL_WAIT_TIME_THRESHOLD_S:
            self._last_nontrivial_wait = time.time()
        if time.time() - self._last_nontrivial_wait > BOTTLENECK_WARN_PERIOD_S:
            logger.warning(
                "Over the last {} seconds, the Tune event loop has been "
                "backlogged processing new results. Consider increasing your "
                "period of result reporting to improve performance.".format(
                    BOTTLENECK_WARN_PERIOD_S))

            self._last_nontrivial_wait = time.time()
        return self._running[result_id]

    def fetch_result(self, trial) -> List[Dict]:
        """Fetches result list of the running trials.

        Returns:
            Result of the most recent trial training run.
""" trial_future = self._find_item(self._running, trial) if not trial_future: raise ValueError("Trial was not running.") self._running.pop(trial_future[0]) with warn_if_slow("fetch_result"): result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT) # For local mode if isinstance(result, _LocalWrapper): result = result.unwrap() if not isinstance(result, list): return [result] return result def _update_avail_resources(self, num_retries=5): if time.time() - self._last_resource_refresh < self._refresh_period: return logger.debug("Checking Ray cluster resources.") resources = None for i in range(num_retries): if i > 0: logger.warning( "Cluster resources not detected or are 0. Attempt #" "%s...", i + 1) time.sleep(0.5) resources = ray.cluster_resources() if resources: break if not resources: # NOTE: This hides the possibility that Ray may be waiting for # clients to connect. resources.setdefault("CPU", 0) resources.setdefault("GPU", 0) logger.warning("Cluster resources cannot be detected or are 0. " "You can resume this experiment by passing in " "`resume=True` to `run`.") resources = resources.copy() num_cpus = resources.pop("CPU", 0) num_gpus = resources.pop("GPU", 0) memory = ray_constants.from_memory_units(resources.pop("memory", 0)) object_store_memory = ray_constants.from_memory_units( resources.pop("object_store_memory", 0)) custom_resources = resources self._avail_resources = Resources( int(num_cpus), int(num_gpus), memory=int(memory), object_store_memory=int(object_store_memory), custom_resources=custom_resources) self._last_resource_refresh = time.time() self._resources_initialized = True def has_resources_for_trial(self, trial: Trial) -> bool: """Returns whether there are resources available for this trial. This will return True as long as we didn't reach the maximum number of pending trials. It will also return True if the trial placement group is already staged. Args: trial: Trial object which should be scheduled. Returns: boolean """ return trial in self._staged_trials or self._pg_manager.can_stage( ) or self._pg_manager.has_ready( trial, update=True) def debug_string(self) -> str: """Returns a human readable message for printing to the console.""" total_resources = self._pg_manager.occupied_resources() if self._resources_initialized: status = ("Resources requested: {}/{} CPUs, {}/{} GPUs, " "{}/{} GiB heap, {}/{} GiB objects".format( total_resources.pop("CPU", 0), self._avail_resources.cpu, total_resources.pop("GPU", 0), self._avail_resources.gpu, _to_gb(total_resources.pop("memory", 0.)), _to_gb(self._avail_resources.memory), _to_gb( total_resources.pop("object_store_memory", 0.)), _to_gb(self._avail_resources.object_store_memory))) customs = ", ".join([ "{}/{} {}".format( total_resources.get(name, 0.), self._avail_resources.get_res_total(name), name) for name in self._avail_resources.custom_resources if not name.startswith(NODE_ID_PREFIX) and ( total_resources.get(name, 0.) > 0 or "_group_" not in name) ]) if customs: status += " ({})".format(customs) return status else: return "Resources requested: ?" 
    def on_step_begin(self, trials: List[Trial]) -> None:
        """Before step() is called, update the available resources."""
        self._update_avail_resources()
        self._trial_just_finished_before = self._trial_just_finished
        self._trial_just_finished = False

    def on_step_end(self, trials: List[Trial]) -> None:
        self._just_staged_trials.clear()

        if time.time() > self.last_pg_recon + self.pg_recon_interval:
            # Only do this every now and then - usually the placement groups
            # should not get out of sync, and calling this often is inefficient
            self._pg_manager.reconcile_placement_groups(trials)
            self.last_pg_recon = time.time()

        self._pg_manager.cleanup()

    def force_reconcilation_on_next_step_end(self) -> None:
        self.last_pg_recon = -float("inf")

    def save(self,
             trial,
             storage=Checkpoint.PERSISTENT,
             result: Optional[Dict] = None) -> Checkpoint:
        """Saves the trial's state to a checkpoint asynchronously.

        Args:
            trial (Trial): The trial to be saved.
            storage (str): Where to store the checkpoint. Defaults to
                PERSISTENT.
            result (dict): The state of this trial as a dictionary to be saved.
                If result is None, the trial's last result will be used.

        Returns:
            Checkpoint object, or None if an Exception occurs.
        """
        result = result or trial.last_result
        with self._change_working_directory(trial):
            if storage == Checkpoint.MEMORY:
                value = trial.runner.save_to_object.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.on_checkpoint(checkpoint)
            else:
                value = trial.runner.save.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.saving_to = checkpoint
                self._running[value] = trial
        return checkpoint

    def restore(self, trial) -> None:
        """Restores training state from a given model checkpoint.

        Args:
            trial (Trial): The trial to be restored.

        Raises:
            RuntimeError: This error is raised if no runner is found.
            AbortTrialExecution: This error is raised if the trial is
                ineligible for restoration, given the Tune input arguments.
        """
        checkpoint = trial.checkpoint
        if checkpoint.value is None:
            return
        if trial.runner is None:
            raise RuntimeError(
                "Trial {}: Unable to restore - no runner found.".format(trial))
        value = checkpoint.value
        if checkpoint.storage == Checkpoint.MEMORY:
            logger.debug("Trial %s: Attempting restore from object", trial)
            # Note that we don't store the remote since in-memory checkpoints
            # don't guarantee fault tolerance and don't need to be waited on.
            with self._change_working_directory(trial):
                trial.runner.restore_from_object.remote(value)
        else:
            logger.debug("Trial %s: Attempting restore from %s", trial, value)
            if trial.uses_cloud_checkpointing or not trial.sync_on_checkpoint:
                with self._change_working_directory(trial):
                    remote = trial.runner.restore.remote(value)
            elif trial.sync_on_checkpoint:
                # This provides FT backwards compatibility in the
                # case where no cloud checkpoints are provided.
                logger.debug("Trial %s: Reading checkpoint into memory",
                             trial)
                obj = TrainableUtil.checkpoint_to_object(value)
                with self._change_working_directory(trial):
                    remote = trial.runner.restore_from_object.remote(obj)
            else:
                raise AbortTrialExecution(
                    "Pass in `sync_on_checkpoint=True` for driver-based trial "
                    "restoration. Pass in an `upload_dir` for remote "
                    "storage-based restoration")

            self._running[remote] = trial
            trial.restoring_from = checkpoint

    def export_trial_if_needed(self, trial: Trial) -> Dict:
        """Exports model of this trial based on trial.export_formats.

        Returns:
            A dict that maps ExportFormats to successfully exported models.
""" if trial.export_formats and len(trial.export_formats) > 0: with self._change_working_directory(trial): return ray.get( trial.runner.export_model.remote(trial.export_formats), timeout=DEFAULT_GET_TIMEOUT) return {} def has_gpus(self) -> bool: if self._resources_initialized: self._update_avail_resources() return self._avail_resources.gpu > 0 def cleanup(self, trials: List[Trial]) -> None: self._trial_cleanup.cleanup(partial=False) self._pg_manager.reconcile_placement_groups(trials) self._pg_manager.cleanup(force=True) self._pg_manager.cleanup_existing_pg(block=True) @contextmanager def _change_working_directory(self, trial): """Context manager changing working directory to trial logdir. Used in local mode. For non-local mode it is no-op. """ if ray.worker._mode() == ray.worker.LOCAL_MODE: old_dir = os.getcwd() try: os.chdir(trial.logdir) yield finally: os.chdir(old_dir) else: yield