def _get_trial_location(trial: Trial, result: dict) -> _Location:
    """Return the location to associate with ``trial`` for the given ``result``.

    The node IP and PID reported in ``result`` are preferred, because the
    location stored on the trial itself is reset when the trial terminates.

    Args:
        trial: Trial whose stored location serves as the fallback.
        result: Result dict; looked up under the ``NODE_IP`` and ``PID`` keys.

    Returns:
        A new ``_Location`` built from the reported node IP and PID when both
        are truthy, otherwise ``trial.location``.
    """
    reported_ip = result.get(NODE_IP, None)
    reported_pid = result.get(PID, None)

    if not (reported_ip and reported_pid):
        # Fall back to the trial's own location if there hasn't been a
        # (complete) report yet.
        return trial.location

    return _Location(reported_ip, reported_pid)
def _stop_trial(
    self,
    trial: Trial,
    error: bool = False,
    exc: Optional[Union[TuneError, RayTaskError]] = None,
):
    """Stop ``trial`` and either cache its runner for reuse or tear it down.

    The trial status is set first (``Trial.ERROR`` if ``error`` or ``exc`` is
    truthy, ``Trial.TERMINATED`` otherwise) and its location is reset. If the
    trial has a runner, the runner is either moved into the actor cache
    together with its placement group, or asked to stop remotely. Any
    exception raised while doing so is logged and the trial is marked
    ``Trial.ERROR``; nothing is re-raised. The trial's runner reference is
    always cleared at the end.

    Args:
        trial: Trial to stop.
        error: Whether to mark this trial as terminated in error. A truthy
            value also prevents the runner from being cached for reuse.
        exc: Optional exception. It is passed on to
            ``trial.write_error_log`` and, when truthy, also marks the trial
            as errored.
    """
    final_status = Trial.ERROR if error or exc else Trial.TERMINATED
    self.set_status(trial, final_status)
    self._trial_just_finished = True
    trial.set_location(_Location())

    try:
        trial.write_error_log(exc=exc)

        if not (hasattr(trial, "runner") and trial.runner):
            # No runner attached: nothing to cache or destroy. The
            # ``finally`` clause below still clears the runner reference.
            return

        # NOTE(review): the reuse check only looks at ``error``, not ``exc``,
        # although either one marks the trial as errored above -- confirm
        # that callers always pass ``error=True`` together with ``exc``.
        actor_cached = False
        if (
            not error
            and self._reuse_actors
            and (
                len(self._cached_actor_pg)
                < (self._cached_actor_pg.maxlen or float("inf"))
            )
        ):
            logger.debug("Reusing actor for %s", trial.runner)
            # Move PG into cache (disassociate from trial)
            cached_pg = self._pg_manager.cache_trial_pg(trial)
            if cached_pg:
                # Truthy if a placement group was replaced
                self._cached_actor_pg.append((trial.runner, cached_pg))
                actor_cached = True
            else:
                # Falsy if no placement group was replaced. This should
                # only be the case if there are no more trials with
                # this placement group factory to run
                logger.debug(
                    f"Could not cache actor of trial {trial} for "
                    "reuse, as there are no pending trials "
                    "requiring its resources."
                )

        if not actor_cached:
            logger.debug("Trial %s: Destroying actor.", trial)

            with self._change_working_directory(trial):
                stop_future = trial.runner.stop.remote()

            released_pg = self._pg_manager.remove_from_in_use(trial)
            # Track the pending stop call together with the placement group
            # that was taken out of use for this trial.
            self._futures[stop_future] = (
                _ExecutorEventType.STOP_RESULT,
                released_pg,
            )
            if self._trial_cleanup:
                # force trial cleanup within a deadline
                self._trial_cleanup.add(stop_future)

        self._staged_trials.discard(trial)

    except Exception:
        logger.exception("Trial %s: Error stopping runner.", trial)
        self.set_status(trial, Trial.ERROR)
    finally:
        trial.set_runner(None)
def _setup_remote_runner(self, trial):
    """Return a remote runner for ``trial``, reusing a cached one if possible.

    If the actor cache is non-empty, the oldest cached runner is taken,
    attached to the trial (together with its cached placement group, if any)
    and reset via ``self.reset_trial``. Otherwise a new remote runner is
    created from the trial's trainable class, provided
    ``self._pg_manager.has_ready(trial)`` is true.

    Args:
        trial: Trial to create or reuse a runner for.

    Returns:
        The reused runner, the newly created remote runner, or ``None`` when
        ``self._pg_manager.has_ready(trial)`` is false.

    Raises:
        _AbortTrialExecution: If resetting a cached runner does not succeed,
            or if the trial has no valid trainable class.
        RuntimeError: If cloud checkpointing is used but the trainable's
            constructor signature cannot bind the extra keyword arguments.
    """
    trial.init_logdir()
    # We checkpoint metadata here to try mitigating logdir duplication
    self._trials_to_cache.add(trial)
    logger_creator = partial(noop_logger_creator, logdir=trial.logdir)

    if self._cached_actor_pg:
        assert self._reuse_actors
        existing_runner, cached_pg = self._cached_actor_pg.popleft()
        logger.debug(f"Trial {trial}: Reusing cached runner " f"{existing_runner}")

        trial.set_runner(existing_runner)
        if cached_pg:
            self._pg_manager.assign_cached_pg(cached_pg, trial)

        reset_ok = self.reset_trial(
            trial, trial.config, trial.experiment_tag, logger_creator
        )
        if reset_ok:
            return existing_runner
        raise _AbortTrialExecution(
            "Trainable runner reuse requires reset_config() to be "
            "implemented and return True."
        )

    trainable = trial.get_trainable_cls()
    if not trainable:
        raise _AbortTrialExecution(
            f"Invalid trainable: {trial.trainable_name}. If you passed "
            f"a string, make sure the trainable was registered before."
        )
    remote_cls = _class_cache.get(trainable)

    if not self._pg_manager.has_ready(trial):
        return None

    actor_factory = self._pg_manager.get_full_actor_cls(trial, remote_cls)

    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(_Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)

    # Work on a deep copy: the keys added below are written to the copy,
    # not to ``trial.config``.
    runner_config = copy.deepcopy(trial.config)
    runner_config[TRIAL_INFO] = _TrialInfo(trial)
    runner_config[STDOUT_FILE], runner_config[STDERR_FILE] = trial.log_to_file

    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    init_kwargs = dict(config=runner_config, logger_creator=logger_creator)

    if trial.uses_cloud_checkpointing:
        # We keep these kwargs separate for backwards compatibility
        # with trainables that don't provide these keyword arguments
        init_kwargs.update(
            remote_checkpoint_dir=trial.remote_checkpoint_dir,
            sync_function_tpl=trial.sync_function_tpl,
        )

        # Throw a meaningful error if trainable does not use the
        # new API
        ctor_signature = inspect.signature(trial.get_trainable_cls())
        try:
            ctor_signature.bind_partial(**init_kwargs)
        except Exception as bind_error:
            raise RuntimeError(
                "Your trainable class does not accept a "
                "`remote_checkpoint_dir` or `sync_function_tpl` argument "
                "in its constructor, but you've passed a "
                "`upload_dir` to your SyncConfig. Without accepting "
                "these parameters and passing them to the base trainable "
                "constructor in the init call, cloud checkpointing is "
                "effectively disabled. To resolve this issue, add the "
                "parameters to your trainable class constructor or "
                "disable cloud checkpointing by setting `upload_dir=None`."
            ) from bind_error

    with self._change_working_directory(trial):
        new_runner = actor_factory.remote(**init_kwargs)
    return new_runner