def _restore(self, trial, checkpoint=None, block=False) -> Optional[RunningJob]:
    """Restores training state from a given model checkpoint.

    Args:
        trial (Trial): The trial to be restored.
        checkpoint (Checkpoint): The checkpoint to restore from. If None,
            the most recent PERSISTENT checkpoint is used. Defaults to None.
        block (bool): Whether or not to block on restore before returning.

    Raises:
        RuntimeError: This error is raised if no runner is found.
        AbortTrialExecution: This error is raised if the trial is
            ineligible for restoration, given the Tune input arguments.
    """
    if checkpoint is None or checkpoint.value is None:
        checkpoint = trial.checkpoint
    if checkpoint.value is None:
        return
    if trial.runner is None:
        raise RuntimeError(
            "Trial {}: Unable to restore - no runner found.".format(trial))
    value = checkpoint.value
    if checkpoint.storage == Checkpoint.MEMORY:
        logger.debug("Trial %s: Attempting restore from object", trial)
        # Note that we don't store the remote since in-memory checkpoints
        # don't guarantee fault tolerance and don't need to be waited on.
        with _change_working_directory(trial):
            trial.runner.restore_from_object.remote(value)
    else:
        logger.debug("Trial %s: Attempting restore from %s", trial, value)
        if issubclass(trial.get_trainable_cls(), DurableTrainable):
            with _change_working_directory(trial):
                remote = trial.runner.restore.remote(value)
        elif trial.sync_on_checkpoint:
            # This provides FT backwards compatibility in the
            # case where a DurableTrainable is not provided.
            logger.warning("Trial %s: Reading checkpoint into memory.",
                           trial)
            data_dict = TrainableUtil.pickle_checkpoint(value)
            with _change_working_directory(trial):
                remote = trial.runner.restore_from_object.remote(data_dict)
        else:
            raise AbortTrialExecution(
                "Pass in `sync_on_checkpoint=True` for driver-based trial "
                "restoration. Pass in an `upload_dir` and a Trainable "
                "extending `DurableTrainable` for remote storage-based "
                "restoration")

        if block:
            ray.get(remote)
        else:
            trial.restoring_from = checkpoint
            running_job = RunningJob(trial, remote)
            self.jobs_running[remote] = running_job
            return running_job
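# The `_change_working_directory(trial)` context manager used above is not
# part of this excerpt. A minimal sketch of the pattern it appears to
# implement, assuming it only switches into the trial's logdir while Ray runs
# in local mode (the names and exact behavior here are an assumption, not the
# verbatim Tune implementation):
import contextlib
import os

import ray


@contextlib.contextmanager
def _change_working_directory(trial):
    """Chdir into the trial logdir in local mode only.

    In local mode the "remote" call executes in the driver process, so user
    file writes should land in the trial's logdir. In cluster mode the remote
    actor sets its own working directory, so this becomes a no-op.
    """
    if ray.worker._mode() == ray.worker.LOCAL_MODE:
        old_dir = os.getcwd()
        try:
            os.chdir(trial.logdir)
            yield
        finally:
            os.chdir(old_dir)
    else:
        yield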
def _setup_remote_runner(self, trial, reuse_allowed):
    trial.init_logger()
    # We checkpoint metadata here to try mitigating logdir duplication
    self.try_checkpoint_metadata(trial)
    remote_logdir = trial.logdir

    if (self._reuse_actors and reuse_allowed
            and self._cached_actor is not None):
        logger.debug("Trial %s: Reusing cached runner %s", trial,
                     self._cached_actor)
        existing_runner = self._cached_actor
        self._cached_actor = None
        trial.set_runner(existing_runner)
        if not self.reset_trial(trial, trial.config, trial.experiment_tag):
            raise AbortTrialExecution(
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True.")
        return existing_runner

    if self._cached_actor:
        logger.debug("Cannot reuse cached runner {} for new trial".format(
            self._cached_actor))
        with self._change_working_directory(trial):
            self._cached_actor.stop.remote()
            self._cached_actor.__ray_terminate__.remote()
        self._cached_actor = None

    cls = ray.remote(
        num_cpus=trial.resources.cpu,
        num_gpus=trial.resources.gpu,
        memory=trial.resources.memory,
        object_store_memory=trial.resources.object_store_memory,
        resources=trial.resources.custom_resources)(
            trial.get_trainable_cls())

    def logger_creator(config):
        # Set the working dir in the remote process, for user file writes
        os.makedirs(remote_logdir, exist_ok=True)
        if not ray.worker._mode() == ray.worker.LOCAL_MODE:
            os.chdir(remote_logdir)
        return NoopLogger(config, remote_logdir)

    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    trial_config = copy.deepcopy(trial.config)
    trial_config[TRIAL_INFO] = TrialInfo(trial)
    kwargs = {
        "config": trial_config,
        "logger_creator": logger_creator,
    }

    if issubclass(trial.get_trainable_cls(), DurableTrainable):
        kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir

    with self._change_working_directory(trial):
        return cls.remote(**kwargs)
def _setup_remote_runner(self, trial, reuse_allowed):
    trial.init_logger()
    # We checkpoint metadata here to try mitigating logdir duplication
    self.try_checkpoint_metadata(trial)

    logger_creator = partial(noop_logger_creator, logdir=trial.logdir)

    if (self._reuse_actors and reuse_allowed
            and self._cached_actor is not None):
        logger.debug("Trial %s: Reusing cached runner %s", trial,
                     self._cached_actor)
        existing_runner = self._cached_actor
        self._cached_actor = None
        trial.set_runner(existing_runner)
        if not self.reset_trial(trial, trial.config, trial.experiment_tag,
                                logger_creator):
            raise AbortTrialExecution(
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True.")
        return existing_runner

    if self._cached_actor:
        logger.debug("Cannot reuse cached runner {} for new trial".format(
            self._cached_actor))
        with self._change_working_directory(trial):
            self._trial_cleanup.add(trial, actor=self._cached_actor)
        self._cached_actor = None

    _actor_cls = _class_cache.get(trial.get_trainable_cls())
    full_actor_class = _actor_cls.options(
        num_cpus=trial.resources.cpu,
        num_gpus=trial.resources.gpu,
        memory=trial.resources.memory or None,
        object_store_memory=trial.resources.object_store_memory or None,
        resources=trial.resources.custom_resources)
    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    trial_config = copy.deepcopy(trial.config)
    trial_config[TRIAL_INFO] = TrialInfo(trial)

    stdout_file, stderr_file = trial.log_to_file
    trial_config[STDOUT_FILE] = stdout_file
    trial_config[STDERR_FILE] = stderr_file

    kwargs = {
        "config": trial_config,
        "logger_creator": logger_creator,
    }

    if issubclass(trial.get_trainable_cls(), DurableTrainable):
        kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir

    with self._change_working_directory(trial):
        return full_actor_class.remote(**kwargs)
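# The variant above replaces the inline `logger_creator` closure with
# `partial(noop_logger_creator, logdir=trial.logdir)`. A minimal sketch of
# that module-level helper, assuming it mirrors the closure body shown in the
# earlier variant of this method (the exact Tune helper and its import path
# may differ):
import os

import ray
from ray.tune.logger import NoopLogger  # assumed import location


def noop_logger_creator(config, logdir):
    # Set the working dir in the remote process, for user file writes.
    os.makedirs(logdir, exist_ok=True)
    if not ray.worker._mode() == ray.worker.LOCAL_MODE:
        os.chdir(logdir)
    return NoopLogger(config, logdir)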
def restore(self, trial, checkpoint=None, block=False) -> None:
    """Restores training state from a given model checkpoint.

    Args:
        trial (Trial): The trial to be restored.
        checkpoint (Checkpoint): The checkpoint to restore from. If None,
            the most recent PERSISTENT checkpoint is used. Defaults to None.
        block (bool): Whether or not to block on restore before returning.

    Raises:
        RuntimeError: This error is raised if no runner is found.
        AbortTrialExecution: This error is raised if the trial is
            ineligible for restoration, given the Tune input arguments.
    """
    if checkpoint is None or checkpoint.value is None:
        checkpoint = trial.checkpoint
    if checkpoint.value is None:
        return
    if trial.runner is None:
        raise RuntimeError(
            "Trial {}: Unable to restore - no runner found.".format(trial))
    value = checkpoint.value
    if checkpoint.storage == Checkpoint.MEMORY:
        logger.debug("Trial %s: Attempting restore from object", trial)
        # Note that we don't store the remote since in-memory checkpoints
        # don't guarantee fault tolerance and don't need to be waited on.
        with self._change_working_directory(trial):
            trial.runner.restore_from_object.remote(value)
    else:
        logger.debug("Trial %s: Attempting restore from %s", trial, value)
        if trial.uses_cloud_checkpointing or not trial.sync_on_checkpoint:
            with self._change_working_directory(trial):
                remote = trial.runner.restore.remote(value)
        elif trial.sync_on_checkpoint:
            # This provides FT backwards compatibility in the
            # case where no cloud checkpoints are provided.
            logger.debug("Trial %s: Reading checkpoint into memory", trial)
            obj = TrainableUtil.checkpoint_to_object(value)
            with self._change_working_directory(trial):
                remote = trial.runner.restore_from_object.remote(obj)
        else:
            raise AbortTrialExecution(
                "Pass in `sync_on_checkpoint=True` for driver-based trial "
                "restoration. Pass in an `upload_dir` for remote "
                "storage-based restoration")

        if block:
            ray.get(remote)
        else:
            self._running[remote] = trial
            trial.restoring_from = checkpoint
def restore(self, trial: Trial) -> None:
    """Restores training state from a given model checkpoint.

    Args:
        trial: The trial to be restored.

    Raises:
        RuntimeError: This error is raised if no runner is found.
        AbortTrialExecution: This error is raised if the trial is
            ineligible for restoration, given the Tune input arguments.
    """
    checkpoint = trial.checkpoint
    if checkpoint.value is None:
        return
    if trial.runner is None:
        raise RuntimeError(
            "Trial {}: Unable to restore - no runner found.".format(trial)
        )
    value = checkpoint.value
    node_ip = checkpoint.node_ip
    if checkpoint.storage == _TuneCheckpoint.MEMORY:
        logger.debug("Trial %s: Attempting restore from object", trial)
        # Note that we don't store the remote since in-memory checkpoints
        # don't guarantee fault tolerance and don't need to be waited on.
        with self._change_working_directory(trial):
            trial.runner.restore_from_object.remote(value)
    else:
        logger.debug("Trial %s: Attempting restore from %s", trial, value)
        if trial.uses_cloud_checkpointing or not trial.sync_on_checkpoint:
            # If using cloud checkpointing, trial will get cp from cloud.
            # If not syncing to driver, assume it has access to the cp
            # on the local fs.
            with self._change_working_directory(trial):
                remote = trial.runner.restore.remote(value, node_ip)
        elif trial.sync_on_checkpoint:
            # This provides FT backwards compatibility in the
            # case where no cloud checkpoints are provided.
            logger.debug("Trial %s: Reading checkpoint into memory", trial)
            obj = TrainableUtil.checkpoint_to_object(value)
            with self._change_working_directory(trial):
                remote = trial.runner.restore_from_object.remote(obj)
        else:
            raise AbortTrialExecution(
                "Pass in `sync_on_checkpoint=True` for driver-based trial "
                "restoration. Pass in an `upload_dir` for remote "
                "storage-based restoration"
            )

        self._futures[remote] = (ExecutorEventType.RESTORING_RESULT, trial)
        trial.restoring_from = checkpoint
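# The restore paths above dispatch on a small checkpoint object with a
# `storage` tag and a `value` payload: an in-memory object (ref) for MEMORY
# checkpoints, a checkpoint directory path for PERSISTENT ones, plus the IP
# of the node that wrote it in the newest variant. A hypothetical minimal
# shape of that object, inferred only from how it is used here (the real Tune
# checkpoint class carries more state, e.g. the result that produced it):
class SketchCheckpoint:
    MEMORY = "memory"
    PERSISTENT = "persistent"

    def __init__(self, storage, value, node_ip=None):
        self.storage = storage    # MEMORY or PERSISTENT
        self.value = value        # object (ref) or checkpoint dir path
        self.node_ip = node_ip    # node that wrote a persistent checkpoint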
def _setup_remote_runner(self, trial, reuse_allowed):
    trial.init_logger()
    # We checkpoint metadata here to try mitigating logdir duplication
    self.try_checkpoint_metadata(trial)
    remote_logdir = trial.logdir

    if (self._reuse_actors and reuse_allowed
            and self._cached_actor is not None):
        logger.debug("Reusing cached runner {} for {}".format(
            self._cached_actor, trial.trial_id))
        existing_runner = self._cached_actor
        self._cached_actor = None
        trial.runner = existing_runner
        if not self.reset_trial(trial, trial.config, trial.experiment_tag):
            raise AbortTrialExecution(
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True.")
        return existing_runner

    if self._cached_actor:
        logger.debug("Cannot reuse cached runner {} for new trial".format(
            self._cached_actor))
        self._cached_actor.stop.remote()
        self._cached_actor.__ray_terminate__.remote()
        self._cached_actor = None

    cls = ray.remote(
        num_cpus=trial.resources.cpu,
        num_gpus=trial.resources.gpu,
        memory=trial.resources.memory,
        object_store_memory=trial.resources.object_store_memory,
        resources=trial.resources.custom_resources)(
            trial.get_trainable_cls())

    def logger_creator(config):
        # Set the working dir in the remote process, for user file writes
        if not os.path.exists(remote_logdir):
            os.makedirs(remote_logdir)
        if not ray.worker._mode() == ray.worker.LOCAL_MODE:
            os.chdir(remote_logdir)
        return NoopLogger(config, remote_logdir)

    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(Location())
    logger.info("Trial %s: Setting up new remote runner.", trial)
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    return cls.remote(config=trial.config, logger_creator=logger_creator)
def _setup_runner(self, trial, reuse_allowed):
    if (self._reuse_actors and reuse_allowed
            and self._cached_actor is not None):
        logger.debug("Reusing cached runner {} for {}".format(
            self._cached_actor, trial.trial_id))
        existing_runner = self._cached_actor
        self._cached_actor = None
    else:
        if self._cached_actor:
            logger.debug(
                "Cannot reuse cached runner {} for new trial".format(
                    self._cached_actor))
            self._cached_actor.stop.remote()
            self._cached_actor.__ray_terminate__.remote()
            self._cached_actor = None

        existing_runner = None
        cls = ray.remote(
            num_cpus=trial.resources.cpu,
            num_gpus=trial.resources.gpu,
            resources=trial.resources.custom_resources)(
                trial._get_trainable_cls())

    trial.init_logger()
    # We checkpoint metadata here to try mitigating logdir duplication
    self.try_checkpoint_metadata(trial)
    remote_logdir = trial.logdir

    if existing_runner:
        trial.runner = existing_runner
        if not self.reset_trial(trial, trial.config, trial.experiment_tag):
            raise AbortTrialExecution(
                "Trial runner reuse requires reset_trial() to be "
                "implemented and return True.")
        return existing_runner

    def logger_creator(config):
        # Set the working dir in the remote process, for user file writes
        if not os.path.exists(remote_logdir):
            os.makedirs(remote_logdir)
        os.chdir(remote_logdir)
        return NoopLogger(config, remote_logdir)

    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    return cls.remote(config=trial.config, logger_creator=logger_creator)
def restore(self, trial, checkpoint=None):
    """Restores training state from a given model checkpoint.

    Raises:
        RuntimeError: This error is raised if no runner is found.
        AbortTrialExecution: This error is raised if the trial is
            ineligible for restoration, given the Tune input arguments.
    """
    if checkpoint is None or checkpoint.value is None:
        checkpoint = trial.checkpoint
    if checkpoint.value is None:
        return
    if trial.runner is None:
        raise RuntimeError(
            "Trial {}: Unable to restore - no runner found.".format(trial))
    value = checkpoint.value
    if checkpoint.storage == Checkpoint.MEMORY:
        logger.debug("Trial %s: Attempting restore from object", trial)
        # Note that we don't store the remote since in-memory checkpoints
        # don't guarantee fault tolerance and don't need to be waited on.
        trial.runner.restore_from_object.remote(value)
    else:
        logger.debug("Trial %s: Attempting restore from %s", trial, value)
        if issubclass(trial.get_trainable_cls(), DurableTrainable):
            remote = trial.runner.restore.remote(value)
        elif trial.sync_on_checkpoint:
            # This provides FT backwards compatibility in the
            # case where a DurableTrainable is not provided.
            logger.warning("Trial %s: Reading checkpoint into memory.",
                           trial)
            data_dict = TrainableUtil.pickle_checkpoint(value)
            remote = trial.runner.restore_from_object.remote(data_dict)
        else:
            raise AbortTrialExecution(
                "Pass in `sync_on_checkpoint=True` for driver-based trial "
                "restoration. Pass in an `upload_dir` and a Trainable "
                "extending `DurableTrainable` for remote storage-based "
                "restoration")
        self._running[remote] = trial
        trial.restoring_from = checkpoint
def _setup_remote_runner(self, trial):
    trial.init_logdir()
    # We checkpoint metadata here to try mitigating logdir duplication
    self._trials_to_cache.add(trial)
    logger_creator = partial(noop_logger_creator, logdir=trial.logdir)

    if len(self._cached_actor_pg) > 0:
        assert self._reuse_actors
        existing_runner, pg = self._cached_actor_pg.popleft()
        logger.debug(f"Trial {trial}: Reusing cached runner "
                     f"{existing_runner}")

        trial.set_runner(existing_runner)
        if pg:
            self._pg_manager.assign_cached_pg(pg, trial)

        if not self.reset_trial(trial, trial.config, trial.experiment_tag,
                                logger_creator):
            raise AbortTrialExecution(
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True.")
        return existing_runner

    trainable_cls = trial.get_trainable_cls()
    if not trainable_cls:
        raise AbortTrialExecution(
            f"Invalid trainable: {trial.trainable_name}. If you passed "
            f"a string, make sure the trainable was registered before.")
    _actor_cls = _class_cache.get(trainable_cls)

    if not self._pg_manager.has_ready(trial):
        return None

    full_actor_class = self._pg_manager.get_full_actor_cls(
        trial, _actor_cls)
    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    trial_config = copy.deepcopy(trial.config)
    trial_config[TRIAL_INFO] = TrialInfo(trial)

    stdout_file, stderr_file = trial.log_to_file
    trial_config[STDOUT_FILE] = stdout_file
    trial_config[STDERR_FILE] = stderr_file

    kwargs = {
        "config": trial_config,
        "logger_creator": logger_creator,
    }

    if trial.uses_cloud_checkpointing:
        # We keep these kwargs separate for backwards compatibility
        # with trainables that don't provide these keyword arguments
        kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir
        kwargs["sync_function_tpl"] = trial.sync_function_tpl

        # Throw a meaningful error if trainable does not use the
        # new API
        sig = inspect.signature(trial.get_trainable_cls())
        try:
            sig.bind_partial(**kwargs)
        except Exception as e:
            raise RuntimeError(
                "Your trainable class does not accept a "
                "`remote_checkpoint_dir` or `sync_function_tpl` argument "
                "in its constructor, but you've passed a "
                "`upload_dir` to your SyncConfig. Without accepting "
                "these parameters and passing them to the base trainable "
                "constructor in the init call, cloud checkpointing is "
                "effectively disabled. To resolve this issue, add the "
                "parameters to your trainable class constructor or "
                "disable cloud checkpointing by setting `upload_dir=None`."
            ) from e

    with self._change_working_directory(trial):
        return full_actor_class.remote(**kwargs)
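# The constructor check above relies on `inspect.signature().bind_partial()`.
# A self-contained illustration of that standard-library pattern, using
# hypothetical classes (not part of Tune), showing how an unexpected keyword
# argument is detected before any actor is created:
import inspect


class OldTrainable:
    def __init__(self, config=None, logger_creator=None):
        pass


class CloudAwareTrainable:
    def __init__(self, config=None, logger_creator=None,
                 remote_checkpoint_dir=None, sync_function_tpl=None):
        pass


def accepts_cloud_kwargs(trainable_cls) -> bool:
    """Return True if the class constructor can bind the cloud kwargs."""
    kwargs = {
        "remote_checkpoint_dir": "s3://bucket/exp",
        "sync_function_tpl": None,
    }
    try:
        inspect.signature(trainable_cls).bind_partial(**kwargs)
        return True
    except TypeError:
        return False


assert not accepts_cloud_kwargs(OldTrainable)
assert accepts_cloud_kwargs(CloudAwareTrainable)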
def _setup_remote_runner(self, trial):
    trial.init_logdir()
    # We checkpoint metadata here to try mitigating logdir duplication
    self.try_checkpoint_metadata(trial)
    logger_creator = partial(noop_logger_creator, logdir=trial.logdir)

    if self._reuse_actors and self._cached_actor_pg[0] is not None:
        logger.debug(f"Trial {trial}: Reusing cached runner "
                     f"{self._cached_actor_pg[0]}")
        existing_runner, pg = self._cached_actor_pg
        self._cached_actor_pg = (None, None)

        trial.set_runner(existing_runner)
        if pg and trial.uses_placement_groups:
            self._pg_manager.assign_cached_pg(pg, trial)

        if not self.reset_trial(trial, trial.config, trial.experiment_tag,
                                logger_creator):
            raise AbortTrialExecution(
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True.")
        return existing_runner

    if self._cached_actor_pg[0]:
        logger.debug("Cannot reuse cached runner {} for new trial".format(
            self._cached_actor_pg[0]))
        existing_runner, pg = self._cached_actor_pg

        if pg:
            self._pg_manager.return_or_clean_cached_pg(pg)

        with self._change_working_directory(trial):
            self._trial_cleanup.add(trial, actor=existing_runner)
        self._cached_actor_pg = (None, None)

    trainable_cls = trial.get_trainable_cls()
    if not trainable_cls:
        raise AbortTrialExecution(
            f"Invalid trainable: {trial.trainable_name}. If you passed "
            f"a string, make sure the trainable was registered before.")
    _actor_cls = _class_cache.get(trainable_cls)

    if trial.uses_placement_groups:
        if not self._pg_manager.has_ready(trial, update=True):
            if trial not in self._staged_trials:
                if self._pg_manager.stage_trial_pg(trial):
                    self._staged_trials.add(trial)
                    self._just_staged_trials.add(trial)

            just_staged = trial in self._just_staged_trials

            # This part of the code is mostly here for testing
            # purposes. If self._wait_for_pg is set, we will wait here
            # for that many seconds until the placement group is ready.
            # This ensures that the trial can be started right away and
            # not just in the next step() of the trial runner.
            # We only do this if we have reason to believe that resources
            # will be ready, soon, i.e. when a) we just staged the PG,
            # b) another trial just exited, freeing resources, or c)
            # when there are no currently running trials.
            if self._wait_for_pg is not None and (
                    just_staged or self._trial_just_finished_before
                    or not self.get_running_trials()):
                logger.debug(
                    f"Waiting up to {self._wait_for_pg} seconds for "
                    f"placement group of trial {trial} to become ready.")
                wait_end = time.monotonic() + self._wait_for_pg
                while time.monotonic() < wait_end:
                    self._pg_manager.update_status()
                    if self._pg_manager.has_ready(trial):
                        break
                    time.sleep(0.1)
            else:
                return None

        if not self._pg_manager.has_ready(trial):
            # PG may have become ready during waiting period
            return None

        full_actor_class = self._pg_manager.get_full_actor_cls(
            trial, _actor_cls)
    else:
        full_actor_class = _actor_cls.options(
            num_cpus=trial.resources.cpu,
            num_gpus=trial.resources.gpu,
            memory=trial.resources.memory or None,
            object_store_memory=trial.resources.object_store_memory
            or None,
            resources=trial.resources.custom_resources)
    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    trial_config = copy.deepcopy(trial.config)
    trial_config[TRIAL_INFO] = TrialInfo(trial)

    stdout_file, stderr_file = trial.log_to_file
    trial_config[STDOUT_FILE] = stdout_file
    trial_config[STDERR_FILE] = stderr_file

    kwargs = {
        "config": trial_config,
        "logger_creator": logger_creator,
    }

    if issubclass(trial.get_trainable_cls(), DurableTrainable):
        kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir

    with self._change_working_directory(trial):
        return full_actor_class.remote(**kwargs)
def _setup_remote_runner(self, trial):
    trial.init_logdir()
    # We checkpoint metadata here to try mitigating logdir duplication
    self._trials_to_cache.add(trial)
    logger_creator = partial(noop_logger_creator, logdir=trial.logdir)

    if len(self._cached_actor_pg) > 0:
        assert self._reuse_actors
        existing_runner, pg = self._cached_actor_pg.popleft()
        logger.debug(f"Trial {trial}: Reusing cached runner "
                     f"{existing_runner}")

        trial.set_runner(existing_runner)
        if pg:
            self._pg_manager.assign_cached_pg(pg, trial)

        if not self.reset_trial(trial, trial.config, trial.experiment_tag,
                                logger_creator):
            raise AbortTrialExecution(
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True.")
        return existing_runner

    trainable_cls = trial.get_trainable_cls()
    if not trainable_cls:
        raise AbortTrialExecution(
            f"Invalid trainable: {trial.trainable_name}. If you passed "
            f"a string, make sure the trainable was registered before.")
    _actor_cls = _class_cache.get(trainable_cls)

    if not self._pg_manager.has_ready(trial, update=True):
        if trial not in self._staged_trials:
            if self._pg_manager.stage_trial_pg(trial):
                self._staged_trials.add(trial)
                self._just_staged_trials.add(trial)

        just_staged = trial in self._just_staged_trials

        # This part of the code is mostly here for testing
        # purposes. If self._wait_for_pg is set, we will wait here
        # for that many seconds until the placement group is ready.
        # This ensures that the trial can be started right away and
        # not just in the next step() of the trial runner.
        # We only do this if we have reason to believe that resources
        # will be ready, soon, i.e. when a) we just staged the PG,
        # b) another trial just exited, freeing resources, or c)
        # when there are no currently running trials.
        if self._wait_for_pg is not None and (
                just_staged or self._trial_just_finished_before
                or not self.get_running_trials()):
            logger.debug(
                f"Waiting up to {self._wait_for_pg} seconds for "
                f"placement group of trial {trial} to become ready.")
            wait_end = time.monotonic() + self._wait_for_pg
            while time.monotonic() < wait_end:
                self._pg_manager.update_status()
                if self._pg_manager.has_ready(trial):
                    break
                time.sleep(0.1)
        else:
            return None

    if not self._pg_manager.has_ready(trial):
        # PG may have become ready during waiting period
        return None

    full_actor_class = self._pg_manager.get_full_actor_cls(
        trial, _actor_cls)
    # Clear the Trial's location (to be updated later on result)
    # since we don't know where the remote runner is placed.
    trial.set_location(Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    trial_config = copy.deepcopy(trial.config)
    trial_config[TRIAL_INFO] = TrialInfo(trial)

    stdout_file, stderr_file = trial.log_to_file
    trial_config[STDOUT_FILE] = stdout_file
    trial_config[STDERR_FILE] = stderr_file

    kwargs = {
        "config": trial_config,
        "logger_creator": logger_creator,
    }

    if trial.uses_cloud_checkpointing:
        # We keep these kwargs separate for backwards compatibility
        # with trainables that don't provide these keyword arguments
        kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir
        kwargs["sync_function_tpl"] = trial.sync_function_tpl

        # Throw a meaningful error if trainable does not use the
        # new API
        sig = inspect.signature(trial.get_trainable_cls())
        try:
            sig.bind_partial(**kwargs)
        except Exception as e:
            raise RuntimeError(
                "Your trainable class does not accept a "
                "`remote_checkpoint_dir` or `sync_function_tpl` argument "
                "in its constructor, but you've passed a "
                "`upload_dir` to your SyncConfig. Without accepting "
                "these parameters and passing them to the base trainable "
                "constructor in the init call, cloud checkpointing is "
                "effectively disabled. To resolve this issue, add the "
                "parameters to your trainable class constructor or "
                "disable cloud checkpointing by setting `upload_dir=None`."
            ) from e

    with self._change_working_directory(trial):
        return full_actor_class.remote(**kwargs)
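# `_class_cache.get(...)` above returns a `ray.remote`-wrapped version of the
# trainable class, which is then specialized via `.options(...)` or the
# placement-group manager and instantiated via `.remote(**kwargs)`. A minimal
# sketch of such a cache, assuming it only memoizes the wrapping so the same
# trainable class is not wrapped repeatedly (the class name here is
# hypothetical and the real Tune cache may attach additional options):
import ray


class _TrainableClassCache:
    def __init__(self):
        self._cache = {}

    def get(self, trainable_cls):
        """Return a cached ray.remote wrapper for the given trainable class."""
        if trainable_cls not in self._cache:
            self._cache[trainable_cls] = ray.remote(trainable_cls)
        return self._cache[trainable_cls]


_class_cache = _TrainableClassCache()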