def restore(self, trial, checkpoint=None, block=False):
    """Restores training state from a given model checkpoint.

    In-memory checkpoints are restored fire-and-forget: the remote call is
    issued but neither waited on nor tracked, so ``block`` has no effect
    for them. For all other checkpoints the restore future is either
    awaited (``block=True``) or registered in ``self._running`` and the
    trial is marked as restoring from ``checkpoint``.

    Args:
        trial (Trial): The trial to be restored.
        checkpoint (Checkpoint): The checkpoint to restore from. If None
            (or if its value is None), ``trial.checkpoint`` is used, i.e.
            the most recent PERSISTENT checkpoint. Defaults to None.
        block (bool): Whether or not to block on restore before returning.

    Returns:
        None. Returns early without doing anything when the selected
        checkpoint has no value.

    Raises:
        RuntimeError: This error is raised if no runner is found.
        AbortTrialExecution: This error is raised if the trial is
            ineligible for restoration, given the Tune input arguments.
    """
    if checkpoint is None or checkpoint.value is None:
        checkpoint = trial.checkpoint
    if checkpoint.value is None:
        # Nothing to restore from.
        return
    if trial.runner is None:
        raise RuntimeError(
            "Trial {}: Unable to restore - no runner found.".format(trial))

    value = checkpoint.value
    if checkpoint.storage == Checkpoint.MEMORY:
        logger.debug("Trial %s: Attempting restore from object", trial)
        # Note that we don't store the remote since in-memory checkpoints
        # don't guarantee fault tolerance and don't need to be waited on.
        with self._change_working_directory(trial):
            trial.runner.restore_from_object.remote(value)
        return

    logger.debug("Trial %s: Attempting restore from %s", trial, value)
    if issubclass(trial.get_trainable_cls(),
                  DurableTrainable) or not trial.sync_on_checkpoint:
        with self._change_working_directory(trial):
            remote = trial.runner.restore.remote(value)
    elif trial.sync_on_checkpoint:
        # This provides FT backwards compatibility in the
        # case where a DurableTrainable is not provided.
        logger.debug("Trial %s: Reading checkpoint into memory", trial)
        obj = TrainableUtil.checkpoint_to_object(value)
        with self._change_working_directory(trial):
            remote = trial.runner.restore_from_object.remote(obj)
    else:
        # NOTE(review): with a stable boolean `sync_on_checkpoint` the two
        # conditions above look exhaustive, so this branch appears to be
        # purely defensive - confirm before removing.
        raise AbortTrialExecution(
            "Pass in `sync_on_checkpoint=True` for driver-based trial "
            "restoration. Pass in an `upload_dir` and a Trainable "
            "extending `DurableTrainable` for remote storage-based "
            "restoration")

    if block:
        ray.get(remote)
    else:
        self._running[remote] = trial
        trial.restoring_from = checkpoint
def save_all_states_remote(self, trial_state):
    """Save all of AdaptDL's job state and return it as an in-memory object.

    The state is first written out via ``save_all_states()``, then
    post-processed with ``TrainableUtil.process_checkpoint`` and loaded
    into an object with ``TrainableUtil.checkpoint_to_object``. The
    processed checkpoint path is removed afterwards.

    Args:
        trial_state: Passed through unchanged to
            ``TrainableUtil.process_checkpoint``.

    Returns:
        The object produced by ``TrainableUtil.checkpoint_to_object``.
    """
    checkpoint = save_all_states()
    parent_dir = TrainableUtil.find_checkpoint_dir(checkpoint)
    checkpoint_path = TrainableUtil.process_checkpoint(
        checkpoint, parent_dir, trial_state)
    try:
        return TrainableUtil.checkpoint_to_object(checkpoint_path)
    finally:
        # Done with the directory; remove it even when the conversion
        # above fails so that a failed save does not leak files on disk.
        shutil.rmtree(checkpoint_path)
def load_checkpoint(self, checkpoint_dir: str):
    """Load the checkpoint at ``checkpoint_dir`` and restore through the executor.

    The checkpoint is read into an object, placed in the Ray object store
    with ``ray.put``, and ``restore_from_object`` is invoked through
    ``self.executor.execute``.

    Args:
        checkpoint_dir: Path handed to ``TrainableUtil.checkpoint_to_object``.

    Returns:
        Whatever ``self.executor.execute`` returns.
    """
    state_ref = ray.put(TrainableUtil.checkpoint_to_object(checkpoint_dir))

    # NOTE(review): the object ref is captured by the closure rather than
    # passed as a task argument - confirm that `restore_from_object`
    # receives what it expects.
    def _restore(worker):
        return worker.restore_from_object(state_ref)

    return self.executor.execute(_restore)
def save_to_object(self):
    """Save a checkpoint via ``self.save()`` and return it as an object.

    Returns:
        The result of ``TrainableUtil.checkpoint_to_object`` applied to the
        path returned by ``self.save()``.
    """
    return TrainableUtil.checkpoint_to_object(self.save())
def load_checkpoint(self, checkpoint_dir):
    """Restore every worker in ``self.workers`` from ``checkpoint_dir``.

    The checkpoint is read into an object once and
    ``restore_from_object.remote`` is called on each worker with it. The
    call blocks until all of those remote calls have completed.

    Args:
        checkpoint_dir: Path handed to ``TrainableUtil.checkpoint_to_object``.

    Returns:
        list: One restore result per worker, in ``self.workers`` order.
    """
    checkpoint_obj = TrainableUtil.checkpoint_to_object(checkpoint_dir)
    # `ray.get` accepts a single object ref or a *list* of refs; a bare
    # generator expression is rejected, so build the list explicitly.
    restore_refs = [
        w.restore_from_object.remote(checkpoint_obj) for w in self.workers
    ]
    return ray.get(restore_refs)