def pause(self, trial_runner):
    """Pause the AdaptDLTrial with a checkpoint.

    We try to remove the PG (placement group) attached to this trial so
    Tune can reclaim its resources while the trial is paused.
    """
    assert self.runner is not None
    # Snapshot the remote runner's state. get_state() returns an ObjectRef
    # which Ray resolves before invoking save_all_states on the same actor.
    checkpoint_obj = ray.get(
        self.runner.save_all_states.remote(self.runner.get_state.remote()))
    # Serialize to disk
    temp_checkpoint_dir = (FuncCheckpointUtil.mk_temp_checkpoint_dir(
        self.logdir))
    checkpoint_path = TrainableUtil.create_from_pickle(
        checkpoint_obj, temp_checkpoint_dir)
    # Trial will be restored from the checkpoint_path when it's resumed
    self.restore_path = checkpoint_path
    # Clear the allocation. This is a hack to clear the PG associated with
    # the trial. We assign a temporary (near-zero CPU) PG which will get
    # replaced with a real PG once we resume the trial. This is needed
    # because Tune likes to keep the PGs around even for PAUSED trials.
    self.placement_group_factory = PlacementGroupFactory([{"CPU": 0.001}])
    # This forces Tune to garbage-collect unneeded PGs which can then be
    # reused. NOTE(review): relies on the private _pg_manager API — may
    # break across Ray versions.
    trial_runner.trial_executor._pg_manager.\
        reconcile_placement_groups([self])
    logger.debug(f"PAUSING {self} w/ checkpoint at {checkpoint_path}")
def restore_from_object(self, obj):
    """Restore trainable state from an in-memory checkpoint object.

    The pickled checkpoint *obj* is materialized into a fresh temporary
    checkpoint directory under ``self.logdir``, then restored from disk.
    """
    tmp_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
    self.temp_checkpoint_dir = tmp_dir
    on_disk_path = TrainableUtil.create_from_pickle(obj, tmp_dir)
    self.restore(on_disk_path)
def restore_from_object(self, obj):
    """Restore trainable state from an in-memory checkpoint object.

    Any existing default checkpoint directory is removed first so the new
    checkpoint fully replaces stale on-disk state; *obj* is then written to
    a fresh default checkpoint directory and restored from there.
    """
    # BUG FIX: the original called os.exists(...), which does not exist
    # (AttributeError at runtime) — the correct function is os.path.exists.
    if self.default_checkpoint_dir is not None and os.path.exists(
            self.default_checkpoint_dir):
        # Drop stale checkpoint data before writing the new one.
        shutil.rmtree(self.default_checkpoint_dir)
        logger.debug("Clearing default checkpoint: %s",
                     self.default_checkpoint_dir)
    checkpoint_dir = self.create_default_checkpoint_dir()
    checkpoint_path = TrainableUtil.create_from_pickle(obj, checkpoint_dir)
    self.restore(checkpoint_path)
def save_checkpoint(self, checkpoint_dir: str) -> str:
    """Serialize one worker's state and persist it under *checkpoint_dir*.

    Returns the path of the checkpoint file that was written.
    """
    # TODO: optimize if colocated
    serialized_state = self.executor.execute_single(
        lambda worker: worker.save_to_object())
    return TrainableUtil.create_from_pickle(serialized_state, checkpoint_dir)
def save_checkpoint(self, checkpoint_dir):
    """Checkpoint by fetching the first worker's serialized state.

    The pickled state is written under *checkpoint_dir* and the resulting
    checkpoint file path is returned.
    """
    # TODO: optimize if colocated
    first_worker = self.workers[0]
    state_obj = ray.get(first_worker.save_to_object.remote())
    return TrainableUtil.create_from_pickle(state_obj, checkpoint_dir)