Exemplo n.º 1
0
    def pause(self, trial_runner):
        """ Pause the AdaptDLTrial with a checkpoint. We try to remove the PG
        attached to this trial.

        Args:
            trial_runner: the Tune TrialRunner managing this trial; its
                trial_executor's placement-group manager is used to reclaim
                this trial's placement group.
        """
        # The remote runner must exist to snapshot its state.
        assert self.runner is not None
        # get_state.remote() returns an ObjectRef which Ray resolves before
        # invoking save_all_states; the outer ray.get blocks until the
        # serialized checkpoint object is available locally.
        checkpoint_obj = ray.get(
            self.runner.save_all_states.remote(self.runner.get_state.remote()))
        # Serialize to disk
        temp_checkpoint_dir = (FuncCheckpointUtil.mk_temp_checkpoint_dir(
            self.logdir))
        checkpoint_path = TrainableUtil.create_from_pickle(
            checkpoint_obj, temp_checkpoint_dir)

        # Trial will be restored from the checkpoint_path when it's resumed
        self.restore_path = checkpoint_path

        # Clear the allocation. This is a hack to clear the PG associated with
        # the trial. We assign a temporary PG which will get replaced with a
        # real PG once we resume the trial. This is needed because Tune likes
        # to keep the PGs around even for PAUSED trials.
        self.placement_group_factory = PlacementGroupFactory([{"CPU": 0.001}])
        # This forces Tune to garbage-collect unneeded PGs which can then be
        # reused
        trial_runner.trial_executor._pg_manager.\
            reconcile_placement_groups([self])
        logger.debug(f"PAUSING {self} w/ checkpoint at {checkpoint_path}")
Exemplo n.º 2
0
 def restore_from_object(self, obj):
     """Materialize the in-memory checkpoint *obj* to disk and restore from it.

     A fresh temporary checkpoint directory is created under ``self.logdir``
     and remembered on the instance before restoring.
     """
     tmp_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
     self.temp_checkpoint_dir = tmp_dir
     path = TrainableUtil.create_from_pickle(obj, tmp_dir)
     self.restore(path)
Exemplo n.º 3
0
    def restore_from_object(self, obj):
        """Restore training state from an in-memory checkpoint object.

        Removes any stale default checkpoint directory, writes *obj* into a
        freshly created default checkpoint directory, and restores from the
        resulting checkpoint path.

        Args:
            obj: pickled checkpoint payload accepted by
                ``TrainableUtil.create_from_pickle``.
        """
        # Fix: the original called ``os.exists``, which does not exist — the
        # predicate lives in ``os.path.exists`` and the old code raised
        # AttributeError whenever default_checkpoint_dir was set.
        if self.default_checkpoint_dir is not None and os.path.exists(
                self.default_checkpoint_dir):
            shutil.rmtree(self.default_checkpoint_dir)
            logger.debug("Clearing default checkpoint: %s",
                         self.default_checkpoint_dir)

        checkpoint_dir = self.create_default_checkpoint_dir()
        checkpoint_path = TrainableUtil.create_from_pickle(obj, checkpoint_dir)
        self.restore(checkpoint_path)
Exemplo n.º 4
0
 def save_checkpoint(self, checkpoint_dir: str) -> str:
     """Serialize worker state into *checkpoint_dir* and return the checkpoint path."""
     # TODO: optimize if colocated
     state_obj = self.executor.execute_single(
         lambda worker: worker.save_to_object())
     return TrainableUtil.create_from_pickle(state_obj, checkpoint_dir)
Exemplo n.º 5
0
 def save_checkpoint(self, checkpoint_dir):
     """Fetch worker 0's serialized state and write it into *checkpoint_dir*.

     Returns the path of the checkpoint written to disk.
     """
     # TODO: optimize if colocated
     state = ray.get(self.workers[0].save_to_object.remote())
     return TrainableUtil.create_from_pickle(state, checkpoint_dir)