Example #1
    @classmethod
    def _clone_from(cls,
                    trial: Trial,
                    allocation,
                    restore_path=None) -> "AdaptDLTrial":
        trainable_cls = trial.get_trainable_cls()
        pgf = allocation_to_pgf(allocation)
        num_workers = pgf_to_num_replicas(pgf)
        assert num_workers > 0
        if isinstance(trial, AdaptDLTrial):
            # Cloning from existing AdaptDLTrial
            rescale_count = trial.rescale_count + 1
            # Carry over the creation_timestamp
            creation_timestamp = trial.creation_timestamp
        else:
            creation_timestamp = datetime.now()
            rescale_count = 0

        adaptdl_trainable_cls = AdaptDLTrainableCreator(
            trainable_cls._function, num_workers, group=rescale_count)
        return cls(
            trainable_name=adaptdl_trainable_cls.__name__,
            creation_timestamp=creation_timestamp,
            rescale_count=rescale_count,
            config=trial.config,
            experiment_tag=trial.experiment_tag,
            evaluated_params=trial.evaluated_params,
            stopping_criterion=trial.stopping_criterion,
            trial_id=trial.trial_id,
            restore_path=restore_path,
            local_dir="/tmp",  # TODO: Decide a proper way
            placement_group_factory=pgf)
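The method above relies on two helpers, allocation_to_pgf and pgf_to_num_replicas, which turn an AdaptDL allocation (one entry per replica) into a Ray Tune placement group factory and recover the worker count from it. Below is a minimal, self-contained sketch of that mapping under the assumption that one 1-CPU bundle is requested per replica; it uses plain bundle lists instead of the real PlacementGroupFactory, so it illustrates the shape of the mapping rather than adaptdl's actual implementation.

    from typing import Dict, List

    def allocation_to_bundles(allocation: List[str]) -> List[Dict[str, float]]:
        # One 1-CPU bundle per allocated replica. The real allocation_to_pgf
        # wraps such bundles in a ray.tune PlacementGroupFactory and may also
        # request GPUs on the nodes named in the allocation.
        return [{"CPU": 1.0} for _ in allocation]

    def bundles_to_num_replicas(bundles: List[Dict[str, float]]) -> int:
        # Mirrors pgf_to_num_replicas: the worker count is the bundle count,
        # which is why a non-empty allocation satisfies the num_workers > 0
        # assertion above.
        return len(bundles)

    allocation = ["node-a", "node-a", "node-b"]  # three replicas on two nodes
    assert bundles_to_num_replicas(allocation_to_bundles(allocation)) == 3

Note how the clone reuses the original trial_id, config, and creation_timestamp while incrementing rescale_count, which also serves as the group id for the freshly created trainable class.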
Example #2
    def _setup_remote_runner(self, trial: Trial, res: Resources,
                             reuse_allowed: bool) -> Any:
        trial.init_logger()
        # We checkpoint metadata here to try mitigating logdir duplication
        self.try_checkpoint_metadata(trial)
        remote_logdir = trial.logdir

        cls = ray.remote(
            num_cpus=res.cpu,
            num_gpus=0 if self._fake_gpus else res.gpu,
            memory=res.memory,
            object_store_memory=res.object_store_memory,
            resources=res.custom_resources,
        )(trial.get_trainable_cls())

        def logger_creator(config):
            # Set the working dir in the remote process, for user file writes
            os.makedirs(remote_logdir, exist_ok=True)
            if ray.worker._mode() != ray.worker.LOCAL_MODE:
                os.chdir(remote_logdir)
            return NoopLogger(config, remote_logdir)

        # Clear the Trial's location (to be updated later on result)
        # since we don't know where the remote runner is placed.
        trial.set_location(Location())
        logger.debug("Trial %s: Setting up new remote runner.", trial)
        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        trial_config = copy.deepcopy(trial.config)
        trial_config[TRIAL_INFO] = TrialInfo(trial)
        kwargs = {
            "config": trial_config,
            "logger_creator": logger_creator,
        }
        if issubclass(trial.get_trainable_cls(), DurableTrainable):
            kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir

        with _change_working_directory(trial):
            return cls.remote(**kwargs)
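The core pattern here is wrapping the trial's trainable class with ray.remote(...) so that cls.remote(**kwargs) starts the runner as an actor with the requested resources, while the injected logger_creator makes logging in the remote process a no-op (Tune logs results centrally on the driver). Below is a minimal, runnable sketch of that wrapping pattern; the dummy trainable and the resource numbers are illustrative and not taken from Tune.

    import ray

    class DummyTrainable:
        # Stand-in for the class returned by trial.get_trainable_cls().
        def __init__(self, config, logger_creator=None):
            self._config = config
            self._logger = logger_creator(config) if logger_creator else None

        def step(self):
            return {"value": self._config["x"] * 2}

    if __name__ == "__main__":
        ray.init()
        # ray.remote(...) applied to a class returns an actor class; calling
        # .remote() constructs the actor subject to the declared resource
        # request, just like cls.remote(**kwargs) above.
        RemoteTrainable = ray.remote(num_cpus=1, num_gpus=0)(DummyTrainable)
        runner = RemoteTrainable.remote(config={"x": 21})
        print(ray.get(runner.step.remote()))  # -> {'value': 42}
        ray.shutdown()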