def _clone_from(cls, trial: Trial, allocation, restore_path=None) -> "AdaptDLTrial":
    """Build a new ``AdaptDLTrial`` from *trial*, resized to *allocation*.

    The allocation is converted to a placement-group factory whose replica
    count sizes the new trainable. When *trial* is already an
    ``AdaptDLTrial`` this is a rescale: the rescale counter is bumped and
    the original creation timestamp is carried over; otherwise a fresh
    AdaptDL trial is created from scratch.

    NOTE(review): assumes ``allocation_to_pgf``/``pgf_to_num_replicas``
    agree on replica counting — defined elsewhere in this module.
    """
    pgf = allocation_to_pgf(allocation)
    replicas = pgf_to_num_replicas(pgf)
    assert replicas > 0

    if isinstance(trial, AdaptDLTrial):
        # Rescaling an existing AdaptDL trial: bump the counter and keep
        # the original creation time so trial age is measured correctly.
        rescale_count = trial.rescale_count + 1
        creation_timestamp = trial.creation_timestamp
    else:
        # First conversion of a plain Trial into an AdaptDL trial.
        rescale_count = 0
        creation_timestamp = datetime.now()

    # A distinct group per rescale keeps worker generations separated.
    trainable = AdaptDLTrainableCreator(
        trial.get_trainable_cls()._function, replicas, group=rescale_count)

    return cls(
        trainable_name=trainable.__name__,
        creation_timestamp=creation_timestamp,
        rescale_count=rescale_count,
        config=trial.config,
        experiment_tag=trial.experiment_tag,
        evaluated_params=trial.evaluated_params,
        stopping_criterion=trial.stopping_criterion,
        trial_id=trial.trial_id,
        restore_path=restore_path,
        local_dir="/tmp",  # TODO: Decide a proper way
        placement_group_factory=pgf)
def _setup_remote_runner(self, trial: Trial, res: Resources,
                         reuse_allowed: bool) -> Any:
    """Create and return the remote actor that will run *trial*.

    Wraps the trial's trainable class with ``ray.remote`` using the
    resource spec in *res* (GPUs forced to 0 when ``self._fake_gpus`` is
    set), and instantiates it with a deep-copied config carrying
    ``TRIAL_INFO`` plus a no-op logger creator, since TrialRunner handles
    trial logging centrally. ``reuse_allowed`` is accepted for interface
    compatibility but not consulted here.
    """
    trial.init_logger()
    # We checkpoint metadata here to try mitigating logdir duplication
    self.try_checkpoint_metadata(trial)
    remote_logdir = trial.logdir

    trainable_cls = trial.get_trainable_cls()
    cls = ray.remote(
        num_cpus=res.cpu,
        num_gpus=0 if self._fake_gpus else res.gpu,
        memory=res.memory,
        object_store_memory=res.object_store_memory,
        resources=res.custom_resources,
    )(trainable_cls)

    def logger_creator(config):
        # Set the working dir in the remote process, for user file writes.
        os.makedirs(remote_logdir, exist_ok=True)
        if ray.worker._mode() != ray.worker.LOCAL_MODE:
            os.chdir(remote_logdir)
        return NoopLogger(config, remote_logdir)

    # The runner's placement is unknown until the first result reports
    # back, so clear the trial's location for now.
    trial.set_location(Location())
    logger.debug("Trial %s: Setting up new remote runner.", trial)

    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    trial_config = copy.deepcopy(trial.config)
    trial_config[TRIAL_INFO] = TrialInfo(trial)

    kwargs = {
        "config": trial_config,
        "logger_creator": logger_creator,
    }
    if issubclass(trainable_cls, DurableTrainable):
        kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir

    with _change_working_directory(trial):
        return cls.remote(**kwargs)