def create_default_env_context(experiment_config: Dict[str, Any]) -> det.EnvContext:
    det_trial_runner_network_interface = constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams={"global_batch_size": 32},
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=det_trial_runner_network_interface,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
def do_test_launch(config: Dict[str, Any], cmd: List[str], mock_popen: mock.MagicMock) -> None:
    mock_proc = mock.MagicMock()
    mock_proc.wait.return_value = 99
    mock_popen.return_value = mock_proc
    assert launch.launch(det.ExperimentConfig(config)) == 99
    mock_popen.assert_called_once_with(cmd)
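# Hypothetical usage sketch (not from the source): individual tests would patch subprocess.Popen
# and delegate to do_test_launch with a config and the exact command they expect launch.launch
# to build for it.
#
# @mock.patch("subprocess.Popen")
# def test_launch_example(mock_popen: mock.MagicMock) -> None:
#     config = {"entrypoint": "model_def:MyTrial"}  # hypothetical minimal config
#     expected_cmd = ["placeholder", "launch", "cmd"]  # must match what launch.launch builds
#     do_test_launch(config, expected_cmd, mock_popen)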
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Create a trial instance from a Trial class definition. This can be a useful utility for
    debugging your trial logic in any development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir: The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config: An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal default is used.
        hparams: An optional dictionary of hyperparameter values to use for the trial.
    """
    determined_common.set_logger(
        util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    )
    env, rendezvous_info, hvd_config = det._make_local_execution_env(False, config, hparams)
    trial_context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(trial_context)
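# Hypothetical usage sketch (MyTrial is a user-defined det.Trial subclass, not part of this
# source):
#
# trial = create_trial_instance(
#     MyTrial,
#     checkpoint_dir="/tmp/checkpoints",
#     hparams={"global_batch_size": 32},
# )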
def __init__(
    self,
    master_addr: str,
    master_port: int,
    use_tls: bool,
    master_cert_file: Optional[str],
    master_cert_name: Optional[str],
    container_id: str,
    experiment_config: Dict[str, Any],
    hparams: Dict[str, Any],
    initial_workload: workload.Workload,
    latest_checkpoint: Optional[Dict[str, Any]],
    use_gpu: bool,
    container_gpus: List[str],
    slot_ids: List[int],
    debug: bool,
    workload_manager_type: str,
    det_rendezvous_port: str,
    det_trial_unique_port_offset: int,
    det_trial_runner_network_interface: str,
    det_trial_id: str,
    det_experiment_id: str,
    det_agent_id: str,
    det_cluster_id: str,
    det_task_token: str,
    trial_seed: int,
    managed_training: bool,
    test_mode: bool,
    on_cluster: bool,
):
    self.master_addr = master_addr
    self.master_port = master_port
    self.use_tls = use_tls
    self.master_cert_file = master_cert_file
    self.master_cert_name = master_cert_name
    self.container_id = container_id
    self.experiment_config = det.ExperimentConfig(experiment_config)
    self.hparams = hparams
    self.initial_workload = initial_workload
    self.latest_checkpoint = latest_checkpoint
    self.use_gpu = use_gpu
    self.container_gpus = container_gpus
    self.slot_ids = slot_ids
    self.debug = debug
    self.workload_manager_type = workload_manager_type
    self.det_rendezvous_port = det_rendezvous_port
    self.det_trial_unique_port_offset = det_trial_unique_port_offset
    self.det_trial_runner_network_interface = det_trial_runner_network_interface
    self.det_trial_id = det_trial_id
    self.det_experiment_id = det_experiment_id
    self.det_agent_id = det_agent_id
    self.det_cluster_id = det_cluster_id
    self.det_task_token = det_task_token
    self.trial_seed = trial_seed
    self.managed_training = managed_training
    self.test_mode = test_mode
    self.on_cluster = on_cluster

    self._per_slot_batch_size, self._global_batch_size = self._calculate_batch_sizes()
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[det.EnvContext, det.RendezvousInfo, horovod.HorovodContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(
            config, managed_training=managed_training, test_mode=test_mode
        )
    )
    hparams = hparams or api.generate_random_hparam_values(config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1, config.scheduling_unit()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )

    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0,
    )
    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )

    return env, rendezvous_info, hvd_config
def make_default_env_context(
    hparams: Dict[str, Any],
    experiment_config: Optional[Dict] = None,
    trial_seed: int = 0,
) -> det.EnvContext:
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from environment variables,
    # and we can get rid of the @expose_gpus fixture.
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu)

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="TRIAL_WORKLOAD_MANAGER",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_agent_id="1",
        det_cluster_id="uuid-123",
        det_task_token="",
        trial_seed=trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
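# Hypothetical usage sketch (not from the source): a unit test could build a default environment
# around a fixed set of hyperparameters and a known seed.
#
# env = make_default_env_context(
#     hparams={"global_batch_size": 32},
#     trial_seed=42,
# )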
def __init__(
    self,
    master_url: str,
    master_cert_file: Optional[str],
    master_cert_name: Optional[str],
    experiment_config: Dict[str, Any],
    hparams: Dict[str, Any],
    latest_checkpoint: Optional[str],
    steps_completed: int,
    use_gpu: bool,
    container_gpus: List[str],
    slot_ids: List[int],
    debug: bool,
    det_trial_unique_port_offset: int,
    det_trial_id: str,
    det_experiment_id: str,
    det_agent_id: str,
    det_cluster_id: str,
    trial_seed: int,
    trial_run_id: int,
    allocation_id: str,
    managed_training: bool,
    test_mode: bool,
    on_cluster: bool,
):
    self.master_url = master_url
    self.master_cert_file = master_cert_file
    self.master_cert_name = master_cert_name
    self.experiment_config = det.ExperimentConfig(experiment_config)
    self.hparams = hparams
    self.latest_checkpoint = latest_checkpoint
    self.steps_completed = steps_completed
    self.use_gpu = use_gpu
    self.container_gpus = container_gpus
    self.slot_ids = slot_ids
    self.debug = debug
    self.det_trial_unique_port_offset = det_trial_unique_port_offset
    self.det_trial_id = det_trial_id
    self.det_experiment_id = det_experiment_id
    self.det_agent_id = det_agent_id
    self.det_cluster_id = det_cluster_id
    self.trial_seed = trial_seed
    self.trial_run_id = trial_run_id
    self.allocation_id = allocation_id
    self.managed_training = managed_training
    self.test_mode = test_mode
    self.on_cluster = on_cluster
def _make_local_test_experiment_env(
    checkpoint_dir: pathlib.Path,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
) -> Tuple[det.EnvContext, workload.Stream, det.RendezvousInfo, horovod.HorovodContext]:
    config = det.ExperimentConfig(_make_local_test_experiment_config(config))
    hparams = hparams or _generate_test_hparam_values(config)
    use_gpu, container_gpus, slot_ids = _get_gpus()
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=1,
        container_id="test_mode",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1, config.batches_per_step()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="test_mode",
        trial_seed=config.experiment_seed(),
    )
    workloads = _make_test_workloads(checkpoint_dir.joinpath("checkpoint"), config)
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0,
    )
    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )

    return env, workloads, rendezvous_info, hvd_config
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    checkpoint_dir: str,
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[core.Context, det.EnvContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(
            config, checkpoint_dir, managed_training=managed_training, test_mode=test_mode
        )
    )
    hparams = hparams or api.generate_random_hparam_values(config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)

    env = det.EnvContext(
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        experiment_config=config,
        hparams=hparams,
        latest_checkpoint=None,
        steps_completed=0,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        det_trial_unique_port_offset=0,
        det_trial_id="",
        det_agent_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        trial_run_id=1,
        allocation_id="",
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )
    core_context = core._dummy_init()

    return core_context, env
def __init__(
    self,
    master_addr: str,
    master_port: int,
    container_id: str,
    experiment_config: Dict[str, Any],
    hparams: Dict[str, Any],
    initial_workload: workload.Workload,
    latest_checkpoint: Optional[Dict[str, Any]],
    use_gpu: bool,
    container_gpus: List[str],
    slot_ids: List[int],
    debug: bool,
    workload_manager_type: str,
    det_rendezvous_ports: str,
    det_trial_runner_network_interface: str,
    det_trial_id: str,
    det_experiment_id: str,
    det_cluster_id: str,
    trial_seed: int,
    training: bool = True,
):
    self.master_addr = master_addr
    self.master_port = master_port
    self.container_id = container_id
    self.experiment_config = det.ExperimentConfig(experiment_config)
    self.hparams = hparams
    self.initial_workload = initial_workload
    self.latest_checkpoint = latest_checkpoint
    self.use_gpu = use_gpu
    self.container_gpus = container_gpus
    self.slot_ids = slot_ids
    self.debug = debug
    self.workload_manager_type = workload_manager_type
    self.det_rendezvous_ports = det_rendezvous_ports
    self.det_trial_runner_network_interface = det_trial_runner_network_interface
    self.det_trial_id = det_trial_id
    self.det_experiment_id = det_experiment_id
    self.det_cluster_id = det_cluster_id
    self.trial_seed = trial_seed
    self.training = training

    self._per_slot_batch_size, self._global_batch_size = self._calculate_batch_sizes()
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Deprecated: please use your TrialContext's .from_config() method instead.

    Create a trial instance from a Trial class definition. This can be a useful utility for
    debugging your trial logic in any development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir: The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config: An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal default is used.
        hparams: An optional dictionary of hyperparameter values to use for the trial.
    """
    warnings.warn(
        "det.experimental.create_trial_instance() is now deprecated. Please use\n"
        "your TrialContext's .from_config() method instead. Example\n"
        "\n"
        "    context = PyTorchTrialContext.from_config()\n"
        "    my_trial = MyPyTorchTrial(context)\n",
        FutureWarning,
    )
    determined.common.set_logger(
        util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    )
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        managed_training=False, test_mode=False, config=config, hparams=hparams
    )
    trial_context = trial_def.trial_context_class(
        env, hvd_config, rendezvous_info=rendezvous_info
    )
    return trial_def(trial_context)
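# Sketch of the recommended replacement, mirroring the deprecation warning above
# (MyPyTorchTrial is a user-defined PyTorchTrial subclass, not part of this source):
#
# context = PyTorchTrialContext.from_config()
# my_trial = MyPyTorchTrial(context)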
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    determined.common.set_logger(
        util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    )

    if local:
        if not test:
            logging.warning("local training is not supported, testing instead")

        with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
            return test_one_batch(
                controller_cls=controller_cls,
                native_context_cls=native_context_cls,
                trial_class=trial_def,
                config=config,
            )
    else:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )
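# Hypothetical usage sketch (MyTrial is a user-defined det.Trial subclass, not part of this
# source): run a quick local test of one batch before submitting to a cluster.
#
# init_native(
#     trial_def=MyTrial,
#     config={"description": "native-api smoke test"},  # assumed minimal config
#     local=True,
#     test=True,
#     context_dir=".",
# )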
def _init_native(
    controller_cls: Type[det.TrialController],
    native_context_cls: Type[det.NativeContext],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    det._set_logger(util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled())

    if Mode(mode) == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            controller_cls.pre_execute_hook(
                env=load.RunpyGlobals.get_instance().env,
                hvd_config=load.RunpyGlobals.get_instance().hvd_config,
            )
            context = native_context_cls(
                env=load.RunpyGlobals.get_instance().env,
                hvd_config=load.RunpyGlobals.get_instance().hvd_config,
            )
            load.RunpyGlobals.set_runpy_native_result(context, controller_cls)
            context._set_train_fn(_stop_loading_implementation)
            return context
        else:
            create_experiment(
                config=config, context_dir=context_dir, command=command, master_url=master_url
            )
            logging.info("Exiting the program after submitting the experiment.")
            sys.exit(0)

    elif Mode(mode) == Mode.LOCAL:
        logging.info("Running a minimal test experiment locally")
        checkpoint_dir = tempfile.TemporaryDirectory()
        env, workloads, rendezvous_info, hvd_config = make_test_experiment_env(
            checkpoint_dir=pathlib.Path(checkpoint_dir.name), config=config
        )
        logging.info(f"Using hyperparameters: {env.hparams}")
        logging.debug(f"Using a test experiment config: {env.experiment_config}")

        controller_cls.pre_execute_hook(env=env, hvd_config=hvd_config)
        context = native_context_cls(env=env, hvd_config=hvd_config)

        def train_fn() -> None:
            controller = controller_cls.from_native(
                context=context,
                env=env,
                workloads=workloads,
                load_path=None,
                rendezvous_info=rendezvous_info,
                hvd_config=hvd_config,
            )
            controller.run()
            checkpoint_dir.cleanup()

        context._set_train_fn(train_fn)
        return context

    else:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")
def __init__(self, trial_inst: det.Trial, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)

    check.is_instance(trial_inst, PyTorchTrial, "PyTorchTrialController needs a PyTorchTrial")
    self.trial = cast(PyTorchTrial, trial_inst)
    self.context = cast(PyTorchTrialContext, self.context)
    self.callbacks = self.trial.build_callbacks()

    # TODO(DET-3262): remove this backward compatibility of old interface.
    if (
        util.is_overridden(self.trial.build_model, PyTorchTrial)
        or util.is_overridden(self.trial.optimizer, PyTorchTrial)
        or util.is_overridden(self.trial.create_lr_scheduler, PyTorchTrial)
    ):
        check.true(
            util.is_overridden(self.trial.build_model, PyTorchTrial)
            and util.is_overridden(self.trial.optimizer, PyTorchTrial),
            "Both build_model() and optimizer() must be defined "
            "if any of build_model(), optimizer(), and create_lr_scheduler() are defined. "
            "If you want to use the new interface, you should instead instantiate your models, "
            "optimizers, and LR schedulers in __init__ and call context.backward(loss) "
            "and context.step_optimizer(optimizer) in train_batch.",
        )

        model = self.context._Model(self.trial.build_model())
        optim = self.context._Optimizer(self.trial.optimizer(model))

        lr_scheduler = self.trial.create_lr_scheduler(optim)
        if lr_scheduler is not None:
            self.context.lr_schedulers.append(lr_scheduler)

        if det.ExperimentConfig(self.context.get_experiment_config()).mixed_precision_enabled():
            self.context._configure_apex_amp(
                models=model,
                optimizers=optim,
                opt_level=self.context.get_experiment_config()
                .get("optimizations", {})
                .get("mixed_precision", "O0"),
            )

        train_batch = self.trial.train_batch

        def new_train_batch(
            batch: TorchData, model: nn.Module, epoch_idx: int, batch_idx: int
        ) -> Union[torch.Tensor, Dict[str, Any]]:
            tr_metrics = train_batch(batch, model, epoch_idx, batch_idx)
            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}
            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )
            check.is_in(
                "loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.'
            )

            def clip_grads(parameters: Iterator) -> None:
                for callback in self.callbacks.values():
                    callback.on_before_optimizer_step(parameters)

            self.context._backward(tr_metrics["loss"])
            self.context._step_optimizer(self.context.optimizers[0], clip_grads=clip_grads)
            return tr_metrics

        self.trial.__setattr__("train_batch", new_train_batch)

    check.gt_eq(
        len(self.context.models),
        1,
        "Must have at least one model. "
        "This might be caused by not wrapping your model with Model()",
    )
    check.gt_eq(
        len(self.context.optimizers),
        1,
        "Must have at least one optimizer. "
        "This might be caused by not wrapping your optimizer with Optimizer()",
    )
    self._check_evaluate_implementation()

    # Validation loader will be undefined on process ranks > 0
    # when the user defines `validate_full_dataset()`.
    self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
    self._set_data_loaders()

    # If a load path is provided, load weights and restore the data location.
    self._load()

    if self.hvd_config.use:
        hvd.broadcast_parameters(self.context._main_model.state_dict(), root_rank=0)
        for optimizer in self.context.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    self.training_iterator = iter(self.training_loader)
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def: A class definition implementing the :class:`determined.Trial` interface.

        config: A dictionary representing the experiment configuration to be
            associated with the experiment.

        local: A boolean indicating if training should be done locally. When
            ``False``, the experiment will be submitted to the Determined cluster. Defaults to
            ``False``.

        test: A boolean indicating if the experiment should be shortened to a minimal loop
            of training on a small amount of data, performing validation, and checkpointing.
            ``test=True`` is useful for quick iteration during model porting or debugging because
            common errors will surface more quickly. Defaults to ``False``.

        context_dir: A string filepath that defines the context directory. All model code
            will be executed with this as the current working directory.

            When ``local=False``, this argument is required. All files in this directory will be
            uploaded to the Determined cluster. The total size of this directory must be under
            96 MB.

            When ``local=True``, this argument is optional and defaults to the current working
            directory.

        command: A list of strings that is used as the entrypoint of the training script in
            the Determined task environment. When executing this function via a Python script,
            this argument is inferred to be ``sys.argv`` by default. When executing this
            function via IPython or Jupyter notebook, this argument is required.

            Example: When creating an experiment by running ``python train.py --flag value``, the
            default command is inferred as ``["train.py", "--flag", "value"]``.

        master_url: An optional string to use as the Determined master URL when
            ``local=False``. If not specified, will be inferred from the environment variable
            ``DET_MASTER``.
    """

    if local and not test:
        raise NotImplementedError(
            "det.create(local=True, test=False) is not yet implemented. Please set local=False "
            "or test=True."
        )

    determined.common.set_logger(
        util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    )

    if local:
        # Local test mode.
        with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
            return test_one_batch(
                trial_class=trial_def,
                config=config,
            )
    elif not load.RunpyGlobals.is_initialized():
        # Cluster mode, but still running locally; submit the experiment.
        _submit_experiment(
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )
    else:
        # Cluster mode, now on the cluster; actually train.
        load.RunpyGlobals.set_runpy_trial_result(trial_def)
        raise det.errors.StopLoadingImplementation()
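# Hypothetical usage sketch (MyTrial is a user-defined det.Trial subclass, not part of this
# source): a quick local test run, per the ``local`` and ``test`` arguments documented above.
#
# if __name__ == "__main__":
#     create(
#         trial_def=MyTrial,
#         config={"description": "local smoke test"},  # assumed minimal config
#         local=True,
#         test=True,
#         context_dir=".",
#     )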
def _apply_backwards_compatibility(self) -> None:
    # TODO(DET-3262): remove this backward compatibility of old interface.
    if (
        util.is_overridden(self.trial.build_model, PyTorchTrial)
        or util.is_overridden(self.trial.optimizer, PyTorchTrial)
        or util.is_overridden(self.trial.create_lr_scheduler, PyTorchTrial)
    ):
        logging.warning(
            "build_model(), optimizer(), and create_lr_scheduler(), which belong to "
            "the old interface, are deprecated. Please see the following documentation "
            "of PyTorchTrial for the new interface \n"
            f"{PyTorchTrial.__doc__}"
        )
        logging.warning(
            "The callback on_before_optimizer_step is deprecated. "
            "Please use context.step_optimizer to clip gradients."
        )
        check.true(
            util.is_overridden(self.trial.build_model, PyTorchTrial)
            and util.is_overridden(self.trial.optimizer, PyTorchTrial),
            "Both build_model() and optimizer() must be defined "
            "if any of build_model(), optimizer(), and create_lr_scheduler() are defined. "
            "If you want to use the new interface, you should instead instantiate your models, "
            "optimizers, and LR schedulers in __init__ and call context.backward(loss) "
            "and context.step_optimizer(optimizer) in train_batch.",
        )

        model = self.context.wrap_model(self.trial.build_model())
        optim = self.context.wrap_optimizer(self.trial.optimizer(model))

        lr_scheduler = self.trial.create_lr_scheduler(optim)
        if lr_scheduler is not None:
            opt = getattr(lr_scheduler._scheduler, "optimizer", None)
            if opt is not None:
                check.is_in(
                    opt,
                    self.context.optimizers,
                    "Must use a wrapped optimizer that is passed in by the optimizer "
                    "argument of create_lr_scheduler",
                )
            self.context.lr_schedulers.append(lr_scheduler)

        if det.ExperimentConfig(self.context.get_experiment_config()).mixed_precision_enabled():
            logging.warning(
                "The experiment configuration field optimization.mixed_precision is deprecated. "
                "Please use configure_apex_amp in __init__ to configure apex amp. "
                "See the following documentation of PyTorchTrial for the new interface \n"
                f"{PyTorchTrial.__doc__}"
            )
            self.context.configure_apex_amp(
                models=model,
                optimizers=optim,
                opt_level=self.context.get_experiment_config()
                .get("optimizations", {})
                .get("mixed_precision", "O0"),
            )

        # Backward compatibility: train_batch
        train_batch = cast(Callable, self.trial.train_batch)

        def new_train_batch(batch: pytorch.TorchData, epoch_idx: int, batch_idx: int) -> Any:
            tr_metrics = train_batch(
                batch=batch,
                model=model,
                epoch_idx=epoch_idx,
                batch_idx=batch_idx,
            )
            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}
            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )
            check.is_in(
                "loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.'
            )

            def clip_grads(parameters: Iterator) -> None:
                for callback in self.callbacks.values():
                    callback.on_before_optimizer_step(parameters)

            self.context.backward(tr_metrics["loss"])
            self.context.step_optimizer(self.context.optimizers[0], clip_grads=clip_grads)

            return tr_metrics

        self.trial.__setattr__("train_batch", new_train_batch)

        # Backward compatibility: evaluate_batch
        if self._evaluate_batch_defined():
            evaluate_batch = cast(Callable, self.trial.evaluate_batch)

            def new_evaluate_batch(batch: pytorch.TorchData) -> Any:
                return evaluate_batch(model=model, batch=batch)

            self.trial.__setattr__("evaluate_batch", new_evaluate_batch)

        # Backward compatibility: evaluate_full_dataset
        if self._evaluate_full_dataset_defined():
            evaluate_full_dataset = cast(Callable, self.trial.evaluate_full_dataset)

            def new_evaluate_full_dataset(data_loader: torch.utils.data.DataLoader) -> Any:
                return evaluate_full_dataset(model=model, data_loader=data_loader)

            self.trial.__setattr__("evaluate_full_dataset", new_evaluate_full_dataset)
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> None:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def: A class definition implementing the ``det.Trial`` interface.

        config: A dictionary representing the experiment configuration to be
            associated with the experiment.

        mode: The :py:class:`determined.experimental.Mode` used when creating an experiment.

            1. ``Mode.CLUSTER`` (default): Submit the experiment to a remote Determined cluster.

            2. ``Mode.LOCAL``: Test the experiment in the calling Python process for local
               development / debugging purposes. Run through a minimal loop of training,
               validation, and checkpointing steps.

        context_dir: A string filepath that defines the context directory. All model code
            will be executed with this as the current working directory.

            In CLUSTER mode, this argument is required. All files in this directory will be
            uploaded to the Determined cluster. The total size of this directory must be under
            96 MB.

            In LOCAL mode, this argument is optional and assumed to be the current working
            directory by default.

        command: A list of strings that is used as the entrypoint of the training script in
            the Determined task environment. When executing this function via a python script,
            this argument is inferred to be ``sys.argv`` by default. When executing this
            function via IPython or Jupyter notebook, this argument is required.

            Example: When creating an experiment by running "python train.py --flag value", the
            default command is inferred as ["train.py", "--flag", "value"].

        master_url: An optional string to use as the Determined master URL in submit mode. If
            not specified, will be inferred from the environment variable ``DET_MASTER``.
    """
    det._set_logger(util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled())

    if Mode(mode) == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            load.RunpyGlobals.set_runpy_trial_result(
                trial_def, cast(Type[det.TrialController], trial_def.trial_controller_class)
            )
            _stop_loading_implementation()
        else:
            create_experiment(
                config=config, context_dir=context_dir, command=command, master_url=master_url
            )
    elif Mode(mode) == Mode.LOCAL:
        context_path = pathlib.Path(context_dir) if context_dir else pathlib.Path.cwd()
        test_one_batch(context_path, trial_class=trial_def, config=config)
    else:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")
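# Hypothetical usage sketch (MyTrial is a user-defined det.Trial subclass, not part of this
# source): a local debugging run, per the ``Mode.LOCAL`` option documented above.
#
# create(
#     trial_def=MyTrial,
#     config={"description": "local debug run"},  # assumed minimal config
#     mode=Mode.LOCAL,
#     context_dir=".",
# )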
        pass
    return new_dict


if __name__ == "__main__":
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config. The experiment config is not a stable API!
    experiment_config = det.ExperimentConfig(info.trial._config)

    determined.common.set_logger(experiment_config.debug_enabled())

    logging.info(
        f"New trial runner in (container {resources_id}) on agent {info.agent_id}: "
        + json.dumps(mask_config_dict(info.trial._config))
    )

    # Perform validations
    try:
        logging.info("Validating checkpoint storage ...")
        storage.validate_config(
            experiment_config.get_checkpoint_storage(),
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
def main(script: List[str]) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    experiment_config = det.ExperimentConfig(info.trial._config)
    determined.common.set_logger(experiment_config.debug_enabled())

    multi_machine = len(info.container_addrs) > 1
    check_deepspeed_version(multi_machine)

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training
    # code can function with a different launch layer in a different environment. Inside
    # Determined, the easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers.
    os.environ["DET_CHIEF_IP"] = chief_ip

    # If the NCCL_SOCKET_IFNAME environment variable wasn't explicitly set by
    # the user in the experiment's YAML file, then set it to the distributed
    # network interface, if the value of "dtrain_network_interface" under
    # "task_container_defaults" has been set in the "master.yaml".
    if is_using_cuda() and not is_nccl_socket_ifname_env_var_set():
        dtrain_network_interface = os.environ.get("DET_INTER_NODE_NETWORK_INTERFACE", None)

        if dtrain_network_interface is not None and len(dtrain_network_interface) > 0:
            os.environ["NCCL_SOCKET_IFNAME"] = dtrain_network_interface

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all
        # non-daemon containers (the deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.
        # This is useful for deepspeed, which does not have good error handling for remote
        # processes spun up by pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # We always need to set this variable to initialize the context correctly, even in the
    # single-slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    hostfile_path = get_hostfile_path(multi_machine)
    master_address = create_hostlist_file(
        hostfile_path=hostfile_path,
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    if not multi_machine:
        p = subprocess.Popen(full_cmd)
        with det.util.forward_signals(p):
            return p.wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()

    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # The chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # The chief machine waits for every worker's sshd to be available. All machines should
        # be close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        p = subprocess.Popen(full_cmd)
        with det.util.forward_signals(p):
            return p.wait()
    finally:
        sshd_process.kill()
        sshd_process.wait()
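# Minimal entrypoint sketch (assumed, not from the source): the launcher would typically be
# invoked with the remaining command line as the worker script.
#
# if __name__ == "__main__":
#     sys.exit(main(sys.argv[1:]))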