def checkpoint(self, already_exiting: bool) -> WorkloadGenerator:
    self.core_context.train.set_status("checkpointing")

    # Update the last_ckpt now so it can be captured by get_state() after we yield.
    self.state.last_ckpt = self.state.steps_completed

    wkld = workload.Workload(
        kind=workload.Workload.Kind.CHECKPOINT_MODEL,
        e_id=self._exp_id,
        t_id=self._trial_id,
        s_id=self.state.step_id,
        num_batches=0,
        total_batches_processed=self.state.steps_completed,
    )
    response = yield from yield_and_await_response(wkld)

    if isinstance(response, workload.InvalidHP):
        self.core_context.train.report_early_exit(core.EarlyExitReason.INVALID_HP)
        if not already_exiting:
            raise ShouldExit(skip_exit_checkpoint=True)
        return

    if already_exiting:
        return

    if response.get("stop_requested"):
        raise ShouldExit()

    self.check_for_preemption()
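# A minimal driver sketch for the WorkloadGenerators above. It assumes the
# send()-based protocol implied by yield_and_await_response(): each yielded
# workload is executed elsewhere and its response is sent back into the
# generator. The execute callback is hypothetical, not part of the real API.
from typing import Any, Callable

def drive(gen: WorkloadGenerator, execute: "Callable[[workload.Workload], Any]") -> None:
    try:
        wkld = next(gen)  # first workload yielded by the generator
        while True:
            # Run the workload and resume the generator with its response.
            wkld = gen.send(execute(wkld))
    except StopIteration:
        pass  # the generator returned; the sequence is complete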
def make_default_env_context(
    hparams: Dict[str, Any],
    experiment_config: Optional[Dict] = None,
    trial_seed: int = 0,
) -> det.EnvContext:
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from environment
    # variables, and we can get rid of the @expose_gpus fixture.
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu)

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP, ExperimentID(1), TrialID(1), StepID(1)
        ),
        master_addr="",
        master_port=0,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
    )
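# A hypothetical usage sketch for the fixture above; hyperparameter names other
# than global_batch_size are made up for illustration.
def test_env_context_uses_trial_seed() -> None:
    env = make_default_env_context(
        hparams={"global_batch_size": 32, "learning_rate": 0.01},
        trial_seed=42,
    )
    assert env.trial_seed == 42
    assert env.hparams["global_batch_size"] == 32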
def create_default_env_context(experiment_config: Dict[str, Any]) -> det.EnvContext:
    det_trial_runner_network_interface = constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams={"global_batch_size": 32},
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=det_trial_runner_network_interface,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
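# Hypothetical usage; the config keys here are guesses at what
# det.ExperimentConfig(...).scheduling_unit() and downstream code read, not a
# complete or verified experiment config.
env = create_default_env_context(
    {"scheduling_unit": 100, "searcher": {"metric": "loss"}}
)
assert env.hparams == {"global_batch_size": 32}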
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        container_id="",
        experiment_config={"resources": {"slots_per_trial": 1, "native_parallel": False}},
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined_common.types.ExperimentID(1),
            determined_common.types.TrialID(1),
            determined_common.types.StepID(1),
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
def train(self, num_batches: int, op: core.SearcherOperation) -> WorkloadGenerator:
    # Report a train step is starting.
    self.core_context.train.set_status("training")

    wkld = workload.Workload(
        kind=workload.Workload.Kind.RUN_STEP,
        e_id=self._exp_id,
        t_id=self._trial_id,
        s_id=self.state.step_id + 1,
        num_batches=num_batches,
        total_batches_processed=self.state.steps_completed,
    )
    response = yield from yield_and_await_response(wkld)

    # The train step is complete; process the result.
    if isinstance(response, workload.InvalidHP):
        # Exit before reporting metrics (which would be empty anyway).
        self.core_context.train.report_early_exit(core.EarlyExitReason.INVALID_HP)
        raise ShouldExit()

    metrics = response.get("metrics", {}).get("avg_metrics", {})
    batch_metrics = response.get("metrics", {}).get("batch_metrics", [])

    self.state.steps_completed += num_batches
    self.state.step_id += 1
    self.core_context.train.report_training_metrics(
        steps_completed=self.state.steps_completed,
        metrics=metrics,
        batch_metrics=batch_metrics,
    )

    # Report progress to the searcher. For historical reasons we only deal in batches.
    if self._unit == core.Unit.BATCHES:
        op.report_progress(self.state.steps_completed)
    elif self._unit == core.Unit.RECORDS:
        op.report_progress(self.global_batch_size * self.state.steps_completed)
    elif self._unit == core.Unit.EPOCHS:
        op.report_progress(self.state.steps_completed / self.as_batches(epochs=1))
    else:
        raise ValueError(f"unrecognized searcher op unit: {self._unit}")

    if response.get("stop_requested"):
        # Exit after reporting metrics.
        raise ShouldExit()

    self.check_for_preemption()
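# A hedged sketch of driving a single train step, assuming the response shape
# that train() reads above (avg_metrics, batch_metrics, stop_requested). The
# sequencer and op objects are stand-ins; their construction is elided.
gen = sequencer.train(num_batches=100, op=op)
wkld = next(gen)  # the RUN_STEP workload to execute
response = {
    "metrics": {
        "avg_metrics": {"loss": 0.42},
        "batch_metrics": [{"loss": 0.50}, {"loss": 0.34}],
    },
    "stop_requested": False,
}
try:
    gen.send(response)  # train() resumes, reports metrics, and returns
except StopIteration:
    pass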
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config={"resources": {"slots_per_trial": 1, "native_parallel": False}},
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined.common.types.ExperimentID(1),
            determined.common.types.TrialID(1),
            determined.common.types.StepID(1),
            constants.DEFAULT_SCHEDULING_UNIT,
            0,
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_port="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_agent_id="1",
        det_experiment_id="1",
        det_task_token="",
        det_cluster_id="uuid-123",
        trial_seed=0,
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
def validate(self, op: Optional[core.SearcherOperation]) -> WorkloadGenerator:
    # Report a validation step is starting.
    self.core_context.train.set_status("validating")

    wkld = workload.Workload(
        kind=workload.Workload.Kind.COMPUTE_VALIDATION_METRICS,
        e_id=self._exp_id,
        t_id=self._trial_id,
        s_id=self.state.step_id,
        num_batches=0,
        total_batches_processed=self.state.steps_completed,
    )
    response = yield from yield_and_await_response(wkld)

    # The validation step is complete; process the result.
    if isinstance(response, workload.InvalidHP):
        self.core_context.train.report_early_exit(core.EarlyExitReason.INVALID_HP)
        raise ShouldExit()

    metrics = response["metrics"]["validation_metrics"]

    # Check that the validation metrics computed by the model code include the
    # metric used by the search method.
    searcher_metric_name = self.env.experiment_config["searcher"]["metric"]
    if searcher_metric_name not in metrics:
        raise RuntimeError(
            f"Search method is configured to use metric '{searcher_metric_name}' but model "
            f"definition returned validation metrics {list(metrics.keys())}. The metric "
            "used by the search method must be one of the validation "
            "metrics returned by the model definition."
        )

    # Check that the searcher metric has a scalar value so that it can be compared for
    # search purposes. Other metrics don't have to be scalars.
    searcher_metric = metrics[searcher_metric_name]
    if not tensorboard.metric_writers.util.is_numerical_scalar(searcher_metric):
        raise RuntimeError(
            f"Searcher validation metric '{searcher_metric_name}' returned "
            f"a non-scalar value: {searcher_metric}"
        )

    # Report to the searcher API first, so we don't end up in a situation where we die between
    # reporting to the metrics API and when we come back we refuse to repeat a validation, but
    # we also don't have any validation metrics to report to the searcher API.
    #
    # A simpler solution here would be to execute in the following order (which would be
    # suitable for most customers to implement on their own):
    #   - validation
    #   - report to metrics API
    #   - report to searcher API
    #   - checkpoint
    #
    # But we can't do that without breaking behavior.
    if op is not None and self.batches_until_op_complete(op) < 1:
        op.report_completed(searcher_metric)

    if self.ckpt_policy == "best" and not self.checkpoint_is_current():
        # Before reporting our own validation metric, check what the best known validation
        # is without it.
        best_validation_before = self.core_context.train.get_experiment_best_validation()

    self.state.last_val = self.state.steps_completed
    self.core_context.train.report_validation_metrics(
        steps_completed=self.state.steps_completed,
        metrics=metrics,
    )

    if response.get("stop_requested"):
        raise ShouldExit()

    if not self.checkpoint_is_current():
        if self.ckpt_policy == "all" or (
            self.ckpt_policy == "best"
            and self.is_best_validation(now=searcher_metric, before=best_validation_before)
        ):
            yield from self.checkpoint(already_exiting=False)

    self.check_for_preemption()
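# A minimal sketch of how a caller might honor ShouldExit, assuming the
# skip_exit_checkpoint flag passed to ShouldExit in checkpoint() above is
# stored on the exception; drive() refers to the hypothetical driver sketched
# after checkpoint().
try:
    drive(sequencer.validate(op), execute)
except ShouldExit as e:
    if not e.skip_exit_checkpoint:
        # Take one final checkpoint before exiting, without re-raising.
        drive(sequencer.checkpoint(already_exiting=True), execute)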