def make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Generate the workload sequence used by a test experiment.

    The generator emits, in order: a train workload, a validation workload,
    a checkpoint workload and a terminate workload, printing progress to
    stdout along the way.

    Args:
        checkpoint_dir: Path passed along with the checkpoint workload.
        config: Experiment config; ``batches_per_step()`` is sent with the
            train workload and compared against the number of batch metrics
            that come back.

    Yields:
        Whatever ``interceptor.send`` yields for the train and validation
        workloads, then ``(workload, args, response_func)`` tuples for the
        checkpoint and terminate workloads.
    """
    print("Start training a test experiment.")
    responses = workload.WorkloadResponseInterceptor()

    # --- Train ---
    print("Training 1 step.")
    train_request = workload.train_workload(1)
    train_args = [config.batches_per_step()]
    yield from responses.send(train_request, train_args)
    per_batch = responses.metrics_result()["batch_metrics"]
    # One metrics entry is expected per batch in the step.
    check.eq(len(per_batch), config.batches_per_step())
    print(f"Finished training. Metrics: {per_batch}")

    # --- Validate ---
    print("Validating.")
    validation_request = workload.validation_workload(1)
    yield from responses.send(validation_request, [])
    val_metrics = responses.metrics_result()["validation_metrics"]
    print(f"Finished validating. Validation metrics: {val_metrics}")

    # --- Checkpoint ---
    # The response to this workload is not inspected.
    print(f"Saving a checkpoint to {checkpoint_dir}")
    yield (
        workload.checkpoint_workload(),
        [checkpoint_dir],
        workload.ignore_workload_response,
    )
    print(f"Finished saving a checkpoint to {checkpoint_dir}.")

    # --- Terminate ---
    yield (
        workload.terminate_workload(),
        [],
        workload.ignore_workload_response,
    )
    print("The test experiment passed.")
def launch(experiment_config: det.ExperimentConfig) -> int:
    """Run the experiment's entrypoint as a subprocess and wait for it.

    The entrypoint from the config is normalised into an argv list:

    * a string matching the legacy trial-class form ("model_def:Trial") is
      wrapped in a ``determined.launch.horovod`` invocation;
    * any other string is run through ``sh -c``;
    * a non-string entrypoint is passed to ``subprocess.Popen`` unchanged.

    Args:
        experiment_config: Config object providing ``get_entrypoint()``.

    Returns:
        The value returned by ``Popen.wait()`` for the launched process.
    """
    command = experiment_config.get_entrypoint()

    if isinstance(command, str):
        if det.util.match_legacy_trial_class(command):
            # Legacy entrypoint ("model_def:Trial") detected
            command = [
                "python3",
                "-m",
                "determined.launch.horovod",
                "--autohorovod",
                "--trial",
                command,
            ]
        else:
            # Plain string entrypoints are handed to the shell.
            command = ["sh", "-c", command]

    resources_type = os.environ.get("DET_RESOURCES_TYPE")
    if resources_type == prep_container.RESOURCES_TYPE_SLURM_JOB:
        # SLURM sends SIGTERM to notify of pending preemption
        signal.signal(signal.SIGTERM, trigger_preemption)

    logging.info(f"Launching: {command}")
    child = subprocess.Popen(command)
    return child.wait()
def _make_test_workloads(config: det.ExperimentConfig) -> workload.Stream:
    """Generate a train / validate / checkpoint workload sequence for testing.

    Args:
        config: Experiment config; ``scheduling_unit()`` is compared against
            the number of batch metrics returned by the train workload.

    Yields:
        Whatever ``interceptor.send`` yields for the train and validation
        workloads, then a ``(workload, response_func)`` tuple for the
        checkpoint workload.
    """
    responses = workload.WorkloadResponseInterceptor()

    # --- Train ---
    logging.info("Training one batch")
    train_request = workload.train_workload(1)
    yield from responses.send(train_request)
    train_result = responses.metrics_result()
    per_batch = train_result["metrics"]["batch_metrics"]
    # One metrics entry is expected per batch in the scheduling unit.
    check.eq(len(per_batch), config.scheduling_unit())
    logging.info(f"Finished training, metrics: {per_batch}")

    # --- Validate ---
    logging.info("Validating one batch")
    validation_request = workload.validation_workload(1)
    yield from responses.send(validation_request)
    validation_result = responses.metrics_result()
    val_metrics = validation_result["metrics"]["validation_metrics"]
    logging.info(f"Finished validating, validation metrics: {val_metrics}")

    # --- Checkpoint ---
    # The response to this workload is not inspected.
    logging.info("Saving a checkpoint.")
    yield (workload.checkpoint_workload(), workload.ignore_workload_response)
    logging.info("Finished saving a checkpoint.")
def _make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Generate the workload sequence used by a test experiment.

    The generator emits, in order: a train workload, a validation workload,
    a checkpoint workload and a terminate workload, logging progress along
    the way (metric contents are logged at debug level).

    Args:
        checkpoint_dir: Path passed along with the checkpoint workload.
        config: Experiment config; ``scheduling_unit()`` is compared against
            the number of batch metrics returned by the train workload.

    Yields:
        Whatever ``interceptor.send`` yields for the train and validation
        workloads, then ``(workload, args, response_func)`` tuples for the
        checkpoint and terminate workloads.
    """
    responses = workload.WorkloadResponseInterceptor()

    # --- Train ---
    logging.info("Training one batch")
    train_request = workload.train_workload(1)
    yield from responses.send(train_request, [])
    train_result = responses.metrics_result()
    per_batch = train_result["metrics"]["batch_metrics"]
    # One metrics entry is expected per batch in the scheduling unit.
    check.eq(len(per_batch), config.scheduling_unit())
    logging.debug(f"Finished training, metrics: {per_batch}")

    # --- Validate ---
    logging.info("Validating one step")
    validation_request = workload.validation_workload(1)
    yield from responses.send(validation_request, [])
    validation_result = responses.metrics_result()
    val_metrics = validation_result["metrics"]["validation_metrics"]
    logging.debug(f"Finished validating, validation metrics: {val_metrics}")

    # --- Checkpoint ---
    # The response to this workload is not inspected.
    logging.info(f"Saving a checkpoint to {checkpoint_dir}.")
    yield (
        workload.checkpoint_workload(),
        [checkpoint_dir],
        workload.ignore_workload_response,
    )
    logging.info(f"Finished saving a checkpoint to {checkpoint_dir}.")

    # --- Terminate ---
    yield (
        workload.terminate_workload(),
        [],
        workload.ignore_workload_response,
    )
    logging.info("The test experiment passed.")