示例#1
0
def make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Generate the workload sequence for a short test experiment.

    The stream trains one step, runs one validation, issues a checkpoint
    workload for ``checkpoint_dir`` and finally issues a terminate workload,
    printing progress messages along the way.

    Args:
        checkpoint_dir: Directory sent along with the checkpoint workload.
        config: Experiment configuration; its ``batches_per_step()`` value is
            sent with the training workload and compared against the number
            of batch metrics received.

    Yields:
        The items produced by the response interceptor for the training and
        validation workloads, followed by the checkpoint and terminate tuples.
    """
    print("Start training a test experiment.")
    responses = workload.WorkloadResponseInterceptor()

    # Train a single step, then compare the number of returned batch metrics
    # against the configured batches per step.
    print("Training 1 step.")
    yield from responses.send(
        workload.train_workload(1),
        [config.batches_per_step()],
    )
    batch_metrics = responses.metrics_result()["batch_metrics"]
    check.eq(len(batch_metrics), config.batches_per_step())
    print(f"Finished training. Metrics: {batch_metrics}")

    # Run one validation workload and report its metrics.
    print("Validating.")
    yield from responses.send(workload.validation_workload(1), [])
    v_metrics = responses.metrics_result()["validation_metrics"]
    print(f"Finished validating. Validation metrics: {v_metrics}")

    # The checkpoint and terminate responses are not inspected.
    print(f"Saving a checkpoint to {checkpoint_dir}")
    checkpoint_item = (
        workload.checkpoint_workload(),
        [checkpoint_dir],
        workload.ignore_workload_response,
    )
    yield checkpoint_item
    print(f"Finished saving a checkpoint to {checkpoint_dir}.")

    terminate_item = (
        workload.terminate_workload(),
        [],
        workload.ignore_workload_response,
    )
    yield terminate_item
    print("The test experiment passed.")
示例#2
0
def launch(experiment_config: det.ExperimentConfig) -> int:
    """Run the experiment's entrypoint as a subprocess and wait for it.

    A string entrypoint that matches the legacy trial-class form is expanded
    into a ``determined.launch.horovod`` invocation; any other string is run
    through ``sh -c``. A non-string entrypoint is handed to
    ``subprocess.Popen`` unchanged.

    Args:
        experiment_config: Configuration providing ``get_entrypoint()``.

    Returns:
        The result of waiting on the launched process.
    """
    command = experiment_config.get_entrypoint()

    if isinstance(command, str):
        if det.util.match_legacy_trial_class(command):
            # Legacy entrypoint ("model_def:Trial") detected
            command = [
                "python3",
                "-m",
                "determined.launch.horovod",
                "--autohorovod",
                "--trial",
                command,
            ]
        else:
            # Any other string is treated as a shell command line.
            command = ["sh", "-c", command]

    resources_type = os.environ.get("DET_RESOURCES_TYPE")
    if resources_type == prep_container.RESOURCES_TYPE_SLURM_JOB:
        # SLURM sends SIGTERM to notify of pending preemption
        signal.signal(signal.SIGTERM, trigger_preemption)

    logging.info(f"Launching: {command}")

    process = subprocess.Popen(command)
    return process.wait()
def _make_test_workloads(config: det.ExperimentConfig) -> workload.Stream:
    """Generate a train / validate / checkpoint workload sequence for testing.

    Args:
        config: Experiment configuration; its ``scheduling_unit()`` value is
            compared against the number of batch metrics received from
            training.

    Yields:
        The items produced by the response interceptor for the training and
        validation workloads, followed by the checkpoint tuple.
    """
    responses = workload.WorkloadResponseInterceptor()

    # Train, then compare the number of batch metrics against the scheduling unit.
    logging.info("Training one batch")
    yield from responses.send(workload.train_workload(1))
    batch_metrics = responses.metrics_result()["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.info(f"Finished training, metrics: {batch_metrics}")

    # Run one validation workload and log its metrics.
    logging.info("Validating one batch")
    yield from responses.send(workload.validation_workload(1))
    v_metrics = responses.metrics_result()["metrics"]["validation_metrics"]
    logging.info(f"Finished validating, validation metrics: {v_metrics}")

    # The checkpoint response is not inspected.
    logging.info("Saving a checkpoint.")
    checkpoint_item = (
        workload.checkpoint_workload(),
        workload.ignore_workload_response,
    )
    yield checkpoint_item
    logging.info("Finished saving a checkpoint.")
示例#4
0
def _make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Generate the workload sequence for a short test experiment.

    The stream trains once, validates once, issues a checkpoint workload for
    ``checkpoint_dir`` and finally issues a terminate workload, logging
    progress along the way.

    Args:
        checkpoint_dir: Directory sent along with the checkpoint workload.
        config: Experiment configuration; its ``scheduling_unit()`` value is
            compared against the number of batch metrics received from
            training.

    Yields:
        The items produced by the response interceptor for the training and
        validation workloads, followed by the checkpoint and terminate tuples.
    """
    responses = workload.WorkloadResponseInterceptor()

    # Train, then compare the number of batch metrics against the scheduling unit.
    logging.info("Training one batch")
    yield from responses.send(workload.train_workload(1), [])
    batch_metrics = responses.metrics_result()["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.debug(f"Finished training, metrics: {batch_metrics}")

    # Run one validation workload and log its metrics at debug level.
    logging.info("Validating one step")
    yield from responses.send(workload.validation_workload(1), [])
    v_metrics = responses.metrics_result()["metrics"]["validation_metrics"]
    logging.debug(f"Finished validating, validation metrics: {v_metrics}")

    # The checkpoint and terminate responses are not inspected.
    logging.info(f"Saving a checkpoint to {checkpoint_dir}.")
    checkpoint_item = (
        workload.checkpoint_workload(),
        [checkpoint_dir],
        workload.ignore_workload_response,
    )
    yield checkpoint_item
    logging.info(f"Finished saving a checkpoint to {checkpoint_dir}.")

    terminate_item = (
        workload.terminate_workload(),
        [],
        workload.ignore_workload_response,
    )
    yield terminate_item
    logging.info("The test experiment passed.")