Example #1
def test_mnist_estimator_const(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0].steps
    assert len(steps) == 1

    step = steps[0]
    assert "validation" in step

    v_metrics = step.validation.metrics["validation_metrics"]

    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert v_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].id)
    for step in full_trial_metrics.steps:
        metrics = step.metrics

        batch_metrics = metrics["batch_metrics"]
        assert len(batch_metrics) == 100

        for batch_metric in batch_metrics:
            assert batch_metric["loss"] > 0
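The assertions above imply a particular shape for the trial-metrics payload; a rough sketch of a single step, inferred only from the accesses in this example (illustrative, not an API reference):

# Shape implied by the assertions above (illustrative only).
example_step = {
    "metrics": {
        "batch_metrics": [
            {"loss": 0.42},  # one dict per batch, each with a positive loss
            # ... 100 entries per step for this fixture
        ],
    },
}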
Example #2
def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    d = Determined(conf.make_master_url())

    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    assert mnist.metadata == {"testing": "override"}

    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint.uuid)
    assert model_version.version == 1
    assert mnist.get_version().uuid == checkpoint.uuid

    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name
            for m in models] == ["mnist", "object-detection", "transformer"]
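A registered model can presumably be looked up again by name from another client; a minimal sketch, assuming the SDK's get_model accessor mirrors create_model (the lookup itself does not appear in the example):

from determined.experimental import Determined  # import path assumed

d = Determined(conf.make_master_url())
mnist = d.get_model("mnist")   # look up by the name used at registration
latest = mnist.get_version()   # latest registered version, as asserted above
print(latest.uuid, mnist.metadata)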
Example #3
def test_pytorch_11_const(aggregation_frequency: int) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(config, aggregation_frequency)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_pytorch"), 1)
Example #4
def test_pytorch_11_const(aggregation_frequency: int, using_k8s: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(config, aggregation_frequency)

    if using_k8s:
        pod_spec = {
            "metadata": {"labels": {"ci": "testing"}},
            "spec": {
                "containers": [
                    {
                        "name": "determined-container",
                        "volumeMounts": [{"name": "temp1", "mountPath": "/random"}],
                    }
                ],
                "volumes": [{"name": "temp1", "emptyDir": {}}],
            },
        }
        config = conf.set_pod_spec(config, pod_spec)

    exp.run_basic_test_with_temp_config(config,
                                        conf.tutorials_path("mnist_pytorch"),
                                        1)
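conf.set_pod_spec is a test-utility helper that is not shown in the listing. A guess at its behavior, based on the fact that Determined reads custom pod specs from the experiment config's environment section (an assumption, not the real helper):

from typing import Any, Dict

def set_pod_spec(config: Dict[str, Any], pod_spec: Dict[str, Any]) -> Dict[str, Any]:
    # Attach the Kubernetes pod spec under environment.pod_spec (assumed location).
    config.setdefault("environment", {})["pod_spec"] = pod_spec
    return config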
Example #5
def test_maskrcnn_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)
Example #6
def test_streaming_metrics_api() -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=7)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )
    # To fully test the streaming APIs, the requests need to start running immediately after the
    # experiment is created, and then stay open until the experiment is complete. To accomplish
    # this with all of the API calls on a single experiment, we spawn them all in threads.

    # The HP importance portion of this test is commented out until the feature is enabled by
    # default.

    metric_names_thread = pool.apply_async(request_metric_names,
                                           (experiment_id, ))
    train_metric_batches_thread = pool.apply_async(
        request_train_metric_batches, (experiment_id, ))
    valid_metric_batches_thread = pool.apply_async(
        request_valid_metric_batches, (experiment_id, ))
    train_trials_snapshot_thread = pool.apply_async(
        request_train_trials_snapshot, (experiment_id, ))
    valid_trials_snapshot_thread = pool.apply_async(
        request_valid_trials_snapshot, (experiment_id, ))
    train_trials_sample_thread = pool.apply_async(request_train_trials_sample,
                                                  (experiment_id, ))
    valid_trials_sample_thread = pool.apply_async(request_valid_trials_sample,
                                                  (experiment_id, ))

    metric_names_results = metric_names_thread.get()
    train_metric_batches_results = train_metric_batches_thread.get()
    valid_metric_batches_results = valid_metric_batches_thread.get()
    train_trials_snapshot_results = train_trials_snapshot_thread.get()
    valid_trials_snapshot_results = valid_trials_snapshot_thread.get()
    train_trials_sample_results = train_trials_sample_thread.get()
    valid_trials_sample_results = valid_trials_sample_thread.get()

    if metric_names_results is not None:
        pytest.fail("metric-names: %s. Results: %s" % metric_names_results)
    if train_metric_batches_results is not None:
        pytest.fail("metric-batches (training): %s. Results: %s" %
                    train_metric_batches_results)
    if valid_metric_batches_results is not None:
        pytest.fail("metric-batches (validation): %s. Results: %s" %
                    valid_metric_batches_results)
    if train_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (training): %s. Results: %s" %
                    train_trials_snapshot_results)
    if valid_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (validation): %s. Results: %s" %
                    valid_trials_snapshot_results)
    if train_trials_sample_results is not None:
        pytest.fail("trials-sample (training): %s. Results: %s" %
                    train_trials_sample_results)
    if valid_trials_sample_results is not None:
        pytest.fail("trials-sample (validation): %s. Results: %s" %
                    valid_trials_sample_results)
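The failure checks above rely on a convention for the request_* helpers: each returns None on success, or a (message, results) tuple that is unpacked into the two-placeholder pytest.fail format string. A stripped-down sketch of that convention; the function name, the events parameter, and the trainingMetrics field are illustrative assumptions, since the real helpers read the master's streaming API directly:

from typing import Any, Dict, List, Optional, Tuple

def check_metric_names(events: List[Dict[str, Any]]) -> Optional[Tuple[str, List[Any]]]:
    results: List[Any] = []
    for event in events:  # stand-in for responses read off the streaming endpoint
        results.append(event)
        if "loss" in event.get("trainingMetrics", []):
            return None  # success: the expected training metric name appeared
    return ("did not see the expected training metric names", results)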
Example #7
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validation_metrics"]["validation_loss"] for c in top_k]

    assert metrics == sorted(metrics)

    # Check that changing smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    assert checkpoint.metadata == {"testing": "metadata"}

    checkpoint.add_metadata({"some_key": "some_value"})
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}

    checkpoint.add_metadata({"testing": "override"})
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}

    checkpoint.remove_metadata(["some_key"])
    assert checkpoint.metadata == {"testing": "override"}
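The ordering assertions above can be mirrored locally. Purely as an illustration (not part of the SDK), this is the sort that top_n_checkpoints is expected to produce, using the same metric accessor as the example:

def sort_checkpoints(checkpoints, sort_by="validation_loss", smaller_is_better=True):
    # Sort ascending when smaller is better, descending otherwise.
    def metric(c):
        return c.validation["metrics"]["validation_metrics"][sort_by]
    return sorted(checkpoints, key=metric, reverse=not smaller_is_better)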
Example #8
def test_start_shell_with_template() -> None:
    template_name = "test_start_shell_with_template"
    tpl.set_template(template_name,
                     conf.fixtures_path("templates/template.yaml"))

    with cmd.interactive_command("shell", "start", "--template", template_name,
                                 "--detach"):
        pass
Example #9
def test_start_command_with_template() -> None:
    template_name = "test_start_command_with_template"
    tpl.set_template(template_name,
                     conf.fixtures_path("templates/template.yaml"))

    with cmd.interactive_command("command", "run", "--template", template_name,
                                 "--detach", "sleep infinity"):
        pass
Example #10
def test_custom_reducer_distributed(secrets: Dict[str, str], tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/distributed.yaml"))
    # Run with multiple steps to verify that reducers are reset correctly.
    config = conf.set_max_length(config, {"batches": 2})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )

    trial = exp.experiment_trials(experiment_id)[0]
    last_validation = trial["steps"][len(trial["steps"]) - 1]["validation"]
    metrics = last_validation["metrics"]["validation_metrics"]
    label_sum = 2 * sum(range(16))
    assert metrics["label_sum_fn"] == label_sum
    assert metrics["label_sum_cls"] == label_sum
Example #11
def test_mnist_estimator_adaptive_with_data_layer() -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config)
    config = conf.set_shared_fs_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("data_layer_mnist_estimator"), None)
Example #12
def test_mnist_estimator_adaptive(tf2: bool) -> None:
    # Only test tf1 here, because a tf2 test would add no extra coverage.
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), None)
Example #13
def _test_rng_restore(fixture: str, metrics: list) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    experiment = exp.run_basic_test(
        conf.fixtures_path(fixture + "/const.yaml"),
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial["steps"]) >= 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial["steps"]) >= 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in metrics:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"
Example #14
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        value = base_value + batches_trained
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
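structure_to_metrics and structure_equal are fixture helpers that are not shown in the listing. A plausible sketch of their behavior, inferred only from how they are called here (an assumption, not the actual fixture code):

from typing import Any

def structure_to_metrics(value: float, structure: Any) -> Any:
    # Stamp the scalar value onto every leaf of the configured structure.
    if isinstance(structure, dict):
        return {k: structure_to_metrics(value, v) for k, v in structure.items()}
    if isinstance(structure, list):
        return [structure_to_metrics(value, v) for v in structure]
    return value

def structure_equal(expected: Any, actual: Any) -> bool:
    # Deep comparison over nested dicts and lists.
    if isinstance(expected, dict):
        return expected.keys() == actual.keys() and all(
            structure_equal(expected[k], actual[k]) for k in expected
        )
    if isinstance(expected, list):
        return len(expected) == len(actual) and all(
            structure_equal(e, a) for e, a in zip(expected, actual)
        )
    return expected == actual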
Example #15
def test_trial_logs() -> None:
    experiment_id = exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                       conf.fixtures_path("no_op"), 1)
    trial_id = exp.experiment_trials(experiment_id)[0]["id"]
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", str(trial_id)]
    )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--head", "10", str(trial_id)]
    )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--tail", "10", str(trial_id)]
    )
Example #16
def test_pytorch_load() -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)

    (Determined(conf.make_master_url()).get_experiment(
        experiment_id).top_checkpoint().load(map_location="cpu"))
Example #17
def test_experiment_delete() -> None:
    subprocess.check_call(["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "delete", str(experiment_id), "--yes"],
        env={**os.environ, "DET_ADMIN": "1"},
    )

    # "det experiment describe" call should fail, because the
    # experiment is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "describe", str(experiment_id)]
        )
Example #18
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        assert metrics["num_inputs"] == scheduling_unit

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + (step["id"] - 1) * scheduling_unit
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)

    # Check validation metrics.
    for step in trials[0]["steps"]:
        validation = step["validation"]
        metrics = validation["metrics"]
        actual = metrics["validation_metrics"]

        value = base_value + step["id"] * scheduling_unit
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
Example #19
def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)
Example #20
def test_custom_port() -> None:
    name = "port_test"
    master_host = "localhost"
    master_port = "12321"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    arguments = [
        "--cluster-name",
        name,
        "--master-port",
        f"{master_port}",
    ]
    cluster_up(arguments)
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    cluster_down(["--cluster-name", name])
Example #21
def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]
Example #22
def test_pytorch_load() -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)

    nn = (Determined(conf.make_master_url()).get_experiment(
        experiment_id).top_checkpoint().load(map_location=torch.device("cpu")))
    assert isinstance(nn, torch.nn.Module)
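Since the loaded checkpoint is a plain torch.nn.Module, it can be run directly; a tiny usage sketch continuing from nn above (the 1x1x28x28 input shape is an assumption about the MNIST model):

import torch

nn.eval()
with torch.no_grad():
    logits = nn(torch.zeros(1, 1, 28, 28))  # dummy MNIST-shaped batch
    prediction = int(logits.argmax(dim=1))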
Example #23
def test_nan_metrics() -> None:
    """
    Confirm that NaN and Infinity metrics are gathered from the trial.
    """
    exp_id = exp.run_basic_test(conf.fixtures_path("metric_maker/nans.yaml"),
                                conf.fixtures_path("metric_maker"), 1)
    trials = exp.experiment_trials(exp_id)
    config = conf.load_config(conf.fixtures_path("metric_maker/nans.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]

    # Infinity and NaN cannot be processed by the YAML->JSON deserializer,
    # so add them to the expected values here.
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    training_structure["inf"] = "Infinity"
    training_structure["nan"] = "NaN"
    training_structure["nanarray"] = ["NaN", "NaN"]
    validation_structure = config["hyperparameters"]["validation_structure"][
        "val"]
    validation_structure["neg_inf"] = "-Infinity"

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        actual = metrics["batch_metrics"]
        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(5)
        expected = [
            structure_to_metrics(value, training_structure)
            for value in batch_values
        ]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches
        expected = structure_to_metrics(base_value, validation_structure)
        assert structure_equal(expected, actual)
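Because NaN != NaN in Python, any structure comparison over this fixture has to special-case the placeholder values added above. A hedged sketch of a leaf-level comparison that would cope with them, whether the API returns the placeholders as strings or as real floats:

import math

def leaf_equal(expected: object, actual: object) -> bool:
    if expected == "NaN":
        return actual == "NaN" or (isinstance(actual, float) and math.isnan(actual))
    if expected in ("Infinity", "-Infinity"):
        return actual == expected or actual == float(expected)  # float("Infinity") == inf
    return expected == actual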
Example #24
def test_detr_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)
    config = conf.set_hparam(
        config, "config_file", "/mmdetection/configs/detr/detr_r50_8x2_150e_coco.py"
    )

    exp.run_basic_test_with_temp_config(config, example_path, 1)
Example #25
def test_agent_restart_exp_container_failure(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )
        # Get task id / allocation id
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]],
                       check=True,
                       stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()
        # As soon as the agent is back, the original allocation should be considered dead,
        # and a new allocation should be created in its place.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after[
            "allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
Example #26
def test_task_logs(task_type: str, task_config: Dict[str, Any],
                   log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth,
                        certs.cli_cert))
    assert rps.resourcePools and len(rps.resourcePools) > 0, "missing resource pool"

    if (rps.resourcePools[0].type
            == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
            and task_type == command.TaskTypeCommand):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs,
                   api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)
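check_logs is a shared helper that is not included in the listing; at its core it scans task or trial log lines for a regex match (the real helper also polls with a timeout and exercises the log-field APIs passed to it). A stripped-down sketch of just the matching step:

import re
from typing import Iterable

def logs_match(log_lines: Iterable[str], log_regex: re.Pattern) -> bool:
    # True as soon as any log line matches the expected pattern.
    return any(log_regex.match(line) for line in log_lines)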
Example #27
def test_mnist_estimator_const_parallel(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)
    exp.assert_performed_initial_validation(exp_id)
Example #28
def test_support_bundle() -> None:
    exp_id = exp.run_basic_test(
        config_file=conf.fixtures_path("no_op/single-one-short-step.yaml"),
        model_def_file=conf.fixtures_path("no_op"),
        expected_trials=1,
    )

    trial_id = exp.experiment_first_trial(exp_id)
    output_dir = f"e2etest_trial_{trial_id}"
    os.mkdir(output_dir)

    command = ["det", "trial", "support-bundle", str(trial_id), "-o", output_dir]

    completed_process = subprocess.run(
        command, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )
Example #29
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
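The 20-second loop above is a plain poll-until-condition pattern; as an illustration only (not part of the test utilities), it could be factored into a small helper:

import time
from typing import Callable

def wait_until(predicate: Callable[[], bool], timeout: float = 20, interval: float = 1) -> bool:
    # Poll the predicate until it holds or the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return predicate()

For example, wait_until(lambda: not exp.experiment_has_active_workload(experiment_id)) would stand in for the loop and the check.true call.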
Example #30
def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    experiment_id = exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                       conf.fixtures_path("no_op"), 1)
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")
    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs,
               api.trial_log_fields)
    # And so should new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs,
               api.task_log_fields)