def test_mnist_estimator_const(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0].steps
    assert len(steps) == 1

    step = steps[0]
    assert "validation" in step

    v_metrics = step.validation.metrics["validation_metrics"]

    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert v_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].id)
    for step in full_trial_metrics.steps:
        metrics = step.metrics

        batch_metrics = metrics["batch_metrics"]
        assert len(batch_metrics) == 100

        for batch_metric in batch_metrics:
            assert batch_metric["loss"] > 0


def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    d = Determined(conf.make_master_url())

    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    assert mnist.metadata == {"testing": "override"}

    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint.uuid)
    assert model_version.version == 1
    assert mnist.get_version().uuid == checkpoint.uuid

    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in models] == ["mnist", "object-detection", "transformer"]


def test_pytorch_11_const(aggregation_frequency: int) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(config, aggregation_frequency)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_pytorch"), 1
    )


def test_pytorch_11_const(aggregation_frequency: int, using_k8s: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(config, aggregation_frequency)

    if using_k8s:
        pod_spec = {
            "metadata": {"labels": {"ci": "testing"}},
            "spec": {
                "containers": [
                    {
                        "name": "determined-container",
                        "volumeMounts": [{"name": "temp1", "mountPath": "/random"}],
                    }
                ],
                "volumes": [{"name": "temp1", "emptyDir": {}}],
            },
        }
        config = conf.set_pod_spec(config, pod_spec)

    exp.run_basic_test_with_temp_config(config, conf.tutorials_path("mnist_pytorch"), 1)


def test_maskrcnn_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)


def test_streaming_metrics_api() -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=7)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )
    # To fully test the streaming APIs, the requests need to start running immediately after
    # the experiment, and then stay open until the experiment is complete. To accomplish this
    # with all of the API calls on a single experiment, we spawn them all in threads.
    # The HP importance portion of this test is commented out until the feature is enabled
    # by default.
    metric_names_thread = pool.apply_async(request_metric_names, (experiment_id,))
    train_metric_batches_thread = pool.apply_async(request_train_metric_batches, (experiment_id,))
    valid_metric_batches_thread = pool.apply_async(request_valid_metric_batches, (experiment_id,))
    train_trials_snapshot_thread = pool.apply_async(
        request_train_trials_snapshot, (experiment_id,)
    )
    valid_trials_snapshot_thread = pool.apply_async(
        request_valid_trials_snapshot, (experiment_id,)
    )
    train_trials_sample_thread = pool.apply_async(request_train_trials_sample, (experiment_id,))
    valid_trials_sample_thread = pool.apply_async(request_valid_trials_sample, (experiment_id,))

    metric_names_results = metric_names_thread.get()
    train_metric_batches_results = train_metric_batches_thread.get()
    valid_metric_batches_results = valid_metric_batches_thread.get()
    train_trials_snapshot_results = train_trials_snapshot_thread.get()
    valid_trials_snapshot_results = valid_trials_snapshot_thread.get()
    train_trials_sample_results = train_trials_sample_thread.get()
    valid_trials_sample_results = valid_trials_sample_thread.get()

    if metric_names_results is not None:
        pytest.fail("metric-names: %s. Results: %s" % metric_names_results)
    if train_metric_batches_results is not None:
        pytest.fail("metric-batches (training): %s. Results: %s" % train_metric_batches_results)
    if valid_metric_batches_results is not None:
        pytest.fail("metric-batches (validation): %s. Results: %s" % valid_metric_batches_results)
    if train_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (training): %s. Results: %s" % train_trials_snapshot_results)
    if valid_trials_snapshot_results is not None:
        pytest.fail(
            "trials-snapshot (validation): %s. Results: %s" % valid_trials_snapshot_results
        )
    if train_trials_sample_results is not None:
        pytest.fail("trials-sample (training): %s. Results: %s" % train_trials_sample_results)
    if valid_trials_sample_results is not None:
        pytest.fail("trials-sample (validation): %s. Results: %s" % valid_trials_sample_results)


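# The request_* helpers above are defined elsewhere in this module. A minimal sketch of the
# contract they are assumed to follow: poll one streaming endpoint for the lifetime of the
# experiment and return None on success, or an (error_message, results) tuple, which is why
# each pytest.fail() call above formats its result with two %s placeholders. The helper names
# below (stream_endpoint_events, event_is_valid) are hypothetical, not a real API.
def request_stream_sketch(experiment_id: int) -> "Optional[Tuple[str, list]]":
    results: list = []
    for event in stream_endpoint_events(experiment_id):  # hypothetical streaming call
        results.append(event)
        if not event_is_valid(event):  # hypothetical per-event validation
            return ("unexpected event", results)
    return None  # the stream closed cleanly and every event was valid

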
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that the validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that the ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validation_metrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that flipping smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    assert checkpoint.metadata == {"testing": "metadata"}

    checkpoint.add_metadata({"some_key": "some_value"})
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}

    checkpoint.add_metadata({"testing": "override"})
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}

    checkpoint.remove_metadata(["some_key"])
    assert checkpoint.metadata == {"testing": "override"}


def test_start_shell_with_template() -> None:
    template_name = "test_start_shell_with_template"
    tpl.set_template(template_name, conf.fixtures_path("templates/template.yaml"))

    with cmd.interactive_command("shell", "start", "--template", template_name, "--detach"):
        pass


def test_start_command_with_template() -> None:
    template_name = "test_start_command_with_template"
    tpl.set_template(template_name, conf.fixtures_path("templates/template.yaml"))

    with cmd.interactive_command(
        "command", "run", "--template", template_name, "--detach", "sleep infinity"
    ):
        pass


def test_custom_reducer_distributed(secrets: Dict[str, str], tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/distributed.yaml"))
    # Run with multiple steps to verify that we are resetting reducers correctly.
    config = conf.set_max_length(config, {"batches": 2})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )

    trial = exp.experiment_trials(experiment_id)[0]
    last_validation = trial["steps"][-1]["validation"]
    metrics = last_validation["metrics"]["validation_metrics"]

    label_sum = 2 * sum(range(16))
    assert metrics["label_sum_fn"] == label_sum
    assert metrics["label_sum_cls"] == label_sum


def test_mnist_estimator_adaptive_with_data_layer() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config)
    config = conf.set_shared_fs_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("data_layer_mnist_estimator"), None
    )


def test_mnist_estimator_adaptive(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), None
    )


def _test_rng_restore(fixture: str, metrics: list) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    experiment = exp.run_basic_test(
        conf.fixtures_path(fixture + "/const.yaml"),
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial["steps"]) >= 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial["steps"]) >= 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in metrics:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"


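# A hedged sketch of the kind of fixture _test_rng_restore expects: each validation emits
# raw draws from every RNG the framework touches, so identical metrics after the warm start
# prove the RNG state was checkpointed and restored. The metric names and the choice of
# PyTorch as the framework are illustrative assumptions, not the actual fixtures.
def rng_fixture_validation_metrics_sketch() -> "Dict[str, float]":
    import random

    import numpy as np
    import torch

    return {
        "stdlib_rand": random.random(),  # Python stdlib RNG
        "numpy_rand": float(np.random.rand()),  # NumPy global RNG
        "torch_rand": float(torch.rand(1)),  # framework RNG
    }

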
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        value = base_value + batches_trained
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)


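# structure_to_metrics and structure_equal are defined elsewhere in this module. A hedged
# sketch of what structure_to_metrics is assumed to do: substitute one scalar into every
# leaf of the nested "structure" read from the fixture's hyperparameters, so the expected
# metric tree can be rebuilt for any batch value. An illustrative assumption, not the real
# implementation.
def structure_to_metrics_sketch(value: float, structure: "Any") -> "Any":
    if isinstance(structure, dict):
        return {k: structure_to_metrics_sketch(value, v) for k, v in structure.items()}
    if isinstance(structure, list):
        return [structure_to_metrics_sketch(value, v) for v in structure]
    return value  # every leaf takes the same scalar value

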
def test_trial_logs() -> None:
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial_id = exp.experiment_trials(experiment_id)[0]["id"]

    subprocess.check_call(["det", "-m", conf.make_master_url(), "trial", "logs", str(trial_id)])
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--head", "10", str(trial_id)]
    )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--tail", "10", str(trial_id)]
    )


def test_pytorch_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1
    )

    (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location="cpu")
    )


def test_experiment_delete() -> None:
    subprocess.check_call(["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "delete", str(experiment_id), "--yes"],
        env={**os.environ, "DET_ADMIN": "1"},
    )

    # The "det experiment describe" call should fail, because the
    # experiment is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "describe", str(experiment_id)]
        )


def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        assert metrics["num_inputs"] == scheduling_unit

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + (step["id"] - 1) * scheduling_unit
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)

    # Check validation metrics.
    for step in trials[0]["steps"]:
        validation = step["validation"]
        metrics = validation["metrics"]
        actual = metrics["validation_metrics"]

        value = base_value + step["id"] * scheduling_unit
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)


def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)


def test_custom_port() -> None:
    name = "port_test"
    master_host = "localhost"
    master_port = "12321"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    arguments = [
        "--cluster-name",
        name,
        "--master-port",
        f"{master_port}",
    ]
    cluster_up(arguments)
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    cluster_down(["--cluster-name", name])


def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]


def test_pytorch_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1
    )

    nn = (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)


def test_nan_metrics() -> None:
    """
    Confirm that NaN and Infinity metrics are gathered from the trial.
    """
    exp_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/nans.yaml"), conf.fixtures_path("metric_maker"), 1
    )
    trials = exp.experiment_trials(exp_id)
    config = conf.load_config(conf.fixtures_path("metric_maker/nans.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]

    # Infinity and NaN cannot be processed in the YAML -> JSON deserializer.
    # Add them to the expected values here.
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    training_structure["inf"] = "Infinity"
    training_structure["nan"] = "NaN"
    training_structure["nanarray"] = ["NaN", "NaN"]

    validation_structure = config["hyperparameters"]["validation_structure"]["val"]
    validation_structure["neg_inf"] = "-Infinity"

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        actual = metrics["batch_metrics"]

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(5)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        expected = structure_to_metrics(base_value, validation_structure)
        assert structure_equal(expected, actual)


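# Because NaN != NaN under IEEE 754, a naive == comparison would reject the metrics that
# test_nan_metrics checks. A hedged sketch of the NaN-aware leaf comparison that
# structure_equal is assumed to perform (illustrative only, not the real helper):
def nan_aware_leaf_equal_sketch(expected: "Any", actual: "Any") -> bool:
    import math

    if isinstance(expected, float) and math.isnan(expected):
        # Two NaNs count as equal for metric-comparison purposes.
        return isinstance(actual, float) and math.isnan(actual)
    return expected == actual

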
def test_detr_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)
    config = conf.set_hparam(
        config, "config_file", "/mmdetection/configs/detr/detr_r50_8x2_150e_coco.py"
    )

    exp.run_basic_test_with_temp_config(config, example_path, 1)


def test_agent_restart_exp_container_failure(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )

        # Get the task id / allocation id.
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]], check=True, stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()
        # As soon as the agent is back, the original allocation should be considered dead,
        # but a new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after["allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)


def test_task_logs(task_type: str, task_config: Dict[str, Any], log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth, certs.cli_cert)
    )
    assert rps.resourcePools and len(rps.resourcePools) > 0, "missing resource pool"

    if (
        rps.resourcePools[0].type == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
        and task_type == command.TaskTypeCommand
    ):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)


def test_mnist_estimator_const_parallel(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1
    )
    exp.assert_performed_initial_validation(exp_id)


def test_support_bundle() -> None:
    exp_id = exp.run_basic_test(
        config_file=conf.fixtures_path("no_op/single-one-short-step.yaml"),
        model_def_file=conf.fixtures_path("no_op"),
        expected_trials=1,
    )

    trial_id = exp.experiment_first_trial(exp_id)
    output_dir = f"e2etest_trial_{trial_id}"
    os.mkdir(output_dir)

    command = ["det", "trial", "support-bundle", str(trial_id), "-o", output_dir]

    completed_process = subprocess.run(
        command, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )


def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment could not be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )


def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")
    # Trial-specific log APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs, api.trial_log_fields)
    # And so should the new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)


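# check_logs is defined elsewhere in this module. A hedged sketch of the contract it is
# assumed to satisfy at both call sites above: pull log entries through the provided logs
# function and fail unless some entry matches the regex. The signature and the dict shape
# of a log entry below are illustrative assumptions, not the real helper.
def check_logs_sketch(master_url, entity_id, regex, logs_fn, fields_fn) -> None:
    entries = list(logs_fn(master_url, entity_id))  # e.g. api.trial_logs or api.task_logs
    assert any(regex.match(e["message"]) for e in entries), "no log line matched the regex"
    assert list(fields_fn(master_url, entity_id)), "no log fields returned"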