def run_dataset_experiment(
    searcher_max_steps: int,
    batches_per_step: int,
    secrets: Dict[str, str],
    tf2: bool,
    slots_per_trial: int = 1,
    source_trial_id: Optional[str] = None,
) -> List[gql.trials]:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/const.yaml"))
    config.setdefault("searcher", {})
    config["searcher"]["max_steps"] = searcher_max_steps
    config["batches_per_step"] = batches_per_step
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    if source_trial_id is not None:
        config["searcher"]["source_trial_id"] = source_trial_id

    config.setdefault("resources", {})
    config["resources"]["slots_per_trial"] = slots_per_trial

    if cluster.num_agents() > 1:
        config["checkpoint_storage"] = exp.s3_checkpoint_config(secrets)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )
    return exp.experiment_trials(experiment_id)


def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")


def test_create_test_mode() -> None:
    # test-mode should succeed with a valid experiment.
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        "--test-mode",
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
    ]
    output = subprocess.check_output(command, universal_newlines=True)
    assert "Model definition test succeeded" in output

    # test-mode should fail when an error is introduced into the trial
    # implementation.
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        "--test-mode",
        conf.fixtures_path("trial_error/const.yaml"),
        conf.fixtures_path("trial_error"),
    ]
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(command)


def test_experiment_delete() -> None:
    subprocess.check_call(["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    subprocess.check_call(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "experiment",
            "delete",
            str(experiment_id),
            "--yes",
        ],
        env={**os.environ, "DET_ADMIN": "1"},
    )

    # The "det experiment describe" call should fail, because the
    # experiment is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "describe", str(experiment_id)]
        )


def test_experiment_creation_and_listing(auth: Authentication) -> None:
    # Create 2 users.
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    # Create an experiment as the first user.
    with logged_in_user(creds1):
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    # Create another experiment, this time as the second user.
    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds1):
        # The default listing should only show the logged-in user's experiments.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) not in output

        # Now use the -a flag to list all experiments. The output should include both experiments.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list", "-a"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) in output

    # Clean up.
    delete_experiments(experiment_id1, experiment_id2)


def test_tensorboard_creation_and_listing(auth: Authentication) -> None:
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    with logged_in_user(creds1):
        # Create an experiment.
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds1):
        tensorboard_id1 = start_tensorboard(experiment_id1)

    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds2):
        tensorboard_id2 = start_tensorboard(experiment_id2)

    with logged_in_user(creds1):
        output = extract_columns(det_run(["tensorboard", "list"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) not in output

        output = extract_columns(det_run(["tensorboard", "list", "-a"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) in output

    kill_tensorboards(tensorboard_id1, tensorboard_id2)
    delete_experiments(experiment_id1, experiment_id2)


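# The two listing tests above lean on a few CLI helpers (`det_run`,
# `extract_id_and_owner_from_exp_list`, `extract_columns`) that live in the shared
# test utilities rather than in this section. The sketches below are illustrative
# assumptions about how such helpers could be written, not the canonical
# implementations; names carry a `_sketch` suffix to make that explicit.
def det_run_sketch(args: List[str]) -> str:
    # Invoke a `det` CLI subcommand against the test master and return its stdout.
    return subprocess.check_output(
        ["det", "-m", conf.make_master_url(), *args], universal_newlines=True
    )


def extract_columns_sketch(output: str, columns: List[int]) -> List[tuple]:
    # Split each data row of a tabular CLI listing and keep only the requested
    # columns. This assumes two header lines precede the data rows.
    rows = []
    for line in output.splitlines()[2:]:
        fields = line.split()
        if fields:
            rows.append(tuple(fields[i] for i in columns))
    return rows


def extract_id_and_owner_from_exp_list_sketch(output: str) -> List[tuple]:
    # For `det e list`, the experiment ID is assumed to be the first column and
    # the owner's username the second.
    return [(int(exp_id), owner) for exp_id, owner in extract_columns_sketch(output, [0, 1])]

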
def test_labels() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )

    label = "__det_test_dummy_label__"

    # Add a label and check that it shows up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "add", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label in output

    # Remove the label and check that it doesn't show up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "remove", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label not in output


def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.id

    assert len(first_trial.steps) == 1
    first_checkpoint_id = first_trial.steps[0].checkpoint.id

    config_obj = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(config_obj)
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0].warm_start_checkpoint_id == first_checkpoint_id


def test_cancel_one_paused_experiment() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-many-long-steps.yaml"),
        conf.fixtures_path("no_op"),
        ["--paused"],
    )
    exp.cancel_single(experiment_id)


def test_cancel_ten_experiments() -> None:
    experiment_ids = [
        exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
        )
        for _ in range(10)
    ]
    for experiment_id in experiment_ids:
        exp.cancel_single(experiment_id)


def test_log_null_bytes() -> None:
    config_obj = conf.load_config(conf.fixtures_path("no_op/single.yaml"))
    config_obj["hyperparameters"]["write_null"] = True
    config_obj["max_restarts"] = 0
    config_obj["searcher"]["max_steps"] = 1
    experiment_id = exp.run_basic_test_with_temp_config(
        config_obj, conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    logs = exp.trial_logs(trials[0].id)
    assert len(logs) > 0


def test_cancel_one_active_experiment() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-many-long-steps.yaml"),
        conf.fixtures_path("no_op"),
    )

    for _ in range(15):
        if exp.experiment_has_active_workload(experiment_id):
            break
        time.sleep(1)
    else:
        raise AssertionError("no workload active after 15 seconds")

    exp.cancel_single(experiment_id, should_have_trial=True)


def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]


def test_start_tensorboard_for_multi_experiment(tmp_path: Path, secrets: Dict[str, str]) -> None:
    """
    Start 3 random experiments configured with the s3 and shared_fs backends,
    start a TensorBoard instance pointed to the experiments and some select
    trials, and kill the TensorBoard instance.
    """
    with FileTree(
        tmp_path,
        {
            "shared_fs_config.yaml": shared_fs_config(1),
            "s3_config.yaml": s3_config(1, secrets),
            "multi_trial_config.yaml": shared_fs_config(3),
        },
    ) as tree:
        shared_conf_path = tree.joinpath("shared_fs_config.yaml")
        shared_fs_exp_id = exp.run_basic_test(
            str(shared_conf_path), conf.fixtures_path("no_op"), num_trials
        )

        s3_conf_path = tree.joinpath("s3_config.yaml")
        s3_exp_id = exp.run_basic_test(
            str(s3_conf_path), conf.fixtures_path("no_op"), num_trials
        )

        multi_trial_config = tree.joinpath("multi_trial_config.yaml")
        multi_trial_exp_id = exp.run_basic_test(
            str(multi_trial_config), conf.fixtures_path("no_op"), 3
        )

        trial_ids = [str(t["id"]) for t in exp.experiment_trials(multi_trial_exp_id)]

        command = [
            "tensorboard",
            "start",
            str(shared_fs_exp_id),
            str(s3_exp_id),
            "-t",
            *trial_ids,
            "--no-browser",
        ]

        with cmd.interactive_command(*command) as tensorboard:
            for line in tensorboard.stdout:
                if SERVICE_READY in line:
                    break
            else:
                raise AssertionError(f"Did not find {SERVICE_READY} in output")


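# The TensorBoard tests in this section reference `shared_fs_config`, `s3_config`,
# `num_trials`, and SERVICE_READY, all defined alongside the real test module. The
# config helpers render no_op experiment configs with a particular checkpoint
# storage backend. A rough sketch of the shared_fs variant is shown below; the
# field values are assumptions for illustration, not the canonical fixture.
def shared_fs_config_sketch(num_trials: int) -> str:
    # Render a minimal no_op experiment config that checkpoints to a shared
    # filesystem and runs `num_trials` random-search trials.
    return f"""
description: noop_random
checkpoint_storage:
  type: shared_fs
  host_path: /tmp
searcher:
  name: random
  metric: validation_error
  smaller_is_better: true
  max_trials: {num_trials}
  max_steps: 1
entrypoint: model_def:NoOpTrial
"""

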
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    batches_per_step = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].id)
    for step in full_trial_metrics.steps:
        metrics = step.metrics
        assert metrics["num_inputs"] == batches_per_step

        actual = metrics["batch_metrics"]
        assert len(actual) == batches_per_step

        first_base_value = base_value + (step.id - 1) * batches_per_step
        batch_values = first_base_value + gain_per_batch * np.arange(batches_per_step)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)

    # Check validation metrics.
    for step in trials[0].steps:
        validation = step.validation
        metrics = validation.metrics
        actual = metrics["validation_metrics"]

        value = base_value + step.id * batches_per_step
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)


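# `structure_to_metrics` and `structure_equal` above come from the metric_maker
# fixture's helpers. The sketches below assume the fixture emits metrics whose
# nested dict/list layout mirrors the configured structure with every leaf set to
# the current scalar value; the real helpers may differ in detail, and `Any` is
# assumed to be imported from `typing`.
def structure_to_metrics_sketch(value: float, structure: Any) -> Any:
    # Rebuild the nested structure with every leaf replaced by `value`.
    if isinstance(structure, dict):
        return {k: structure_to_metrics_sketch(value, v) for k, v in structure.items()}
    if isinstance(structure, list):
        return [structure_to_metrics_sketch(value, v) for v in structure]
    return value


def structure_equal_sketch(expected: Any, actual: Any) -> bool:
    # Compare nested structures leaf by leaf, tolerating float rounding.
    if isinstance(expected, dict):
        return expected.keys() == actual.keys() and all(
            structure_equal_sketch(expected[k], actual[k]) for k in expected
        )
    if isinstance(expected, list):
        return len(expected) == len(actual) and all(
            structure_equal_sketch(e, a) for e, a in zip(expected, actual)
        )
    return bool(np.isclose(expected, actual))

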
def test_pytorch_11_const(aggregation_frequency: int) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(config, aggregation_frequency)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1
    )


def test_mnist_estimator_const(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0].steps
    assert len(steps) == 1

    step = steps[0]
    assert "validation" in step

    v_metrics = step.validation.metrics["validation_metrics"]

    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert v_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].id)
    for step in full_trial_metrics.steps:
        metrics = step.metrics
        batch_metrics = metrics["batch_metrics"]
        assert len(batch_metrics) == 100

        for batch_metric in batch_metrics:
            assert batch_metric["loss"] > 0


def test_mnist_estimator_adaptive(tf2: bool) -> None:
    # Only test tf1 here, because a tf2 test would add no extra coverage.
    config = conf.load_config(conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), None
    )


def test_start_shell_with_template() -> None:
    template_name = "test_start_shell_with_template"
    tpl.set_template(template_name, conf.fixtures_path("templates/template.yaml"))

    with cmd.interactive_command("shell", "start", "--template", template_name, "--detach"):
        pass


def test_start_command_with_template() -> None:
    template_name = "test_start_command_with_template"
    tpl.set_template(template_name, conf.fixtures_path("templates/template.yaml"))

    with cmd.interactive_command(
        "command", "run", "--template", template_name, "--detach", "sleep infinity"
    ):
        pass


def test_mnist_estimator_adaptive_with_data_layer() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config)
    config = conf.set_shared_fs_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("data_layer_mnist_estimator"), None
    )


def test_pytorch_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1
    )

    nn = Determined().get_experiment(experiment_id).top_checkpoint().load()
    assert isinstance(nn, torch.nn.Module)


def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined().get_trial(trials[0].id).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)


def test_non_root_experiment(auth: Authentication, tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")

    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config_content = f.read()

        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        # Call `det --version` in a startup hook to ensure that det is on the PATH.
        with FileTree(
            tmp_path,
            {
                "startup-hook.sh": "det --version || exit 77",
                "const.yaml": config_content,
                "model_def.py": model_def_content,
            },
        ) as tree:
            exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree), None)


def test_mnist_estimator_const_parallel(native_parallel: bool, tf2: bool) -> None:
    if tf2 and native_parallel:
        pytest.skip("TF2 native parallel training is not currently supported.")

    config = conf.load_config(conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, native_parallel)
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )


def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.steps)
        last_step = trial.steps[-1]
        accuracy = last_step.validation.metrics["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    exp_ref = Determined().get_experiment(exp_id)
    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation.metrics["validation_metrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that flipping smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]


def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}

    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"), None)

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    for _ in range(5):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)


def test_mnist_tf1_15() -> None:
    pytest.skip("Ignore until we have official support for tf1.15.")

    config = conf.load_config(conf.fixtures_path("mnist_tf/const.yaml"))
    # TODO(brian + yoni): don't hardcode the TF 1.15 image when we build a TF 1.15
    # golden image.
    config.setdefault("environment", {})
    config["environment"]["image"] = (
        "573932760021.dkr.ecr.us-west-2.amazonaws.com"
        "/determinedai/task-environment:"
        "c8750377f18ff0a738229adcf16a50685ef41631779616cdc86c0655fc554704"
    )

    # This particular configuration takes a long time to build, so wait longer than normal.
    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tf"), None, max_wait_secs=3000
    )


def test_start_tensorboard_for_shared_fs_experiment(tmp_path: Path) -> None:
    """
    Start a random experiment configured with the shared_fs backend, start a
    TensorBoard instance pointed to the experiment, and kill the TensorBoard
    instance.
    """
    with FileTree(tmp_path, {"config.yaml": shared_fs_config(1)}) as tree:
        config_path = tree.joinpath("config.yaml")
        experiment_id = exp.run_basic_test(
            str(config_path), conf.fixtures_path("no_op"), num_trials
        )

        command = ["tensorboard", "start", str(experiment_id), "--no-browser"]

        with cmd.interactive_command(*command) as tensorboard:
            for line in tensorboard.stdout:
                if SERVICE_READY in line:
                    break
            else:
                raise AssertionError(f"Did not find {SERVICE_READY} in output")


def test_invalid_experiment() -> None:
    completed_process = exp.maybe_create_experiment(
        conf.fixtures_path("invalid_experiment/const.yaml"),
        conf.official_examples_path("mnist_tf"),
    )
    assert completed_process.returncode != 0