def test_tensorboard_creation_and_listing(auth: Authentication) -> None:
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    with logged_in_user(creds1):
        # Create an experiment.
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds1):
        tensorboard_id1 = start_tensorboard(experiment_id1)

    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds2):
        tensorboard_id2 = start_tensorboard(experiment_id2)

    with logged_in_user(creds1):
        output = extract_columns(det_run(["tensorboard", "list"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) not in output

        output = extract_columns(det_run(["tensorboard", "list", "-a"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) in output

    kill_tensorboards(tensorboard_id1, tensorboard_id2)
    delete_experiments(experiment_id1, experiment_id2)
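# The helpers used above (det_run, extract_columns, start_tensorboard, kill_tensorboards,
# delete_experiments) are defined elsewhere in this test suite. As a rough, non-authoritative
# sketch of the intent behind extract_columns: pull selected whitespace-separated columns out
# of the CLI's tabular output. Header handling in the real helper may differ.
from typing import List, Tuple  # assumed to be available at the module top in the real file


def _extract_columns_sketch(output: str, columns: List[int]) -> List[Tuple[str, ...]]:
    rows = []
    for line in output.splitlines():
        fields = line.split()
        # Skip blank or short lines (e.g. separators) that cannot supply every requested column.
        if fields and len(fields) > max(columns):
            rows.append(tuple(fields[i] for i in columns))
    return rows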
def test_experiment_creation_and_listing(auth: Authentication) -> None:
    # Create 2 users.
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    # Create an experiment as the first user.
    with logged_in_user(creds1):
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    # Create another experiment, this time as the second user.
    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds1):
        # By default, the listing should only include the logged-in user's experiment.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) not in output

        # Now use the -a flag to list all experiments. The output should include both experiments.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list", "-a"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) in output

    # Clean up.
    delete_experiments(experiment_id1, experiment_id2)
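# det_run (used above) is another helper defined elsewhere in the suite. A hypothetical sketch,
# assuming it simply invokes the `det` CLI against the test master and returns its stdout as
# text; the real helper may add flags or error handling beyond this.
def _det_run_sketch(args: List[str]) -> str:
    return subprocess.check_output(
        ["det", "-m", conf.make_master_url(), *args], universal_newlines=True
    )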
def test_start_tensorboard_for_multi_experiment(tmp_path: Path, secrets: Dict[str, str]) -> None:
    """
    Start 3 random experiments configured with the s3 and shared_fs backends,
    start a TensorBoard instance pointed to the experiments and some select
    trials, and kill the TensorBoard instance.
    """
    with FileTree(
        tmp_path,
        {
            "shared_fs_config.yaml": shared_fs_config(1),
            "s3_config.yaml": s3_config(1, secrets),
            "multi_trial_config.yaml": shared_fs_config(3),
        },
    ) as tree:
        shared_conf_path = tree.joinpath("shared_fs_config.yaml")
        shared_fs_exp_id = exp.run_basic_test(
            str(shared_conf_path), conf.fixtures_path("no_op"), num_trials
        )

        s3_conf_path = tree.joinpath("s3_config.yaml")
        s3_exp_id = exp.run_basic_test(
            str(s3_conf_path), conf.fixtures_path("no_op"), num_trials
        )

        multi_trial_config = tree.joinpath("multi_trial_config.yaml")
        multi_trial_exp_id = exp.run_basic_test(
            str(multi_trial_config), conf.fixtures_path("no_op"), 3
        )

        trial_ids = [str(t["id"]) for t in exp.experiment_trials(multi_trial_exp_id)]

    command = [
        "tensorboard",
        "start",
        str(shared_fs_exp_id),
        str(s3_exp_id),
        "-t",
        *trial_ids,
        "--no-browser",
    ]

    with cmd.interactive_command(*command) as tensorboard:
        for line in tensorboard.stdout:
            if SERVICE_READY in line:
                break
        else:
            raise AssertionError(f"Did not find {SERVICE_READY} in output")
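# shared_fs_config and s3_config (used above and defined elsewhere in this module) build
# experiment config YAML strings for the no_op fixture with the corresponding
# checkpoint_storage backend. A minimal sketch of the shared_fs variant follows; the exact
# fields are an assumption for illustration, not a copy of the suite's helper.
def _shared_fs_config_sketch(num_trials: int) -> str:
    return f"""
description: no_op tensorboard test
checkpoint_storage:
  type: shared_fs
  host_path: /tmp
searcher:
  name: random
  metric: validation_error
  max_trials: {num_trials}
"""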
def test_mask_rcnn_64_slots() -> None:
    experiment_id = exp.run_basic_test(
        conf.experimental_path("FasterRCNN_tp/64-gpus.yaml"),
        conf.experimental_path("FasterRCNN_tp/"),
        1,
        max_wait_secs=5 * 60 * 60,
    )

    validation_metric_name = "mAP(bbox)/IoU=0.5:0.95"
    validation_metric = exp.get_validation_metric_from_last_step(
        experiment_id, 0, validation_metric_name
    )
    durations = exp.get_experiment_durations(experiment_id, 0)
    wait_for_agents_time = (
        durations.experiment_duration
        - durations.training_duration
        - durations.validation_duration
        - durations.checkpoint_duration
    )

    print(validation_metric_name, validation_metric)
    print(durations)
    print(f"wait for agents duration: {wait_for_agents_time}")

    assert validation_metric > 0.375
    assert durations.training_duration < datetime.timedelta(hours=2, minutes=45)
    assert durations.validation_duration < datetime.timedelta(hours=1, minutes=15)
def test_experiment_delete() -> None:
    subprocess.check_call(["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    subprocess.check_call(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "experiment",
            "delete",
            str(experiment_id),
            "--yes",
        ],
        env={**os.environ, "DET_ADMIN": "1"},
    )

    # The "det experiment describe" call should fail, because the experiment
    # is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "describe", str(experiment_id)]
        )
def test_non_root_experiment(auth: Authentication, tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")

    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config_content = f.read()

        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        # Call `det --version` in a startup hook to ensure that det is on the PATH.
        with FileTree(
            tmp_path,
            {
                "startup-hook.sh": "det --version || exit 77",
                "const.yaml": config_content,
                "model_def.py": model_def_content,
            },
        ) as tree:
            exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree), None)
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    batches_per_step = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].id)
    for step in full_trial_metrics.steps:
        metrics = step.metrics
        assert metrics["num_inputs"] == batches_per_step

        actual = metrics["batch_metrics"]
        assert len(actual) == batches_per_step

        first_base_value = base_value + (step.id - 1) * batches_per_step
        batch_values = first_base_value + gain_per_batch * np.arange(batches_per_step)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)

    # Check validation metrics.
    for step in trials[0].steps:
        validation = step.validation
        metrics = validation.metrics
        actual = metrics["validation_metrics"]

        value = base_value + step.id * batches_per_step
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.steps)
        last_step = trial.steps[-1]
        accuracy = last_step.validation.metrics["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    exp_ref = Determined().get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation.metrics["validation_metrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that changing smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]
def test_start_tensorboard_for_shared_fs_experiment(tmp_path: Path) -> None:
    """
    Start a random experiment configured with the shared_fs backend, start a
    TensorBoard instance pointed to the experiment, and kill the TensorBoard
    instance.
    """
    with FileTree(tmp_path, {"config.yaml": shared_fs_config(1)}) as tree:
        config_path = tree.joinpath("config.yaml")
        experiment_id = exp.run_basic_test(
            str(config_path), conf.fixtures_path("no_op"), num_trials
        )

    command = ["tensorboard", "start", str(experiment_id), "--no-browser"]

    with cmd.interactive_command(*command) as tensorboard:
        for line in tensorboard.stdout:
            if SERVICE_READY in line:
                break
        else:
            raise AssertionError(f"Did not find {SERVICE_READY} in output")
def test_startup_hook() -> None:
    exp.run_basic_test(
        conf.fixtures_path("no_op/startup-hook.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
def test_noop_long_train_step() -> None:
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-long-train-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
def test_noop_single_warm_start() -> None:
    experiment_id1 = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 30
    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]
    last_step = first_trial["steps"][29]
    last_checkpoint_id = last_step["checkpoint"]["id"]
    assert last_step["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 30)

    config_base = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Test source_trial_id.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1

    second_trial = trials[0]
    assert len(second_trial["steps"]) == 30

    # The second trial should have a warm start checkpoint ID.
    assert second_trial["warm_start_checkpoint_id"] == last_checkpoint_id

    assert second_trial["steps"][29]["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 60)

    # Now test source_checkpoint_uuid.
    config_obj = copy.deepcopy(config_base)
    # Add a source checkpoint UUID to warm start from.
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)

        experiment_id3 = exp.run_basic_test(tf.name, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id3)
    assert len(trials) == 1

    third_trial = trials[0]
    assert len(third_trial["steps"]) == 30

    assert third_trial["warm_start_checkpoint_id"] == first_checkpoint_id
    assert third_trial["steps"][1]["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 3)