def test_labels() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )

    label = "__det_test_dummy_label__"

    # Add a label and check that it shows up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "add", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label in output

    # Remove the label and check that it doesn't show up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "remove", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label not in output
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
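# The scheduling and pausing checks in test_noop_pause (and the cancel tests below) all
# use a "poll once per second until a condition holds or a timeout expires" pattern. The
# helper below is a minimal sketch of how that pattern could be factored out; the name
# `_wait_for` and its signature are illustrative and not part of the existing test
# utilities.
def _wait_for(condition, timeout_secs: int) -> bool:
    """Poll `condition` once per second; return True as soon as it holds, else False."""
    for _ in range(timeout_secs):
        if condition():
            return True
        time.sleep(1)
    return False


# Example usage, mirroring the first wait in test_noop_pause:
#     scheduled = _wait_for(
#         lambda: exp.experiment_has_active_workload(experiment_id),
#         conf.MAX_TASK_SCHEDULED_SECS,
#     )
#     check.true(scheduled, "trial was not scheduled in time")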
def test_cancel_one_paused_experiment() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-many-long-steps.yaml"),
        conf.fixtures_path("no_op"),
        ["--paused"],
    )
    exp.cancel_single(experiment_id)
def test_cancel_ten_experiments() -> None:
    experiment_ids = [
        exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
        )
        for _ in range(10)
    ]
    for experiment_id in experiment_ids:
        exp.cancel_single(experiment_id)
def test_cancel_one_active_experiment() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-many-long-steps.yaml"),
        conf.fixtures_path("no_op"),
    )

    for _ in range(15):
        if exp.experiment_has_active_workload(experiment_id):
            break
        time.sleep(1)
    else:
        raise AssertionError("no workload active after 15 seconds")

    exp.cancel_single(experiment_id, should_have_trial=True)
def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]
def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"), None)

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    for _ in range(5):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)
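# Both test_noop_pause_of_experiment_without_trials above and run_gc_checkpoints_test
# below override a fixture config in memory and submit the modified copy through a
# temporary YAML file. The helper below is a minimal sketch of that pattern; the name
# `_create_experiment_from_config` is illustrative, not an existing utility, and it
# assumes `Any` is available from `typing` and that exp.create_experiment returns the
# experiment ID.
def _create_experiment_from_config(config_obj: Dict[str, Any], model_dir: str) -> int:
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        return exp.create_experiment(tf.name, model_dir, None)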
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {8, 9, 10}, "DELETED": {1, 2, 3, 4, 5, 6, 7}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {1, 2, 3, 9, 10}, "DELETED": {4, 5, 6, 7, 8}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)
            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("step_id"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"], set()).add(checkpoint["step_id"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs" and (
                    "storage_path" not in checkpoint_config
                ):
                    if "tensorboard_path" in checkpoint_config:
                        checkpoint_config["storage_path"] = checkpoint_config.get(
                            "tensorboard_path", None
                        )
                    else:
                        checkpoint_config["storage_path"] = checkpoint_config.get(
                            "checkpoint_path", None
                        )

                    root = os.path.join(
                        checkpoint_config["host_path"], checkpoint_config["storage_path"]
                    )

                    for checkpoint in checkpoints:
                        dirname = os.path.join(root, checkpoint["uuid"])
                        if checkpoint["state"] == "COMPLETED":
                            assert os.path.isdir(dirname)
                        elif checkpoint["state"] == "DELETED":
                            assert not os.path.exists(dirname)
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
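# run_gc_checkpoints_test is parameterized by a checkpoint_storage override that gets
# merged into each fixture's "checkpoint_storage" section before the experiment is
# submitted. The test below is a minimal sketch of a caller using shared_fs storage;
# the test name and host_path are hypothetical and are not part of the existing suite.
def test_gc_checkpoints_shared_fs_example() -> None:
    run_gc_checkpoints_test(
        {
            "type": "shared_fs",
            "host_path": "/tmp/determined-checkpoints",  # hypothetical host directory
        }
    )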