def test_job_queue_adjust_weight() -> None:
    config = conf.tutorials_path("mnist_pytorch/const.yaml")
    model = conf.tutorials_path("mnist_pytorch")
    for _ in range(2):
        exp.create_experiment(config, model)

    jobs = JobInfo()
    ok = jobs.refresh_until_populated()
    assert ok

    ordered_ids = jobs.get_ids()
    subprocess.run(["det", "job", "update", ordered_ids[0], "--weight", "10"])
    sleep(2)
    jobs.refresh()
    new_weight = jobs.get_job_weight(ordered_ids[0])
    assert new_weight == "10"

    subprocess.run(["det", "job", "update-batch", f"{ordered_ids[1]}.weight=10"])
    sleep(2)
    jobs.refresh()
    new_weight = jobs.get_job_weight(ordered_ids[1])
    assert new_weight == "10"
def test_drain_agent_sched() -> None:
    """
    Start an experiment and drain the agent it is running on. Start a second experiment and
    make sure it schedules on the second agent *before* the first experiment has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining"
            )

        exp.wait_for_experiment_state(exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7
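# Several tests in this file rely on the agent/slot helpers `_wait_for_slots`, `_fetch_slots`,
# and `_disable_agent`, which are defined elsewhere in the test suite. A minimal sketch of what
# they could look like follows, assuming the `det slot list --json` and
# `det agent disable/enable [--drain]` CLI commands and the same `conf` module used above; the
# actual helpers in the repository may differ.
import contextlib
import json
import subprocess
import time
from typing import Any, Dict, Iterator, List


def _fetch_slots() -> List[Dict[str, Any]]:
    # Ask the master for the current slot list as JSON.
    command = ["det", "-m", conf.make_master_url(), "slot", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return json.loads(output)


def _wait_for_slots(min_slots: int, max_ticks: int = 60) -> List[Dict[str, Any]]:
    # Poll until at least `min_slots` slots are registered with the master.
    for _ in range(max_ticks):
        slots = _fetch_slots()
        if len(slots) >= min_slots:
            return slots
        time.sleep(1)
    raise TimeoutError(f"expected at least {min_slots} slots within {max_ticks} seconds")


@contextlib.contextmanager
def _disable_agent(agent_id: str, drain: bool = False) -> Iterator[None]:
    # Disable the agent (optionally draining it) for the duration of the block,
    # then re-enable it on exit.
    command = ["det", "-m", conf.make_master_url(), "agent", "disable"]
    if drain:
        command.append("--drain")
    command.append(agent_id)
    subprocess.check_call(command)
    try:
        yield
    finally:
        subprocess.check_call(["det", "-m", conf.make_master_url(), "agent", "enable", agent_id])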
def test_master_restart_reattach_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(
            exp_id, EXP_STATE.STATE_COMPLETED, max_wait_secs=downtime + 60
        )
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise
def test_master_restart_kill_works(managed_cluster_restarts: ManagedCluster) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
            ["--config", "searcher.max_length.batches=10000", "--config", "max_restarts=0"],
        )

        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_master()
        time.sleep(0)
        managed_cluster_restarts.restart_master()

        command = ["det", "-m", conf.make_master_url(), "e", "kill", str(exp_id)]
        subprocess.check_call(command)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_CANCELED, max_wait_secs=10)

        managed_cluster_restarts.ensure_agent_ok()
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        # Re-raise so the test still fails after the cluster is brought back up.
        raise
def test_agent_reconnect_keep_experiment(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise
def test_cancel_one_experiment() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-many-long-steps.yaml"),
        conf.fixtures_path("no_op"),
    )

    exp.cancel_single(experiment_id)
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
def test_labels() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )

    label = "__det_test_dummy_label__"

    # Add a label and check that it shows up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "add", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label in output

    # Remove the label and check that it doesn't show up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "remove", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label not in output
def test_disable_agent_experiment_resume() -> None:
    """
    Start an experiment with max_restarts=0 and ensure that being killed due to an explicit
    agent disable/enable (without draining) does not count toward the number of restarts.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    exp_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        ["--config", "max_restarts=0"],
    )
    exp.wait_for_experiment_workload_progress(exp_id)

    with _disable_agent(agent_id):
        # Wait for the allocation to go away.
        for _ in range(20):
            slots = _fetch_slots()
            print(slots)
            if not any(s["allocation_id"] != "FREE" for s in slots):
                break
            time.sleep(1)
        else:
            pytest.fail("Experiment stayed scheduled after agent was disabled")

    exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_COMPLETED)
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
def test_agent_restart_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"), None)

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    for _ in range(5):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)
def test_streaming_metrics_api() -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=7)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )
    # To fully test the streaming APIs, the requests need to start running immediately after the
    # experiment, and then stay open until the experiment is complete. To accomplish this with all
    # of the API calls on a single experiment, we spawn them all in threads.
    # The HP importance portion of this test is commented out until the feature is enabled by
    # default.

    metric_names_thread = pool.apply_async(request_metric_names, (experiment_id,))
    train_metric_batches_thread = pool.apply_async(
        request_train_metric_batches, (experiment_id,)
    )
    valid_metric_batches_thread = pool.apply_async(
        request_valid_metric_batches, (experiment_id,)
    )
    train_trials_snapshot_thread = pool.apply_async(
        request_train_trials_snapshot, (experiment_id,)
    )
    valid_trials_snapshot_thread = pool.apply_async(
        request_valid_trials_snapshot, (experiment_id,)
    )
    train_trials_sample_thread = pool.apply_async(request_train_trials_sample, (experiment_id,))
    valid_trials_sample_thread = pool.apply_async(request_valid_trials_sample, (experiment_id,))

    metric_names_results = metric_names_thread.get()
    train_metric_batches_results = train_metric_batches_thread.get()
    valid_metric_batches_results = valid_metric_batches_thread.get()
    train_trials_snapshot_results = train_trials_snapshot_thread.get()
    valid_trials_snapshot_results = valid_trials_snapshot_thread.get()
    train_trials_sample_results = train_trials_sample_thread.get()
    valid_trials_sample_results = valid_trials_sample_thread.get()

    if metric_names_results is not None:
        pytest.fail("metric-names: %s. Results: %s" % metric_names_results)
    if train_metric_batches_results is not None:
        pytest.fail("metric-batches (training): %s. Results: %s" % train_metric_batches_results)
    if valid_metric_batches_results is not None:
        pytest.fail("metric-batches (validation): %s. Results: %s" % valid_metric_batches_results)
    if train_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (training): %s. Results: %s" % train_trials_snapshot_results)
    if valid_trials_snapshot_results is not None:
        pytest.fail(
            "trials-snapshot (validation): %s. Results: %s" % valid_trials_snapshot_results
        )
    if train_trials_sample_results is not None:
        pytest.fail("trials-sample (training): %s. Results: %s" % train_trials_sample_results)
    if valid_trials_sample_results is not None:
        pytest.fail("trials-sample (validation): %s. Results: %s" % valid_trials_sample_results)
def test_cancel_ten_experiments() -> None:
    experiment_ids = [
        exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
        )
        for _ in range(10)
    ]

    for experiment_id in experiment_ids:
        exp.cancel_single(experiment_id)
def test_noop_nan_validations() -> None:
    """
    Ensure that NaN validation metric values don't prevent an experiment from completing.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-nan-validations.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
def test_noop_experiment_config_override() -> None:
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            conf.fixtures_path("no_op"),
            ["--config", "reproducibility.experiment_seed=8200"],
        )

    exp_config = exp.experiment_config_json(experiment_id)
    assert exp_config["reproducibility"]["experiment_seed"] == 8200
    exp.cancel_single(experiment_id)
def test_cancel_one_active_experiment_ready() -> None:
    experiment_id = exp.create_experiment(
        conf.tutorials_path("mnist_pytorch/const.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )

    while True:
        if exp.experiment_has_completed_workload(experiment_id):
            break
        time.sleep(1)

    exp.cancel_single(experiment_id, should_have_trial=True)
    exp.assert_performed_final_checkpoint(experiment_id)
def test_cancel_one_active_experiment() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-many-long-steps.yaml"),
        conf.fixtures_path("no_op"),
    )

    for _ in range(15):
        if exp.experiment_has_active_workload(experiment_id):
            break
        time.sleep(1)
    else:
        raise AssertionError("no workload active after 15 seconds")

    exp.cancel_single(experiment_id, should_have_trial=True)
def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]
def test_hp_importance_api() -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=1)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/random.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )

    hp_importance_thread = pool.apply_async(request_hp_importance, (experiment_id,))

    hp_importance_results = hp_importance_thread.get()

    if hp_importance_results is not None:
        pytest.fail("hyperparameter-importance: %s. Results: %s" % hp_importance_results)
def test_agent_restart_exp_container_failure(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )

        # Get task id / allocation id.
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]], check=True, stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()

        # As soon as the agent is back, the original allocation should be considered dead,
        # but the new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after["allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE
    )

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED
    )

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {8, 9, 10}, "DELETED": {1, 2, 3, 4, 5, 6, 7}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {1, 2, 3, 9, 10}, "DELETED": {4, 5, 6, 7, 8}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step.checkpoint for step in trials[0].steps),
                key=operator.itemgetter("step_id"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint.state, set()).add(checkpoint.step_id)

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs" and (
                    "storage_path" not in checkpoint_config
                ):
                    if "tensorboard_path" in checkpoint_config:
                        checkpoint_config["storage_path"] = checkpoint_config.get(
                            "tensorboard_path", None
                        )
                    else:
                        checkpoint_config["storage_path"] = checkpoint_config.get(
                            "checkpoint_path", None
                        )

                    root = os.path.join(
                        checkpoint_config["host_path"], checkpoint_config["storage_path"]
                    )

                    for checkpoint in checkpoints:
                        dirname = os.path.join(root, checkpoint.uuid)
                        if checkpoint.state == "COMPLETED":
                            assert os.path.isdir(dirname)
                        elif checkpoint.state == "DELETED":
                            assert not os.path.exists(dirname)
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {8, 9, 10}, "DELETED": {1, 2, 3, 4, 5, 6, 7}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {1, 2, 3, 9, 10}, "DELETED": {4, 5, 6, 7, 8}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("step_id"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"], set()).add(checkpoint["step_id"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config, container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {800, 900, 1000},
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    100, 200, 300, 400, 500, 600, 700,
                },
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {
                    100, 200, 300, 900, 1000,
                },
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    400, 500, 600, 700, 800,
                },
            },
        ),
    ]

    all_checkpoints: List[Tuple[Any, List[bindings.v1CheckpointWorkload]]] = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(
            experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
        )

        # In some configurations, checkpoint GC will run on an auxiliary machine, which may have
        # to be spun up still. So we'll wait for it to run.
        wait_for_gc_to_finish(experiment_id)

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            cpoints = exp.workloads_with_checkpoint(trials[0].workloads)
            sorted_checkpoints = sorted(
                cpoints,
                key=lambda ckp: int(ckp.totalBatches),
            )
            assert len(sorted_checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for ckpt in sorted_checkpoints:
                by_state.setdefault(ckpt.state.value, set()).add(ckpt.totalBatches)

            if by_state == result:
                all_checkpoints.append((config, sorted_checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            storage_states = []
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]
                storage_manager = storage.build(checkpoint_config, container_path=None)
                storage_state = {}  # type: Dict[str, Any]
                for checkpoint in checkpoints:
                    assert checkpoint.uuid is not None
                    storage_id = checkpoint.uuid
                    storage_state[storage_id] = {}
                    if checkpoint.state == bindings.determinedcheckpointv1State.STATE_COMPLETED:
                        storage_state[storage_id]["found"] = False
                        try:
                            with storage_manager.restore_path(storage_id):
                                storage_state[storage_id]["found"] = True
                        except errors.CheckpointNotFound:
                            pass
                    elif checkpoint.state == bindings.determinedcheckpointv1State.STATE_DELETED:
                        storage_state[storage_id] = {"deleted": False, "checkpoint": checkpoint}
                        try:
                            with storage_manager.restore_path(storage_id):
                                pass
                        except errors.CheckpointNotFound:
                            storage_state[storage_id]["deleted"] = True
                storage_states.append(storage_state)

            for storage_state in storage_states:
                for state in storage_state.values():
                    if state.get("deleted", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not deleted: JSON:{json_states}"
                        )
                    if state.get("found", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not found: JSON:{json_states}"
                        )
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def test_allocation_resources_incremental_release() -> None:
    """
    Start a two-container experiment and ensure one container exits before the other. Ensure
    its resources are released and schedulable without the other container needing to exit.
    """
    cleanup_exp_ids = []

    try:
        slots = _wait_for_slots(2)
        assert len(slots) == 2

        with tempfile.TemporaryDirectory() as context_dir, open(
            os.path.join(context_dir, "const.yaml"), "w"
        ) as config_file:
            # Launch an experiment that has one resource (docker container) that exits immediately.
            config_obj = conf.load_config(conf.fixtures_path("no_op/single.yaml"))
            config_obj["resources"] = {**config_obj.get("resources", {}), **{"slots": 2}}
            config_obj["hyperparameters"] = {
                **config_obj.get("hyperparameters", {}),
                **{"non_chief_exit_immediately": True},
            }
            yaml.dump(config_obj, config_file)

            shutil.copy(
                conf.fixtures_path("no_op/model_def.py"),
                os.path.join(context_dir, "model_def.py"),
            )

            exp_id = exp.create_experiment(config_file.name, context_dir, None)
            cleanup_exp_ids.append(exp_id)

        # Wait for the experiment to start and run some.
        exp.wait_for_experiment_state(
            exp_id,
            determinedexperimentv1State.STATE_ACTIVE,
        )
        exp.wait_for_experiment_active_workload(exp_id)

        # And wait for exactly one of the resources to free, while one is still in use.
        confirmations = 0
        for _ in range(RANK_ONE_WAIT_TIME):
            free_agents = list_free_agents()
            if len(free_agents) == 1:
                confirmations += 1

            if confirmations == 2:
                # Just for the race where one container has exited and the other hasn't quite yet,
                # but is going to, make sure we see it at least twice.
                break

            # Still waiting on partial exit.
            time.sleep(1)
        else:
            pytest.fail(
                "exactly one agent did not free after {} seconds".format(RANK_ONE_WAIT_TIME)
            )

        # Ensure we can schedule on the free slot, not only that the API says it's available.
        exp_id_2 = exp.create_experiment(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        cleanup_exp_ids.append(exp_id_2)

        exp.wait_for_experiment_workload_progress(exp_id_2)
        exp.wait_for_experiment_state(exp_id_2, determinedexperimentv1State.STATE_COMPLETED)
        cleanup_exp_ids = cleanup_exp_ids[:-1]

        # And check that the hung experiment is still holding on to its hung slot.
        free_agents = list_free_agents()
        if len(free_agents) != 1:
            pytest.fail(f"should still have exactly one agent scheduled: {free_agents}")

    finally:
        for exp_id in cleanup_exp_ids:
            bindings.post_KillExperiment(determined_test_session(), id=exp_id)
            exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_CANCELED)
def test_drain_agent() -> None:
    """
    Start an experiment, `disable --drain` the agent once the trial is running, make sure the
    experiment still finishes, but new ones won't schedule.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_ACTIVE)
    exp.wait_for_experiment_active_workload(experiment_id)
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Disable the agent and quickly enable it back.
    with _disable_agent(agent_id, drain=True):
        pass

    # Try to launch another experiment. It shouldn't get scheduled because the
    # slot is still busy with the first experiment.
    experiment_id_no_start = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    time.sleep(5)
    exp.wait_for_experiment_state(
        experiment_id_no_start, determinedexperimentv1State.STATE_ACTIVE
    )

    with _disable_agent(agent_id, drain=True):
        # Check for 15 seconds it doesn't get scheduled into the same slot.
        for _ in range(15):
            trials = exp.experiment_trials(experiment_id_no_start)
            assert len(trials) == 0
            time.sleep(1)

        # Ensure the first one has finished with the correct number of workloads.
        exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_COMPLETED)
        trials = exp.experiment_trials(experiment_id)
        assert len(trials) == 1
        assert len(trials[0].workloads) == 7

        # Ensure the slot is empty.
        slots = _fetch_slots()
        assert len(slots) == 1
        assert slots[0]["enabled"] is False
        assert slots[0]["draining"] is True
        assert slots[0]["allocation_id"] == "FREE"

        # Check agent state.
        command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
        output = subprocess.check_output(command).decode()
        agent_data = cast(List[Dict[str, Any]], json.loads(output))[0]
        assert agent_data["id"] == agent_id
        assert agent_data["enabled"] is False
        assert agent_data["draining"] is True

        exp.cancel_single(experiment_id_no_start)
def test_stress_agents_reconnect(steps: int, num_agents: int, should_disconnect: bool) -> None:
    random.seed(42)
    master_host = "localhost"
    master_port = "8080"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    master_up([])

    # Start all agents.
    agents_are_up = [True] * num_agents
    for i in range(num_agents):
        agent_up(["--agent-name", f"agent-{i}"], fluent_offset=i)
    time.sleep(3)

    for _ in range(steps):
        for agent_id, agent_is_up in enumerate(agents_are_up):
            if random.choice([True, False]):
                # Flip agents status randomly.
                continue

            if should_disconnect:
                # Can't just randomly deploy up/down due to just getting a Docker name conflict.
                if agent_is_up:
                    agent_down(["--agent-name", f"agent-{agent_id}"])
                else:
                    agent_up(["--agent-name", f"agent-{agent_id}"], fluent_offset=agent_id)
                agents_are_up[agent_id] = not agents_are_up[agent_id]
            else:
                if random.choice([True, False]):
                    agent_disable([f"agent-{agent_id}"])
                    agents_are_up[agent_id] = False
                else:
                    agent_enable([f"agent-{agent_id}"])
                    agents_are_up[agent_id] = True
        time.sleep(10)

    # Validate that our master kept track of the agent reconnect spam.
    agent_list = json.loads(
        subprocess.check_output(
            [
                "det",
                "agent",
                "list",
                "--json",
            ]
        ).decode()
    )
    assert sum(agents_are_up) <= len(agent_list)
    for agent in agent_list:
        agent_id = int(agent["id"].replace("agent-", ""))
        assert agents_are_up[agent_id] == agent["enabled"]

    # Can we still schedule something?
    if any(agents_are_up):
        experiment_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-one-short-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(
            experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
        )

    for agent_id in range(num_agents):
        agent_down(["--agent-name", f"agent-{agent_id}"])
    master_down([])