def test_disable_agent_experiment_resume() -> None:
    """
    Start an experiment with max_restarts=0 and ensure that being killed due to an explicit
    agent disable/enable (without draining) does not count toward the number of restarts.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    exp_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        ["--config", "max_restarts=0"],
    )
    exp.wait_for_experiment_workload_progress(exp_id)

    with _disable_agent(agent_id):
        # Wait for the allocation to go away.
        for _ in range(20):
            slots = _fetch_slots()
            print(slots)
            if not any(s["allocation_id"] != "FREE" for s in slots):
                break
            time.sleep(1)
        else:
            pytest.fail("Experiment stayed scheduled after agent was disabled")

    exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_COMPLETED)

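# The two helpers used above (_fetch_slots, _disable_agent) are defined elsewhere in this suite.
# Below is a minimal sketch of what they might look like, assuming `det slot list --json` reports
# "agent_id"/"allocation_id" fields and that `det agent disable/enable <agent-id>` (with an
# optional `--drain` flag) behave as these tests expect. Treat it as an illustrative
# approximation, not the suite's actual implementation.
from contextlib import contextmanager
from typing import Any, Dict, Iterator, List


def _fetch_slots_sketch() -> List[Dict[str, Any]]:
    # Ask the master for all slots via the CLI and parse the JSON payload.
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "slot", "list", "--json"]
    ).decode()
    return cast(List[Dict[str, Any]], json.loads(output))


@contextmanager
def _disable_agent_sketch(agent_id: str, drain: bool = False) -> Iterator[None]:
    # Disable the agent (optionally draining it) for the duration of the block,
    # then re-enable it even if the body raises.
    command = ["det", "-m", conf.make_master_url(), "agent", "disable", agent_id]
    if drain:
        command.append("--drain")
    subprocess.check_call(command)
    try:
        yield
    finally:
        subprocess.check_call(["det", "-m", conf.make_master_url(), "agent", "enable", agent_id])
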
def test_master_restart_kill_works(managed_cluster_restarts: ManagedCluster) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
            ["--config", "searcher.max_length.batches=10000", "--config", "max_restarts=0"],
        )

        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_master()
        time.sleep(0)
        managed_cluster_restarts.restart_master()

        command = ["det", "-m", conf.make_master_url(), "e", "kill", str(exp_id)]
        subprocess.check_call(command)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_CANCELED, max_wait_secs=10)

        managed_cluster_restarts.ensure_agent_ok()
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise

def test_master_restart_reattach_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )

        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(
            exp_id, EXP_STATE.STATE_COMPLETED, max_wait_secs=downtime + 60
        )
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise

def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")

def test_agent_reconnect_keep_experiment(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise

def test_agent_restart_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise

def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"), None)

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    for _ in range(5):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)

def test_noop_nan_validations() -> None:
    """
    Ensure that NaN validation metric values don't prevent an experiment from completing.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-nan-validations.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )

def test_agent_restart_exp_container_failure(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )

        # Get task id / allocation id
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]], check=True, stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()

        # As soon as the agent is back, the original allocation should be considered dead,
        # but the new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after["allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)

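# _task_list_json (used above) queries the master for its current task/allocation summaries, and
# _local_container_ids_for_experiment matches the Docker containers belonging to the experiment's
# trials (not sketched here). The sketch below is an assumption: it guesses a REST endpoint shape
# ("/api/v1/tasks" returning an allocation-id -> summary mapping under "allocationIdToSummary")
# and omits authentication headers; the suite's real helper may differ.
import requests


def _task_list_json_sketch(master_url: str) -> Dict[str, Any]:
    # Hypothetical: fetch the master's task summaries and return them keyed by allocation id.
    resp = requests.get(f"{master_url}/api/v1/tasks")
    resp.raise_for_status()
    return cast(Dict[str, Any], resp.json().get("allocationIdToSummary", {}))
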
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")

def test_experimental_experiment_api_determined_disabled() -> None:
    context_path = pathlib.Path(conf.fixtures_path("no_op"))
    model_def_path = pathlib.Path(conf.fixtures_path("no_op/single-medium-train-step.yaml"))

    model_context = context.Context.from_local(context_path)

    with model_def_path.open("r") as fin:
        dai_experiment_config = util.safe_load_yaml_with_exceptions(fin)

    determined_master = conf.make_master_url()
    requested_user, password = create_test_user(ADMIN_CREDENTIALS, add_password=True)
    a_username, _ = ADMIN_CREDENTIALS

    try:
        det_spawn(["-u", a_username, "user", "deactivate", "determined"])

        certs.cli_cert = certs.default_load(master_url=determined_master)
        determined_api.authentication.cli_auth = determined_api.authentication.Authentication(
            determined_master,
            requested_user=requested_user,
            password=password,
            try_reauth=True,
            cert=certs.cli_cert,
        )
        exp_id = determined_api.experiment.create_experiment_and_follow_logs(
            master_url=determined_master,
            config=dai_experiment_config,
            model_context=model_context,
            template=None,
            additional_body_fields={},
            activate=True,
            follow_first_trial_logs=False,
        )
        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
    finally:
        det_spawn(["-u", a_username, "user", "activate", "determined"])

def test_drain_agent_sched() -> None:
    """
    Start an experiment, drain its agent. Start a second experiment and make sure it schedules
    on the second agent *before* the first one has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining"
            )

        exp.wait_for_experiment_state(exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7

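# _wait_for_slots(n) (used above) blocks until the cluster reports at least the expected number of
# slots, e.g. while a second agent is still connecting. A rough sketch in terms of the
# _fetch_slots_sketch helper above; the polling interval and timeout are assumptions.
def _wait_for_slots_sketch(min_slots_expected: int, max_ticks: int = 60) -> List[Dict[str, Any]]:
    for _ in range(max_ticks):
        slots = _fetch_slots_sketch()
        if len(slots) >= min_slots_expected:
            return slots
        time.sleep(1)
    pytest.fail(f"Didn't detect {min_slots_expected} slots within {max_ticks} seconds")
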
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )

def test_allocation_resources_incremental_release() -> None:
    """
    Start a two-container experiment and ensure one container exits before the other.
    Ensure its resources are released and schedulable without the other container
    needing to be released.
    """
    cleanup_exp_ids = []

    try:
        slots = _wait_for_slots(2)
        assert len(slots) == 2

        with tempfile.TemporaryDirectory() as context_dir, open(
            os.path.join(context_dir, "const.yaml"), "w"
        ) as config_file:
            # Launch an experiment that has one resource (docker container) that exits immediately.
            config_obj = conf.load_config(conf.fixtures_path("no_op/single.yaml"))
            config_obj["resources"] = {**config_obj.get("resources", {}), **{"slots": 2}}
            config_obj["hyperparameters"] = {
                **config_obj.get("hyperparameters", {}),
                **{"non_chief_exit_immediately": True},
            }
            yaml.dump(config_obj, config_file)

            shutil.copy(
                conf.fixtures_path("no_op/model_def.py"),
                os.path.join(context_dir, "model_def.py"),
            )

            exp_id = exp.create_experiment(config_file.name, context_dir, None)
            cleanup_exp_ids.append(exp_id)

        # Wait for the experiment to start and run some.
        exp.wait_for_experiment_state(
            exp_id,
            determinedexperimentv1State.STATE_ACTIVE,
        )
        exp.wait_for_experiment_active_workload(exp_id)

        # And wait for exactly one of the resources to free, while one is still in use.
        confirmations = 0
        for _ in range(RANK_ONE_WAIT_TIME):
            free_agents = list_free_agents()
            if len(free_agents) == 1:
                confirmations += 1

            if confirmations == 2:
                # Just for the race where one container has exited and the other hasn't quite yet,
                # but is going to, make sure we see it at least twice.
                break

            # Still waiting on partial exit
            time.sleep(1)
        else:
            pytest.fail(
                "exactly one agent did not free after {} seconds".format(RANK_ONE_WAIT_TIME)
            )

        # Ensure we can schedule on the free slot, not only that the API says it's available.
        exp_id_2 = exp.create_experiment(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        cleanup_exp_ids.append(exp_id_2)

        exp.wait_for_experiment_workload_progress(exp_id_2)
        exp.wait_for_experiment_state(exp_id_2, determinedexperimentv1State.STATE_COMPLETED)
        cleanup_exp_ids = cleanup_exp_ids[:-1]

        # And check the hung experiment still is holding on to its hung slot.
        free_agents = list_free_agents()
        if len(free_agents) != 1:
            pytest.fail(f"should still have exactly one agent scheduled: {free_agents}")

    finally:
        for exp_id in cleanup_exp_ids:
            bindings.post_KillExperiment(determined_test_session(), id=exp_id)
            exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_CANCELED)

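# list_free_agents (used above) returns the agents that currently have nothing scheduled on them.
# A CLI-based sketch, assuming the `det agent list --json` output (used verbatim in
# test_drain_agent below) exposes a per-agent "num_containers" count; that field name is an
# assumption, not the suite's real implementation.
def list_free_agents_sketch() -> List[Dict[str, Any]]:
    command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
    agents = cast(List[Dict[str, Any]], json.loads(subprocess.check_output(command).decode()))
    return [a for a in agents if a.get("num_containers", 0) == 0]
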
def test_stress_agents_reconnect(steps: int, num_agents: int, should_disconnect: bool) -> None:
    random.seed(42)
    master_host = "localhost"
    master_port = "8080"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    master_up([])

    # Start all agents.
    agents_are_up = [True] * num_agents
    for i in range(num_agents):
        agent_up(["--agent-name", f"agent-{i}"], fluent_offset=i)
    time.sleep(3)

    for _ in range(steps):
        for agent_id, agent_is_up in enumerate(agents_are_up):
            if random.choice([True, False]):  # Flip agents' status randomly.
                continue

            if should_disconnect:
                # Can't just randomly deploy up/down due to just getting a Docker name conflict.
                if agent_is_up:
                    agent_down(["--agent-name", f"agent-{agent_id}"])
                else:
                    agent_up(["--agent-name", f"agent-{agent_id}"], fluent_offset=agent_id)
                agents_are_up[agent_id] = not agents_are_up[agent_id]
            else:
                if random.choice([True, False]):
                    agent_disable([f"agent-{agent_id}"])
                    agents_are_up[agent_id] = False
                else:
                    agent_enable([f"agent-{agent_id}"])
                    agents_are_up[agent_id] = True
        time.sleep(10)

    # Validate that our master kept track of the agent reconnect spam.
    agent_list = json.loads(
        subprocess.check_output(
            [
                "det",
                "agent",
                "list",
                "--json",
            ]
        ).decode()
    )
    assert sum(agents_are_up) <= len(agent_list)
    for agent in agent_list:
        agent_id = int(agent["id"].replace("agent-", ""))
        assert agents_are_up[agent_id] == agent["enabled"]

    # Can we still schedule something?
    if any(agents_are_up):
        experiment_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-one-short-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(
            experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
        )

    for agent_id in range(num_agents):
        agent_down(["--agent-name", f"agent-{agent_id}"])
    master_down([])

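# master_up/master_down/agent_up/agent_down (used above) wrap the `det deploy local` subcommands
# (master-up, master-down, agent-up, agent-down), while agent_disable/agent_enable wrap
# `det agent disable/enable` as sketched earlier. The bare-bones sketch below just forwards extra
# CLI arguments; the real helpers also handle details like the master address and the fluent-bit
# port offset, which are not shown here.
def _deploy_local_sketch(subcommand: str, args: List[str]) -> None:
    # e.g. _deploy_local_sketch("agent-up", ["--agent-name", "agent-0"])
    subprocess.check_call(["det", "deploy", "local", subcommand, *args])
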
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                bindings.determinedexperimentv1State.STATE_COMPLETED.value: {800, 900, 1000},
                bindings.determinedexperimentv1State.STATE_DELETED.value: {
                    100,
                    200,
                    300,
                    400,
                    500,
                    600,
                    700,
                },
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                bindings.determinedexperimentv1State.STATE_COMPLETED.value: {
                    100,
                    200,
                    300,
                    900,
                    1000,
                },
                bindings.determinedexperimentv1State.STATE_DELETED.value: {
                    400,
                    500,
                    600,
                    700,
                    800,
                },
            },
        ),
    ]

    all_checkpoints: List[Tuple[Any, List[bindings.v1CheckpointWorkload]]] = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))
            exp.wait_for_experiment_state(
                experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
            )

            # In some configurations, checkpoint GC will run on an auxiliary machine, which may
            # still have to be spun up. So we'll wait for it to run.
            wait_for_gc_to_finish(experiment_id)

            # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
            retries = 5
            for retry in range(retries):
                trials = exp.experiment_trials(experiment_id)
                assert len(trials) == 1

                cpoints = exp.workloads_with_checkpoint(trials[0].workloads)
                sorted_checkpoints = sorted(
                    cpoints,
                    key=lambda ckp: int(ckp.totalBatches),
                )
                assert len(sorted_checkpoints) == 10
                by_state = {}  # type: Dict[str, Set[int]]
                for ckpt in sorted_checkpoints:
                    by_state.setdefault(ckpt.state.value, set()).add(ckpt.totalBatches)

                if by_state == result:
                    all_checkpoints.append((config, sorted_checkpoints))
                    break

                if retry + 1 == retries:
                    assert by_state == result

                time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            storage_states = []
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]
                storage_manager = storage.build(checkpoint_config, container_path=None)
                storage_state = {}  # type: Dict[str, Any]
                for checkpoint in checkpoints:
                    assert checkpoint.uuid is not None
                    storage_id = checkpoint.uuid
                    storage_state[storage_id] = {}
                    if checkpoint.state == bindings.determinedcheckpointv1State.STATE_COMPLETED:
                        storage_state[storage_id]["found"] = False
                        try:
                            with storage_manager.restore_path(storage_id):
                                storage_state[storage_id]["found"] = True
                        except errors.CheckpointNotFound:
                            pass
                    elif checkpoint.state == bindings.determinedcheckpointv1State.STATE_DELETED:
                        storage_state[storage_id] = {"deleted": False, "checkpoint": checkpoint}
                        try:
                            with storage_manager.restore_path(storage_id):
                                pass
                        except errors.CheckpointNotFound:
                            storage_state[storage_id]["deleted"] = True
                storage_states.append(storage_state)

            for storage_state in storage_states:
                for state in storage_state.values():
                    if state.get("deleted", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not deleted: JSON:{json_states}"
                        )
                    if state.get("found", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not found: JSON:{json_states}"
                        )
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break

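# run_gc_checkpoints_test is parametrized over checkpoint storage backends. As an illustration,
# a shared_fs configuration like the one below could be passed in; the host_path value is only an
# example and is not taken from the suite's fixtures.
SHARED_FS_CHECKPOINT_STORAGE_EXAMPLE: Dict[str, str] = {
    "type": "shared_fs",
    "host_path": "/tmp/determined-cp",
}

# Example invocation (shared_fs only; an s3 variant would additionally need bucket credentials):
# run_gc_checkpoints_test(SHARED_FS_CHECKPOINT_STORAGE_EXAMPLE)
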
def test_workspace_org() -> None:
    master_url = conf.make_master_url()
    authentication.cli_auth = authentication.Authentication(master_url, try_reauth=True)
    sess = session.Session(master_url, None, None, None)

    test_experiments: List[bindings.v1Experiment] = []
    test_projects: List[bindings.v1Project] = []
    test_workspaces: List[bindings.v1Workspace] = []

    try:
        # Uncategorized workspace / project should exist already.
        r = bindings.get_GetWorkspaces(sess, name="Uncategorized")
        assert len(r.workspaces) == 1
        default_workspace = r.workspaces[0]
        assert default_workspace.immutable
        r2 = bindings.get_GetWorkspaceProjects(sess, id=default_workspace.id)
        assert len(r2.projects) == 1
        default_project = r2.projects[0]
        assert default_project.name == "Uncategorized"
        assert default_project.immutable

        # Add a test workspace.
        r3 = bindings.post_PostWorkspace(
            sess, body=bindings.v1PostWorkspaceRequest(name="_TestOnly")
        )
        made_workspace = r3.workspace
        test_workspaces.append(made_workspace)
        get_workspace = bindings.get_GetWorkspace(sess, id=made_workspace.id).workspace
        assert get_workspace.name == made_workspace.name
        assert not made_workspace.immutable and not get_workspace.immutable

        # Patch the workspace
        w_patch = bindings.v1PatchWorkspace.from_json(made_workspace.to_json())
        w_patch.name = "_TestPatched"
        bindings.patch_PatchWorkspace(sess, body=w_patch, id=made_workspace.id)
        get_workspace = bindings.get_GetWorkspace(sess, id=made_workspace.id).workspace
        assert get_workspace.name == "_TestPatched"

        # Archive the workspace
        assert not made_workspace.archived
        bindings.post_ArchiveWorkspace(sess, id=made_workspace.id)
        get_workspace_2 = bindings.get_GetWorkspace(sess, id=made_workspace.id).workspace
        assert get_workspace_2.archived
        with pytest.raises(errors.APIException):
            # Cannot patch archived workspace
            bindings.patch_PatchWorkspace(sess, body=w_patch, id=made_workspace.id)
        with pytest.raises(errors.APIException):
            # Cannot create project inside archived workspace
            bindings.post_PostProject(
                sess,
                body=bindings.v1PostProjectRequest(name="Nope2", workspaceId=made_workspace.id),
                workspaceId=made_workspace.id,
            )
        bindings.post_UnarchiveWorkspace(sess, id=made_workspace.id)
        get_workspace_3 = bindings.get_GetWorkspace(sess, id=made_workspace.id).workspace
        assert not get_workspace_3.archived

        # Refuse to patch, archive, unarchive, or delete the default workspace
        with pytest.raises(errors.APIException):
            bindings.patch_PatchWorkspace(sess, body=w_patch, id=default_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.post_ArchiveWorkspace(sess, id=default_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.post_UnarchiveWorkspace(sess, id=default_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.delete_DeleteWorkspace(sess, id=default_workspace.id)

        # Sort test and default workspaces.
        workspace2 = bindings.post_PostWorkspace(
            sess, body=bindings.v1PostWorkspaceRequest(name="_TestWS")
        ).workspace
        test_workspaces.append(workspace2)
        list_test_1 = bindings.get_GetWorkspaces(sess).workspaces
        assert ["Uncategorized", "_TestPatched", "_TestWS"] == [w.name for w in list_test_1]
        list_test_2 = bindings.get_GetWorkspaces(
            sess, orderBy=bindings.v1OrderBy.ORDER_BY_DESC
        ).workspaces
        assert ["_TestWS", "_TestPatched", "Uncategorized"] == [w.name for w in list_test_2]
        list_test_3 = bindings.get_GetWorkspaces(
            sess, sortBy=bindings.v1GetWorkspacesRequestSortBy.SORT_BY_NAME
        ).workspaces
        assert ["_TestPatched", "_TestWS", "Uncategorized"] == [w.name for w in list_test_3]

        # Test pinned workspaces.
        pinned = bindings.get_GetWorkspaces(
            sess,
            pinned=True,
        ).workspaces
        assert len(pinned) == 2
        bindings.post_UnpinWorkspace(sess, id=made_workspace.id)
        pinned = bindings.get_GetWorkspaces(
            sess,
            pinned=True,
        ).workspaces
        assert len(pinned) == 1
        bindings.post_PinWorkspace(sess, id=made_workspace.id)
        pinned = bindings.get_GetWorkspaces(
            sess,
            pinned=True,
        ).workspaces
        assert len(pinned) == 2

        # Add a test project to a workspace.
        r4 = bindings.post_PostProject(
            sess,
            body=bindings.v1PostProjectRequest(name="_TestOnly", workspaceId=made_workspace.id),
            workspaceId=made_workspace.id,
        )
        made_project = r4.project
        test_projects.append(made_project)
        get_project = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project.name == made_project.name
        assert not made_project.immutable and not get_project.immutable

        # Project cannot be created in the default workspace.
        with pytest.raises(errors.APIException):
            bindings.post_PostProject(
                sess,
                body=bindings.v1PostProjectRequest(name="Nope", workspaceId=default_workspace.id),
                workspaceId=default_workspace.id,
            )

        # Patch the project
        p_patch = bindings.v1PatchProject.from_json(made_project.to_json())
        p_patch.name = "_TestPatchedProject"
        bindings.patch_PatchProject(sess, body=p_patch, id=made_project.id)
        get_project = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project.name == "_TestPatchedProject"

        # Archive the project
        assert not made_project.archived
        bindings.post_ArchiveProject(sess, id=made_project.id)
        get_project_2 = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project_2.archived

        # Cannot patch or move an archived project
        with pytest.raises(errors.APIException):
            bindings.patch_PatchProject(sess, body=p_patch, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=workspace2.id,
                    projectId=made_project.id,
                ),
            )

        # Unarchive the project
        bindings.post_UnarchiveProject(sess, id=made_project.id)
        get_project_3 = bindings.get_GetProject(sess, id=made_project.id).project
        assert not get_project_3.archived

        # Can't archive, un-archive, or move while the parent workspace is archived
        bindings.post_ArchiveWorkspace(sess, id=made_workspace.id)
        get_project_4 = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project_4.archived
        with pytest.raises(errors.APIException):
            bindings.post_ArchiveProject(sess, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_UnarchiveProject(sess, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=workspace2.id,
                    projectId=made_project.id,
                ),
            )
        bindings.post_UnarchiveWorkspace(sess, id=made_workspace.id)

        # Refuse to patch, archive, unarchive, or delete the default project
        with pytest.raises(errors.APIException):
            bindings.patch_PatchProject(sess, body=p_patch, id=default_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_ArchiveProject(sess, id=default_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_UnarchiveProject(sess, id=default_project.id)
        with pytest.raises(errors.APIException):
            bindings.delete_DeleteProject(sess, id=default_project.id)

        # Sort workspaces' projects.
        p1 = bindings.post_PostProject(
            sess,
            body=bindings.v1PostProjectRequest(name="_TestPRJ", workspaceId=made_workspace.id),
            workspaceId=made_workspace.id,
        ).project
        p2 = bindings.post_PostProject(
            sess,
            body=bindings.v1PostProjectRequest(name="_TestEarly", workspaceId=made_workspace.id),
            workspaceId=made_workspace.id,
        ).project
        test_projects += [p1, p2]
        list_test_4 = bindings.get_GetWorkspaceProjects(sess, id=made_workspace.id).projects
        assert ["_TestPatchedProject", "_TestPRJ", "_TestEarly"] == [p.name for p in list_test_4]
        list_test_5 = bindings.get_GetWorkspaceProjects(
            sess, id=made_workspace.id, orderBy=bindings.v1OrderBy.ORDER_BY_DESC
        ).projects
        assert ["_TestEarly", "_TestPRJ", "_TestPatchedProject"] == [p.name for p in list_test_5]
        list_test_6 = bindings.get_GetWorkspaceProjects(
            sess,
            id=made_workspace.id,
            sortBy=bindings.v1GetWorkspaceProjectsRequestSortBy.SORT_BY_NAME,
        ).projects
        assert ["_TestEarly", "_TestPatchedProject", "_TestPRJ"] == [p.name for p in list_test_6]

        # Move a project to another workspace
        bindings.post_MoveProject(
            sess,
            projectId=made_project.id,
            body=bindings.v1MoveProjectRequest(
                destinationWorkspaceId=workspace2.id,
                projectId=made_project.id,
            ),
        )
        get_project = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project.workspaceId == workspace2.id

        # Default project cannot be moved.
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=default_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=workspace2.id,
                    projectId=default_project.id,
                ),
            )

        # Project cannot be moved into the default workspace.
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=default_workspace.id,
                    projectId=made_project.id,
                ),
            )

        # Project cannot be moved into an archived workspace.
        bindings.post_ArchiveWorkspace(sess, id=made_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=made_workspace.id,
                    projectId=made_project.id,
                ),
            )
        bindings.post_UnarchiveWorkspace(sess, id=made_workspace.id)

        # Add a test note to a project.
        note = bindings.v1Note(name="Hello", contents="Hello World")
        note2 = bindings.v1Note(name="Hello 2", contents="Hello World")
        bindings.post_AddProjectNote(
            sess,
            body=note,
            projectId=made_project.id,
        )
        r5 = bindings.post_AddProjectNote(
            sess,
            body=note2,
            projectId=made_project.id,
        )
        returned_notes = r5.notes
        assert len(returned_notes) == 2

        # Put notes
        r6 = bindings.put_PutProjectNotes(
            sess,
            body=bindings.v1PutProjectNotesRequest(notes=[note], projectId=made_project.id),
            projectId=made_project.id,
        )
        returned_notes = r6.notes
        assert len(returned_notes) == 1

        # Create an experiment in the default project.
        test_exp_id = run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )
        test_exp = bindings.get_GetExperiment(sess, experimentId=test_exp_id).experiment
        test_experiments.append(test_exp)
        wait_for_experiment_state(
            test_exp_id, bindings.determinedexperimentv1State.STATE_COMPLETED
        )
        assert test_exp.projectId == default_project.id

        # Move the test experiment into a user-made project.
        dproj_exp = bindings.get_GetProjectExperiments(sess, id=default_project.id).experiments
        exp_count = len(bindings.get_GetProjectExperiments(sess, id=made_project.id).experiments)
        assert exp_count == 0
        mbody = bindings.v1MoveExperimentRequest(
            destinationProjectId=made_project.id, experimentId=test_exp_id
        )
        bindings.post_MoveExperiment(sess, experimentId=test_exp_id, body=mbody)
        modified_exp = bindings.get_GetExperiment(sess, experimentId=test_exp_id).experiment
        assert modified_exp.projectId == made_project.id

        # Confirm the test experiment is in the new project and no longer in the old project.
        exp_count = len(bindings.get_GetProjectExperiments(sess, id=made_project.id).experiments)
        assert exp_count == 1
        dproj_exp2 = bindings.get_GetProjectExperiments(sess, id=default_project.id).experiments
        assert len(dproj_exp2) == len(dproj_exp) - 1

        # Cannot move an experiment out of an archived project.
        bindings.post_ArchiveProject(sess, id=made_project.id)
        mbody2 = bindings.v1MoveExperimentRequest(
            destinationProjectId=default_project.id, experimentId=test_exp_id
        )
        with pytest.raises(errors.APIException):
            bindings.post_MoveExperiment(sess, experimentId=test_exp_id, body=mbody2)
        bindings.post_UnarchiveProject(sess, id=made_project.id)

        # Move the experiment back into the default project.
        bindings.post_MoveExperiment(sess, experimentId=test_exp_id, body=mbody2)

        # Cannot move an experiment into an archived project.
        bindings.post_ArchiveProject(sess, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveExperiment(sess, experimentId=test_exp_id, body=mbody)

    finally:
        # Clean out experiments, projects, and workspaces, in dependency order.
        for e in test_experiments:
            bindings.delete_DeleteExperiment(sess, experimentId=e.id)
        for p in test_projects:
            bindings.delete_DeleteProject(sess, id=p.id)
        for w in test_workspaces:
            bindings.delete_DeleteWorkspace(sess, id=w.id)

def test_pytorch_native_api() -> None:
    exp_id = exp.create_native_experiment(
        conf.fixtures_path("pytorch_no_op"), [sys.executable, "model_def.py"]
    )
    exp.wait_for_experiment_state(exp_id, "COMPLETED")

def test_drain_agent() -> None:
    """
    Start an experiment, `disable --drain` the agent once the trial is running, and make sure
    the experiment still finishes, but new ones won't schedule.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_ACTIVE)
    exp.wait_for_experiment_active_workload(experiment_id)
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Disable and quickly enable it back.
    with _disable_agent(agent_id, drain=True):
        pass

    # Try to launch another experiment. It shouldn't get scheduled because the
    # slot is still busy with the first experiment.
    experiment_id_no_start = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    time.sleep(5)
    exp.wait_for_experiment_state(experiment_id_no_start, determinedexperimentv1State.STATE_ACTIVE)

    with _disable_agent(agent_id, drain=True):
        # Check for 15 seconds it doesn't get scheduled into the same slot.
        for _ in range(15):
            trials = exp.experiment_trials(experiment_id_no_start)
            assert len(trials) == 0
            time.sleep(1)

        # Ensure the first one has finished with the correct number of workloads.
        exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_COMPLETED)
        trials = exp.experiment_trials(experiment_id)
        assert len(trials) == 1
        assert len(trials[0].workloads) == 7

        # Ensure the slot is empty.
        slots = _fetch_slots()
        assert len(slots) == 1
        assert slots[0]["enabled"] is False
        assert slots[0]["draining"] is True
        assert slots[0]["allocation_id"] == "FREE"

        # Check agent state.
        command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
        output = subprocess.check_output(command).decode()
        agent_data = cast(List[Dict[str, Any]], json.loads(output))[0]
        assert agent_data["id"] == agent_id
        assert agent_data["enabled"] is False
        assert agent_data["draining"] is True

        exp.cancel_single(experiment_id_no_start)

def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {8, 9, 10}, "DELETED": {1, 2, 3, 4, 5, 6, 7}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {1, 2, 3, 9, 10}, "DELETED": {4, 5, 6, 7, 8}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))
            exp.wait_for_experiment_state(experiment_id, "COMPLETED")

            # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
            retries = 5
            for retry in range(retries):
                trials = exp.experiment_trials(experiment_id)
                assert len(trials) == 1

                checkpoints = sorted(
                    (step["checkpoint"] for step in trials[0]["steps"]),
                    key=operator.itemgetter("step_id"),
                )
                assert len(checkpoints) == 10
                by_state = {}  # type: Dict[str, Set[int]]
                for checkpoint in checkpoints:
                    by_state.setdefault(checkpoint["state"], set()).add(checkpoint["step_id"])

                if by_state == result:
                    all_checkpoints.append((config, checkpoints))
                    break

                if retry + 1 == retries:
                    assert by_state == result

                time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config, container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break

def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {8, 9, 10}, "DELETED": {1, 2, 3, 4, 5, 6, 7}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {1, 2, 3, 9, 10}, "DELETED": {4, 5, 6, 7, 8}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))
            exp.wait_for_experiment_state(experiment_id, "COMPLETED")

            # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
            retries = 5
            for retry in range(retries):
                trials = exp.experiment_trials(experiment_id)
                assert len(trials) == 1

                checkpoints = sorted(
                    (step.checkpoint for step in trials[0].steps),
                    key=operator.itemgetter("step_id"),
                )
                assert len(checkpoints) == 10
                by_state = {}  # type: Dict[str, Set[int]]
                for checkpoint in checkpoints:
                    by_state.setdefault(checkpoint.state, set()).add(checkpoint.step_id)

                if by_state == result:
                    all_checkpoints.append((config, checkpoints))
                    break

                if retry + 1 == retries:
                    assert by_state == result

                time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for check in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs" and (
                    "storage_path" not in checkpoint_config
                ):
                    if "tensorboard_path" in checkpoint_config:
                        checkpoint_config["storage_path"] = checkpoint_config.get(
                            "tensorboard_path", None
                        )
                    else:
                        checkpoint_config["storage_path"] = checkpoint_config.get(
                            "checkpoint_path", None
                        )

                    root = os.path.join(
                        checkpoint_config["host_path"], checkpoint_config["storage_path"]
                    )

                    for checkpoint in checkpoints:
                        dirname = os.path.join(root, checkpoint.uuid)
                        if checkpoint.state == "COMPLETED":
                            assert os.path.isdir(dirname)
                        elif checkpoint.state == "DELETED":
                            assert not os.path.exists(dirname)
        except AssertionError:
            if check == max_checks - 1:
                raise
        else:
            break