示例#1
0
def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(
        conf.fixtures_path("no_op/single-one-short-step.yaml"))
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name,
                                              conf.fixtures_path("no_op"),
                                              None)
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    for _ in range(5):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)
示例#2
0
def test_agent_restart_exp_container_failure(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )
        # Get task id / allocation id
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]],
                       check=True,
                       stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()
        # As soon as the agent is back, the original allocation should be considered dead,
        # but the new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after[
            "allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)