Exemplo n.º 1
0
def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    """Submit a job that exits with code 1 and wait for it to show FAILED.

    The entrypoint starts Ray, sleeps for five seconds and then calls
    ``sys.exit(1)``. The snapshot endpoint is polled for up to 30 seconds
    until the job entry reports the ``FAILED`` status.
    """
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    address = format_web_url(webui_url)

    # Statements of the inline script; joined without separators so the
    # resulting command is a single `python -c"..."` invocation.
    script_statements = (
        "import ray;",
        "ray.init();",
        "import time;",
        "time.sleep(5);",
        "import sys;",
        "sys.exit(1);",
    )
    entrypoint_cmd = "python -c\"" + "".join(script_statements) + "\""

    job_id = JobSubmissionClient(address).submit_job(
        entrypoint=entrypoint_cmd)

    def wait_for_job_to_fail():
        # Only the first entry that already carries a status is inspected;
        # entries whose status is still None are skipped.
        jobs = _get_snapshot(address)["data"]["snapshot"]["jobs"]
        for entry in jobs.values():
            current_status = entry["status"]
            if current_status is None:
                continue
            submission_id = entry["config"]["metadata"]["jobSubmissionId"]
            assert submission_id == job_id
            assert current_status in {"PENDING", "RUNNING", "FAILED"}
            assert entry["statusMessage"] is not None
            return current_status == "FAILED"

        return False

    wait_for_condition(wait_for_job_to_fail, timeout=30)
Exemplo n.º 2
0
def _get_sdk_client(address: Optional[str],
                    create_cluster_if_needed: bool = False
                    ) -> JobSubmissionClient:
    """Build a JobSubmissionClient for the given or environment address.

    Args:
        address: Address to connect to. When ``None``, the value of the
            ``RAY_ADDRESS`` environment variable is used instead.
        create_cluster_if_needed: Forwarded unchanged as the second
            positional argument of ``JobSubmissionClient``.

    Returns:
        The constructed ``JobSubmissionClient``.

    Raises:
        ValueError: If ``address`` is ``None`` and ``RAY_ADDRESS`` is not
            set in the environment.
    """
    if address is not None:
        return JobSubmissionClient(address, create_cluster_if_needed)

    env_address = os.environ.get("RAY_ADDRESS")
    if env_address is None:
        raise ValueError(
            "Address must be specified using either the --address flag "
            "or RAY_ADDRESS environment variable.")

    return JobSubmissionClient(env_address, create_cluster_if_needed)
Exemplo n.º 3
0
def test_temporary_uri_reference(monkeypatch, expiration_s):
    """Check that temporary URI references are GC'ed after expiration_s.

    A job with a ``working_dir`` runtime env is submitted and the internal
    KV is then polled (via ``check_internal_kv_gced``) to verify when the
    reference disappears.
    """
    # The expiration env var has to be set before Ray starts, which is why
    # a private cluster is started here instead of using a shared fixture.
    monkeypatch.setenv("RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S",
                       str(expiration_s))
    with _ray_start(include_dashboard=True, num_cpus=1) as ctx:
        address = ctx.address_info["webui_url"]
        assert wait_until_server_available(address)

        request_headers = {
            "Connection": "keep-alive",
            "Authorization": "TOK:<MY_TOKEN>"
        }
        client = JobSubmissionClient(
            format_web_url(address), headers=request_headers)

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Put a single small file in the directory used as working_dir.
            (Path(tmp_dir) / "hi.txt").write_text("hi\n")

            start = time.time()

            client.submit_job(
                entrypoint="echo hi", runtime_env={"working_dir": tmp_dir})

            # When expiration_s is 0, deletion needs a moment to happen.
            time.sleep(2)
            # Need to connect to Ray to check internal_kv.
            # ray.init(address="auto")

            print("Starting Internal KV checks at time ", time.time() - start)
            if expiration_s <= 0:
                wait_for_condition(check_internal_kv_gced)
            else:
                # The reference must still exist now, and must disappear
                # strictly between expiration_s and 2 * expiration_s.
                assert not check_internal_kv_gced()
                wait_for_condition(
                    check_internal_kv_gced, timeout=2 * expiration_s)
                assert expiration_s < time.time() - start < 2 * expiration_s
            print("Internal KV was GC'ed at time ", time.time() - start)
Exemplo n.º 4
0
def _log_job_status(client: JobSubmissionClient, job_id: str):
    """Fetch the status of ``job_id`` and report it through the CLI logger.

    Succeeded and stopped jobs get a single line. Failed jobs and any other
    state additionally print the status message when one is present.
    """
    info = client.get_job_status(job_id)
    state = info.status

    if state == JobStatus.SUCCEEDED:
        _log_big_success_msg(f"Job '{job_id}' succeeded")
        return
    if state == JobStatus.STOPPED:
        cli_logger.warning(f"Job '{job_id}' was stopped")
        return

    if state == JobStatus.FAILED:
        _log_big_error_msg(f"Job '{job_id}' failed")
    else:
        # Any state not handled explicitly above.
        cli_logger.print(f"Status for job '{job_id}': {state}")

    # Shared tail for the failed and catch-all cases.
    if info.message is not None:
        cli_logger.print(f"Status message: {info.message}")
Exemplo n.º 5
0
async def _tail_logs(client: JobSubmissionClient, job_id: str):
    """Stream the logs of ``job_id`` to stdout, then log its status.

    Each chunk yielded by ``client.tail_job_logs`` is printed as-is, with
    no extra newline appended.
    """
    log_stream = client.tail_job_logs(job_id)
    async for chunk in log_stream:
        print(chunk, end="")

    # The stream is exhausted at this point; report how the job ended up.
    _log_job_status(client, job_id)
Exemplo n.º 6
0
def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool:
    """Return whether ``job_id`` has succeeded, raising if it has failed.

    Raises:
        RuntimeError: If the job status equals ``JobStatus.FAILED``; the
            message includes the job's stdout and stderr.
    """
    # NOTE(review): the value returned by get_job_status is compared to
    # JobStatus directly (no `.status` attribute access) -- confirm this
    # matches the client API version in use.
    job_status = client.get_job_status(job_id)
    if job_status == JobStatus.FAILED:
        out, err = client.get_job_logs(job_id)
        failure_report = f"Job failed\nstdout:\n{out}\nstderr:\n{err}"
        raise RuntimeError(failure_report)

    succeeded = job_status == JobStatus.SUCCEEDED
    return succeeded
Exemplo n.º 7
0
def job_sdk_client(ray_start_with_dashboard, disable_aiohttp_cache,
                   enable_test_module):
    """Yield a JobSubmissionClient bound to the started dashboard.

    Asserts that the dashboard server is reachable before the client is
    created.
    """
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    sdk_client = JobSubmissionClient(format_web_url(webui_url))
    yield sdk_client
Exemplo n.º 8
0
def _check_job_stopped(client: JobSubmissionClient, job_id: str) -> bool:
    """Return True when the status of ``job_id`` is ``JobStatus.STOPPED``."""
    return client.get_job_status(job_id).status == JobStatus.STOPPED
Exemplo n.º 9
0
def _check_job_failed(client: JobSubmissionClient, job_id: str) -> bool:
    """Return True when the status of ``job_id`` is ``JobStatus.FAILED``."""
    return client.get_job_status(job_id).status == JobStatus.FAILED
Exemplo n.º 10
0
def job_sdk_client(headers):
    """Yield a JobSubmissionClient that sends ``headers`` with its requests.

    A one-CPU Ray instance with the dashboard enabled is started for the
    duration of the generator, and the dashboard is asserted to be
    reachable before the client is created.
    """
    with _ray_start(include_dashboard=True, num_cpus=1) as cluster_info:
        webui_url = cluster_info["webui_url"]
        assert wait_until_server_available(webui_url)
        sdk_client = JobSubmissionClient(
            format_web_url(webui_url), headers=headers)
        yield sdk_client