Пример #1
0
def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    """Submit a job whose driver exits with code 1 and wait for FAILED.

    The dashboard snapshot is polled until both the legacy ``jobs`` view and
    the ``jobSubmission`` view report the job as ``FAILED``; every snapshot
    seen along the way is checked for consistent metadata.
    """
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    # Driver program, one statement per element; it sleeps, then exits with 1.
    driver_statements = (
        "import ray;",
        "ray.init();",
        "import time;",
        f"time.sleep({job_sleep_time_s});",
        "import sys;",
        "sys.exit(1);",
    )
    entrypoint_cmd = 'python -c"' + "".join(driver_statements) + '"'

    start_time_s = int(time.time())
    client = JobSubmissionClient(address)
    job_id = client.submit_job(
        entrypoint=entrypoint_cmd,
        metadata={"ray_test_789": "789"},
        runtime_env={"env_vars": {"RAY_TEST_456": "456"}},
    )

    def wait_for_job_to_fail():
        snapshot = _get_snapshot(address)["data"]["snapshot"]
        allowed_statuses = {"PENDING", "RUNNING", "FAILED"}

        legacy_job_failed = False
        job_failed = False

        # Legacy job snapshot (one driver per job).
        for legacy_entry in snapshot["jobs"].values():
            legacy_status = legacy_entry["status"]
            if legacy_status is None:
                continue
            submission_id = legacy_entry["config"]["metadata"][
                "jobSubmissionId"]
            assert submission_id == job_id
            assert legacy_status in allowed_statuses
            assert legacy_entry["statusMessage"] is not None
            legacy_job_failed = legacy_status == "FAILED"

        # New jobs snapshot (0 to N drivers per job).
        for entry in snapshot["jobSubmission"].values():
            status = entry["status"]
            if status is None:
                continue
            assert status in allowed_statuses
            assert entry["message"] is not None
            # TODO(architkulkarni): Disable automatic camelcase.
            assert entry["runtimeEnv"] == {"envVars": {"RAYTest456": "456"}}
            assert entry["metadata"] == {"rayTest789": "789"}
            assert entry["errorType"] is None
            assert abs(entry["startTime"] - start_time_s) <= 2
            if status == "FAILED":
                job_failed = True
                # The job cannot have ended before its sleep elapsed.
                earliest_end = entry["startTime"] + job_sleep_time_s
                assert entry["endTime"] >= earliest_end
        return legacy_job_failed and job_failed

    wait_for_condition(wait_for_job_to_fail, timeout=10)
Пример #2
0
def test_cli_apis_sanity_check(ray_start_cluster):
    """Check that every ``list`` CLI resource exits cleanly with expected output.

    A four-node cluster is started and one instance of each resource kind is
    created; each CLI listing is then polled until it exits with code 0 and
    its output contains the expected substrings.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    for _ in range(3):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    webui_url = ray._private.worker.global_worker.node.address_info[
        "webui_url"]
    client = JobSubmissionClient(f"http://{webui_url}")

    @ray.remote
    def f():
        import time

        time.sleep(30)

    @ray.remote
    class Actor:
        pass

    # NOTE(review): the handles below are never read again; presumably they
    # are held in locals so the resources exist while the CLI is queried.
    obj = ray.put(3)  # noqa
    task = f.remote()  # noqa
    actor = Actor.remote()  # noqa
    actor_runtime_env = Actor.options(  # noqa
        runtime_env={
            "pip": ["requests"]
        }).remote()
    job_id = client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )
    pg = ray.util.placement_group(bundles=[{"CPU": 1}])  # noqa

    def verify_output(resource_name, necessary_substrings: List[str]):
        result = runner.invoke(cli_list, [resource_name])
        output = result.output
        missing = [
            substr for substr in necessary_substrings if substr not in output
        ]
        print(output)
        return result.exit_code == 0 and not missing

    # (CLI resource name, substrings that must appear in its listing).
    expectations = (
        ("actors", ["actor_id"]),
        ("workers", ["worker_id"]),
        ("nodes", ["node_id"]),
        ("placement-groups", ["placement_group_id"]),
        ("jobs", ["raysubmit"]),
        ("tasks", ["task_id"]),
        ("objects", ["object_id"]),
        ("runtime-envs", ["runtime_env"]),
    )
    for resource, substrings in expectations:
        # Bind the loop values as defaults so each predicate is self-contained.
        wait_for_condition(
            lambda resource=resource, substrings=substrings: verify_output(
                resource, substrings))
Пример #3
0
    def _get_job_client(self) -> "JobSubmissionClient":
        """Return the cached job client, creating it on first use.

        The client is built from the cluster manager's cluster address and
        stored on ``self.job_client`` for subsequent calls.
        """
        from ray.job_submission import JobSubmissionClient  # noqa: F811

        if self.job_client:
            return self.job_client

        cluster_address = self.cluster_manager.get_cluster_address()
        self.job_client = JobSubmissionClient(cluster_address)
        return self.job_client
Пример #4
0
def test_list_jobs(shutdown_only):
    """Submit an ``ls`` job and wait until ``list_jobs`` reports it SUCCEEDED.

    The first entry returned by ``list_jobs`` must carry the submitted job's
    id and the ``SUCCEEDED`` status.
    """
    ray.init()
    client = JobSubmissionClient(
        f"http://{ray.worker.global_worker.node.address_info['webui_url']}")
    job_id = client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )

    def verify():
        # Take ONE snapshot per poll: reading the id and the status from two
        # separate list_jobs() calls could pair values from different states.
        jobs = list_jobs()
        if not jobs:
            # Nothing listed yet; report "not ready" instead of raising
            # IndexError on the empty listing.
            return False
        job_id_from_api, job_data = next(iter(jobs.items()))
        correct_state = job_data["status"] == "SUCCEEDED"
        correct_id = job_id == job_id_from_api
        return correct_state and correct_id

    wait_for_condition(verify)
    print(list_jobs())
Пример #5
0
def _get_sdk_client(
        address: Optional[str],
        create_cluster_if_needed: bool = False) -> JobSubmissionClient:
    """Create a ``JobSubmissionClient`` for the given address.

    Args:
        address: Job submission server address. When ``None``, the
            ``RAY_ADDRESS`` environment variable is used if it is set;
            otherwise ``None`` is passed through to the client.
        create_cluster_if_needed: Forwarded unchanged to the client.

    Returns:
        The constructed ``JobSubmissionClient``.
    """
    if address is None:
        # .get() yields None when the variable is unset, leaving address as-is.
        address = os.environ.get("RAY_ADDRESS")

    cli_logger.labeled_value("Job submission server address", address)
    return JobSubmissionClient(address, create_cluster_if_needed)
Пример #6
0
def test_ray_tune_basic(job_sdk_client: JobSubmissionClient):
    """Run ``ray_tune_basic.py`` as a job and wait up to 30s for success."""
    submitted_job_id = job_sdk_client.submit_job(
        entrypoint="python ray_tune_basic.py",
        runtime_env={"working_dir": DRIVER_SCRIPT_DIR},
    )
    # Keyword arguments forwarded to the _check_job_succeeded predicate.
    predicate_kwargs = {"client": job_sdk_client, "job_id": submitted_job_id}
    wait_for_condition(_check_job_succeeded, timeout=30, **predicate_kwargs)
Пример #7
0
def test_per_task_runtime_env(job_sdk_client: JobSubmissionClient):
    """Run ``per_task_runtime_env.py`` as a job and wait for it to succeed."""
    submitted_job_id = job_sdk_client.submit_job(
        entrypoint="python per_task_runtime_env.py",
        runtime_env={"working_dir": DRIVER_SCRIPT_DIR},
    )

    # Keyword arguments forwarded to the _check_job_succeeded predicate.
    predicate_kwargs = {"client": job_sdk_client, "job_id": submitted_job_id}
    wait_for_condition(_check_job_succeeded, **predicate_kwargs)
Пример #8
0
def _get_sdk_client(
        address: Optional[str],
        create_cluster_if_needed: bool = False) -> JobSubmissionClient:
    """Create a ``JobSubmissionClient``, requiring an explicit address.

    Args:
        address: Job submission server address. When ``None``, the value of
            the ``RAY_ADDRESS`` environment variable is used instead.
        create_cluster_if_needed: Forwarded unchanged to the client.

    Returns:
        The constructed ``JobSubmissionClient``.

    Raises:
        ValueError: If ``address`` is ``None`` and ``RAY_ADDRESS`` is unset.
    """
    if address is None:
        address = os.environ.get("RAY_ADDRESS")
        if address is None:
            raise ValueError(
                "Address must be specified using either the --address flag "
                "or RAY_ADDRESS environment variable.")

    cli_logger.labeled_value("Job submission server address", address)
    return JobSubmissionClient(address, create_cluster_if_needed)
Пример #9
0
def _log_job_status(client: JobSubmissionClient, job_id: str):
    """Log a human-readable summary of the job's current status.

    Succeeded and stopped jobs get a single line. Failed jobs and jobs in
    any other status additionally get their status message, when one is set.
    """
    info = client.get_job_info(job_id)
    status = info.status

    if status == JobStatus.SUCCEEDED:
        _log_big_success_msg(f"Job '{job_id}' succeeded")
        return
    if status == JobStatus.STOPPED:
        cli_logger.warning(f"Job '{job_id}' was stopped")
        return

    if status == JobStatus.FAILED:
        _log_big_error_msg(f"Job '{job_id}' failed")
    else:
        # Catch-all.
        cli_logger.print(f"Status for job '{job_id}': {info.status}")

    # Failed and catch-all statuses both report the message when present.
    if info.message is not None:
        cli_logger.print(f"Status message: {info.message}", no_format=True)
Пример #10
0
def _check_job_failed(client: JobSubmissionClient, job_id: str) -> bool:
    """Return True when the client reports the job's status as FAILED."""
    return client.get_job_status(job_id) == JobStatus.FAILED
Пример #11
0
def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool:
    """Return True when the job has succeeded.

    Raises:
        RuntimeError: If the job's status is FAILED; the message includes
            the job logs fetched from the client.
    """
    current_status = client.get_job_status(job_id)
    if current_status == JobStatus.FAILED:
        job_logs = client.get_job_logs(job_id)
        raise RuntimeError(f"Job failed\nlogs:\n{job_logs}")
    return current_status == JobStatus.SUCCEEDED
Пример #12
0
async def test_state_data_source_client(ray_start_cluster):
    """Exercise each StateDataSourceClient query against a two-node cluster.

    For every API the test checks the reply type. For the per-node APIs it
    also checks that an unregistered node id raises ValueError. Finally the
    worker node is removed and a query against it must raise
    DataSourceUnavailable.
    """
    cluster = ray_start_cluster
    # head
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    # worker
    worker = cluster.add_node(num_cpus=2)

    # Global gRPC options plus send/receive message size limits.
    GRPC_CHANNEL_OPTIONS = (
        *ray_constants.GLOBAL_GRPC_OPTIONS,
        ("grpc.max_send_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
    )
    gcs_channel = ray._private.utils.init_grpc_channel(cluster.address,
                                                       GRPC_CHANNEL_OPTIONS,
                                                       asynchronous=True)
    client = StateDataSourceClient(gcs_channel)
    """
    Test actor
    """
    result = await client.get_all_actor_info()
    assert isinstance(result, GetAllActorInfoReply)
    """
    Test placement group
    """
    result = await client.get_all_placement_group_info()
    assert isinstance(result, GetAllPlacementGroupReply)
    """
    Test node
    """
    result = await client.get_all_node_info()
    assert isinstance(result, GetAllNodeInfoReply)
    """
    Test worker info
    """
    result = await client.get_all_worker_info()
    assert isinstance(result, GetAllWorkerInfoReply)
    """
    Test job
    """
    job_client = JobSubmissionClient(
        f"http://{ray._private.worker.global_worker.node.address_info['webui_url']}"
    )
    job_id = job_client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )
    # NOTE(review): unlike the other client calls this one is not awaited;
    # presumably get_job_info is synchronous -- confirm against the client.
    result = client.get_job_info()
    assert list(result.keys())[0] == job_id
    assert isinstance(result, dict)
    """
    Test tasks
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_task_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    # Register a raylet client for every node, then query it.
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_task_info(node_id)
        assert isinstance(result, GetTasksInfoReply)

    assert len(client.get_all_registered_raylet_ids()) == 2
    """
    Test objects
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_object_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_object_info(node_id)
        assert isinstance(result, GetNodeStatsReply)
    """
    Test runtime env
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_runtime_envs_info("1234")
    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}"

        # Reads this node's agent port entry from the internal KV store.
        def get_port():
            return ray.experimental.internal_kv._internal_kv_get(
                key, namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        # Poll until the entry for this node is present.
        wait_for_condition(lambda: get_port() is not None)
        # The second index is the gRPC port
        port = json.loads(get_port())[1]
        ip = node["NodeManagerAddress"]
        client.register_agent_client(node_id, ip, port)
        result = await client.get_runtime_envs_info(node_id)
        assert isinstance(result, GetRuntimeEnvsInfoReply)
    """
    Test logs
    """
    # "1234" was never registered, so both log APIs must reject it.
    with pytest.raises(ValueError):
        result = await client.list_logs("1234", "*")
    with pytest.raises(ValueError):
        result = await client.stream_log("1234", "raylet.out", True, 100, 1, 5)

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    # The node information should've been registered in the previous section.
    for node in ray.nodes():
        node_id = node["NodeID"]
        result = await client.list_logs(node_id, timeout=30, glob_filter="*")
        assert isinstance(result, ListLogsReply)

        stream = await client.stream_log(node_id, "raylet.out", False, 10, 1,
                                         5)
        async for logs in stream:
            # Splitting the decoded chunk on newlines must give 10 or 11 parts.
            log_lines = len(logs.data.decode().split("\n"))
            assert isinstance(logs, StreamLogReply)
            assert log_lines >= 10
            assert log_lines <= 11
    """
    Test the exception is raised when the RPC error occurs.
    """
    cluster.remove_node(worker)
    # Wait until the dead node information is propagated.
    wait_for_condition(lambda: len(
        list(filter(lambda node: node["Alive"], ray.nodes()))) == 1)
    for node in ray.nodes():
        node_id = node["NodeID"]
        # Only the removed (dead) node is exercised below.
        if node["Alive"]:
            continue

        # Querying to the dead node raises gRPC error, which should raise an exception.
        with pytest.raises(DataSourceUnavailable):
            await client.get_object_info(node_id)

        # Make sure unregister API works as expected.
        client.unregister_raylet_client(node_id)
        assert len(client.get_all_registered_raylet_ids()) == 1
        # Since the node_id is unregistered, the API should raise ValueError.
        with pytest.raises(ValueError):
            result = await client.get_object_info(node_id)
Пример #13
0
def job_sdk_client(headers):
    """Yield a ``JobSubmissionClient`` bound to a freshly started dashboard.

    Ray is started with the dashboard enabled and a single CPU; the client
    is yielded only after the dashboard server is confirmed available.
    """
    with _ray_start(include_dashboard=True, num_cpus=1) as ray_ctx:
        webui_url = ray_ctx.address_info["webui_url"]
        assert wait_until_server_available(webui_url)
        sdk_client = JobSubmissionClient(format_web_url(webui_url),
                                         headers=headers)
        yield sdk_client
Пример #14
0
async def test_state_data_source_client(ray_start_cluster):
    """Exercise each StateDataSourceClient query against a two-node cluster.

    For every API the test checks the reply type. For the per-node APIs it
    also checks that an unregistered node id raises ValueError. Finally the
    worker node is removed and a query against it must raise
    StateSourceNetworkException.
    """
    cluster = ray_start_cluster
    # head
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    # worker
    worker = cluster.add_node(num_cpus=2)

    # Sets grpc.enable_http_proxy to 0 and the send/receive message size limits.
    GRPC_CHANNEL_OPTIONS = (
        ("grpc.enable_http_proxy", 0),
        ("grpc.max_send_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
    )
    gcs_channel = ray._private.utils.init_grpc_channel(cluster.address,
                                                       GRPC_CHANNEL_OPTIONS,
                                                       asynchronous=True)
    client = StateDataSourceClient(gcs_channel)
    """
    Test actor
    """
    result = await client.get_all_actor_info()
    assert isinstance(result, GetAllActorInfoReply)
    """
    Test placement group
    """
    result = await client.get_all_placement_group_info()
    assert isinstance(result, GetAllPlacementGroupReply)
    """
    Test node
    """
    result = await client.get_all_node_info()
    assert isinstance(result, GetAllNodeInfoReply)
    """
    Test worker info
    """
    result = await client.get_all_worker_info()
    assert isinstance(result, GetAllWorkerInfoReply)
    """
    Test job
    """
    job_client = JobSubmissionClient(
        f"http://{ray.worker.global_worker.node.address_info['webui_url']}")
    job_id = job_client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )
    # NOTE(review): unlike the other client calls this one is not awaited;
    # presumably get_job_info is synchronous -- confirm against the client.
    result = client.get_job_info()
    assert list(result.keys())[0] == job_id
    assert isinstance(result, dict)
    """
    Test tasks
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_task_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    # Register a raylet client for every node, then query it.
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_task_info(node_id)
        assert isinstance(result, GetTasksInfoReply)

    assert len(client.get_all_registered_raylet_ids()) == 2
    """
    Test objects
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_object_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_object_info(node_id)
        assert isinstance(result, GetNodeStatsReply)
    """
    Test the exception is raised when the RPC error occurs.
    """
    cluster.remove_node(worker)
    # Wait until the dead node information is propagated.
    wait_for_condition(lambda: len(
        list(filter(lambda node: node["Alive"], ray.nodes()))) == 1)
    for node in ray.nodes():
        node_id = node["NodeID"]
        # Only the removed (dead) node is exercised below.
        if node["Alive"]:
            continue

        # Querying to the dead node raises gRPC error, which should be
        # translated into `StateSourceNetworkException`
        with pytest.raises(StateSourceNetworkException):
            result = await client.get_object_info(node_id)

        # Make sure unregister API works as expected.
        client.unregister_raylet_client(node_id)
        assert len(client.get_all_registered_raylet_ids()) == 1
        # Since the node_id is unregistered, the API should raise ValueError.
        with pytest.raises(ValueError):
            result = await client.get_object_info(node_id)
Пример #15
0
async def _tail_logs(client: JobSubmissionClient, job_id: str):
    """Print the job's logs as they arrive, then log its final status."""
    log_stream = client.tail_job_logs(job_id)
    async for chunk in log_stream:
        # Chunks are printed verbatim; no extra newline is appended.
        print(chunk, end="")

    _log_job_status(client, job_id)
Пример #16
0
def _check_job_stopped(client: JobSubmissionClient, job_id: str) -> bool:
    """Return True when the client reports the job's status as STOPPED."""
    return client.get_job_status(job_id) == JobStatus.STOPPED
Пример #17
0
 def _get_job_client(self) -> JobSubmissionClient:
     """Return the cached job client, creating it on first use.

     The client is built from the cluster manager's cluster address and
     stored on ``self.job_client`` for subsequent calls.
     """
     if self.job_client:
         return self.job_client

     cluster_address = self.cluster_manager.get_cluster_address()
     self.job_client = JobSubmissionClient(cluster_address)
     return self.job_client
Пример #18
0
def ray_job_submit(
    script_name: str,
    head_service: str,
    k8s_namespace: str = "default",
    ray_dashboard_port: int = 8265,
) -> str:
    """Submits a Python script via the Ray Job Submission API, using the Python SDK.
    Waits for successful completion of the job and returns the job logs as a string.

    Uses `kubectl port-forward` to access the Ray head's dashboard port.

    Scripts live in `tests/kuberay/scripts`. This directory is used as the working
    dir for the job.

    Args:
        script_name: The name of the script to submit.
        head_service: The name of the Ray head K8s service.
        k8s_namespace: K8s namespace the Ray cluster belongs to.
        ray_dashboard_port: The port on which the Ray head is running the Ray dashboard.

    Returns:
        The logs of the completed job.

    Raises:
        ConnectionError: If the client cannot be instantiated after all
            connection attempts.
        AssertionError: If the job has not succeeded when polling stops.
    """
    with _kubectl_port_forward(service=head_service,
                               namespace=k8s_namespace,
                               target_port=ray_dashboard_port) as local_port:
        # It takes a bit of time to establish the connection.
        # Try a few times to instantiate the JobSubmissionClient, as the client's
        # instantiation does not retry on connection errors.
        max_attempts = 6
        for attempt in range(1, max_attempts + 1):
            time.sleep(5)
            try:
                client = JobSubmissionClient(f"http://127.0.0.1:{local_port}")
            except ConnectionError as e:
                if attempt == max_attempts:
                    raise e from None
                logger.info(
                    "Job client connection failed. Retrying in 5 seconds.")
            else:
                # Connected: stop retrying. Without this the loop would keep
                # sleeping and rebuilding the client, and a failure on a later
                # attempt would discard an already working client.
                break
        job_id = client.submit_job(
            entrypoint=f"python {script_name}",
            runtime_env={
                "working_dir": SCRIPTS_DIR,
                # Throw in some extra data for fun, to validate runtime envs.
                "pip": ["pytest==6.0.0"],
                "env_vars": {
                    "key_foo": "value_bar"
                },
            },
        )
        # Wait for the job to complete successfully.
        # This logic is copied from the Job Submission docs.
        terminal_statuses = {
            JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED
        }
        start = time.time()
        timeout = 60
        while time.time() - start <= timeout:
            status = client.get_job_status(job_id)
            print(f"status: {status}")
            if status in terminal_statuses:
                break
            time.sleep(5)

        assert status == JobStatus.SUCCEEDED
        return client.get_job_logs(job_id)
Пример #19
0
def test_successful_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                               enable_test_module, address_suffix):
    """Submit a job that sleeps and exits cleanly, then wait for SUCCEEDED.

    The dashboard snapshot is polled until both the legacy ``jobs`` view and
    the ``jobSubmission`` view report the job as ``SUCCEEDED``; every
    snapshot seen along the way is checked for consistent metadata.
    """
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    # Driver program, one statement per element.
    driver_statements = (
        "import ray;",
        "ray.init();",
        "import time;",
        f"time.sleep({job_sleep_time_s});",
    )
    entrypoint = 'python -c"' + "".join(driver_statements) + '"'

    client = JobSubmissionClient(address + address_suffix)
    start_time_s = int(time.time())
    job_id = client.submit_job(
        entrypoint=entrypoint,
        metadata={"ray_test_456": "456"},
        runtime_env={"env_vars": {"RAY_TEST_123": "123"}},
    )

    def wait_for_job_to_succeed():
        snapshot = _get_snapshot(address)["data"]["snapshot"]
        allowed_statuses = {"PENDING", "RUNNING", "SUCCEEDED"}
        legacy_job_succeeded = False
        job_succeeded = False

        # Legacy job snapshot (one driver per job).
        for legacy_entry in snapshot["jobs"].values():
            legacy_status = legacy_entry["status"]
            if legacy_status is None:
                continue
            submission_id = legacy_entry["config"]["metadata"][
                "jobSubmissionId"]
            assert submission_id == job_id
            assert legacy_status in allowed_statuses
            assert legacy_entry["statusMessage"] is not None
            legacy_job_succeeded = legacy_status == "SUCCEEDED"

        # New jobs snapshot (0 to N drivers per job).
        submissions = snapshot["jobSubmission"]
        assert submissions
        for entry in submissions.values():
            status = entry["status"]
            if status is None:
                continue
            assert entry["jobSubmissionId"] == job_id
            assert entry["entrypoint"] == entrypoint
            assert status in allowed_statuses
            assert entry["message"] is not None
            # TODO(architkulkarni): Disable automatic camelcase.
            assert entry["runtimeEnv"] == {"envVars": {"RAYTest123": "123"}}
            assert entry["metadata"] == {"rayTest456": "456"}
            assert entry["errorType"] is None
            # The local start time is scaled by 1000 to compare with startTime.
            assert abs(entry["startTime"] - start_time_s * 1000) <= 2000
            if status == "SUCCEEDED":
                job_succeeded = True
                # The job cannot have ended before its sleep elapsed.
                earliest_end = entry["startTime"] + job_sleep_time_s * 1000
                assert entry["endTime"] >= earliest_end

        print(f"Legacy job submission succeeded: {legacy_job_succeeded}")
        print(f"Job submission succeeded: {job_succeeded}")
        return legacy_job_succeeded and job_succeeded

    wait_for_condition(wait_for_job_to_succeed, timeout=45)