def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    entrypoint_cmd = ('python -c"'
                      "import ray;"
                      "ray.init();"
                      "import time;"
                      f"time.sleep({job_sleep_time_s});"
                      "import sys;"
                      "sys.exit(1);"
                      '"')
    start_time_s = int(time.time())
    client = JobSubmissionClient(address)
    runtime_env = {"env_vars": {"RAY_TEST_456": "456"}}
    metadata = {"ray_test_789": "789"}
    job_id = client.submit_job(entrypoint=entrypoint_cmd,
                               metadata=metadata,
                               runtime_env=runtime_env)

    def wait_for_job_to_fail():
        data = _get_snapshot(address)

        legacy_job_failed = False
        job_failed = False

        # Test legacy job snapshot (one driver per job).
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert job_entry["statusMessage"] is not None
                legacy_job_failed = job_entry["status"] == "FAILED"

        # Test new jobs snapshot (0 to N drivers per job).
        for job_submission_id, entry in data["data"]["snapshot"][
                "jobSubmission"].items():
            if entry["status"] is not None:
                assert entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert entry["message"] is not None
                # TODO(architkulkarni): Disable automatic camelcase.
                assert entry["runtimeEnv"] == {"envVars": {"RAYTest456": "456"}}
                assert entry["metadata"] == {"rayTest789": "789"}
                assert entry["errorType"] is None
                assert abs(entry["startTime"] - start_time_s) <= 2
                if entry["status"] == "FAILED":
                    job_failed = True
                    assert entry[
                        "endTime"] >= entry["startTime"] + job_sleep_time_s

        return legacy_job_failed and job_failed

    wait_for_condition(wait_for_job_to_fail, timeout=10)

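# The snapshot tests in this file rely on a `_get_snapshot` helper that is not
# shown here. Below is a minimal sketch, assuming the dashboard's /api/snapshot
# endpoint and the `requests` library; the real helper may perform additional
# validation.
import requests


def _get_snapshot(address: str) -> dict:
    # The dashboard wraps its payload as {"result": ..., "msg": ..., "data": ...}.
    response = requests.get(f"{address}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    assert data["result"] is True, data["msg"]
    return data
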
def test_ray_tune_basic(job_sdk_client: JobSubmissionClient):
    run_cmd = "python ray_tune_basic.py"
    job_id = job_sdk_client.submit_job(
        entrypoint=run_cmd,
        runtime_env={"working_dir": DRIVER_SCRIPT_DIR},
    )
    wait_for_condition(_check_job_succeeded,
                       timeout=30,
                       client=job_sdk_client,
                       job_id=job_id)

def test_per_task_runtime_env(job_sdk_client: JobSubmissionClient):
    run_cmd = "python per_task_runtime_env.py"
    job_id = job_sdk_client.submit_job(
        entrypoint=run_cmd,
        runtime_env={"working_dir": DRIVER_SCRIPT_DIR},
    )
    wait_for_condition(_check_job_succeeded,
                       client=job_sdk_client,
                       job_id=job_id)

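# Both tests above pass a `_check_job_succeeded` predicate to
# wait_for_condition. Below is a minimal sketch of such a predicate, assuming
# JobStatus from ray.job_submission and that get_job_status returns a
# JobStatus; the helper actually used by the test suite may differ.
from ray.job_submission import JobStatus


def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool:
    status = client.get_job_status(job_id)
    if status == JobStatus.FAILED:
        logs = client.get_job_logs(job_id)
        raise RuntimeError(f"Job failed with logs:\n{logs}")
    return status == JobStatus.SUCCEEDED
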
def test_cli_apis_sanity_check(ray_start_cluster):
    """Test all of CLI APIs work as expected."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    for _ in range(3):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()
    client = JobSubmissionClient(
        f"http://{ray._private.worker.global_worker.node.address_info['webui_url']}"
    )

    @ray.remote
    def f():
        import time

        time.sleep(30)

    @ray.remote
    class Actor:
        pass

    obj = ray.put(3)  # noqa
    task = f.remote()  # noqa
    actor = Actor.remote()  # noqa
    actor_runtime_env = Actor.options(  # noqa
        runtime_env={"pip": ["requests"]}).remote()
    job_id = client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls",
    )
    pg = ray.util.placement_group(bundles=[{"CPU": 1}])  # noqa

    def verify_output(resource_name, necessary_substrings: List[str]):
        result = runner.invoke(cli_list, [resource_name])
        exit_code_correct = result.exit_code == 0
        substring_matched = all(substr in result.output
                                for substr in necessary_substrings)
        print(result.output)
        return exit_code_correct and substring_matched

    wait_for_condition(lambda: verify_output("actors", ["actor_id"]))
    wait_for_condition(lambda: verify_output("workers", ["worker_id"]))
    wait_for_condition(lambda: verify_output("nodes", ["node_id"]))
    wait_for_condition(
        lambda: verify_output("placement-groups", ["placement_group_id"]))
    wait_for_condition(lambda: verify_output("jobs", ["raysubmit"]))
    wait_for_condition(lambda: verify_output("tasks", ["task_id"]))
    wait_for_condition(lambda: verify_output("objects", ["object_id"]))
    wait_for_condition(lambda: verify_output("runtime-envs", ["runtime_env"]))

def test_list_jobs(shutdown_only):
    ray.init()
    client = JobSubmissionClient(
        f"http://{ray.worker.global_worker.node.address_info['webui_url']}")
    job_id = client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls",
    )

    def verify():
        job_data = list(list_jobs().values())[0]
        job_id_from_api = list(list_jobs().keys())[0]
        correct_state = job_data["status"] == "SUCCEEDED"
        correct_id = job_id == job_id_from_api
        return correct_state and correct_id

    wait_for_condition(verify)
    print(list_jobs())

def ray_job_submit(
    script_name: str,
    head_service: str,
    k8s_namespace: str = "default",
    ray_dashboard_port: int = 8265,
) -> str:
    """Submits a Python script via the Ray Job Submission API, using the Python SDK.

    Waits for successful completion of the job and returns the job logs as a
    string. Uses `kubectl port-forward` to access the Ray head's dashboard port.
    Scripts live in `tests/kuberay/scripts`. This directory is used as the
    working dir for the job.

    Args:
        script_name: The name of the script to submit.
        head_service: The name of the Ray head K8s service.
        k8s_namespace: K8s namespace the Ray cluster belongs to.
        ray_dashboard_port: The port on which the Ray head is running the Ray
            dashboard.
    """
    with _kubectl_port_forward(service=head_service,
                               namespace=k8s_namespace,
                               target_port=ray_dashboard_port) as local_port:
        # It takes a bit of time to establish the connection.
        # Try a few times to instantiate the JobSubmissionClient, as the
        # client's instantiation does not retry on connection errors.
        for trie in range(1, 7):
            time.sleep(5)
            try:
                client = JobSubmissionClient(f"http://127.0.0.1:{local_port}")
                break
            except ConnectionError as e:
                if trie < 6:
                    logger.info(
                        "Job client connection failed. Retrying in 5 seconds.")
                else:
                    raise e from None
        job_id = client.submit_job(
            entrypoint=f"python {script_name}",
            runtime_env={
                "working_dir": SCRIPTS_DIR,
                # Throw in some extra data for fun, to validate runtime envs.
                "pip": ["pytest==6.0.0"],
                "env_vars": {"key_foo": "value_bar"},
            },
        )
        # Wait for the job to complete successfully.
        # This logic is copied from the Job Submission docs.
        start = time.time()
        timeout = 60
        while time.time() - start <= timeout:
            status = client.get_job_status(job_id)
            print(f"status: {status}")
            if status in {
                    JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED
            }:
                break
            time.sleep(5)
        assert status == JobStatus.SUCCEEDED
        return client.get_job_logs(job_id)

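# ray_job_submit above depends on a `_kubectl_port_forward` context manager.
# Below is a minimal sketch, assuming a plain `kubectl port-forward`
# subprocess and a fixed local port; the real helper may choose a free local
# port and wait for the forward to become ready.
import contextlib
import subprocess


@contextlib.contextmanager
def _kubectl_port_forward(service: str, namespace: str, target_port: int,
                          local_port: int = 8265):
    proc = subprocess.Popen([
        "kubectl", "-n", namespace, "port-forward", f"service/{service}",
        f"{local_port}:{target_port}"
    ])
    try:
        yield local_port
    finally:
        proc.terminate()
        proc.wait()
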
async def test_state_data_source_client(ray_start_cluster):
    cluster = ray_start_cluster
    # head
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    # worker
    worker = cluster.add_node(num_cpus=2)

    GRPC_CHANNEL_OPTIONS = (
        ("grpc.enable_http_proxy", 0),
        ("grpc.max_send_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
    )
    gcs_channel = ray._private.utils.init_grpc_channel(cluster.address,
                                                       GRPC_CHANNEL_OPTIONS,
                                                       asynchronous=True)
    client = StateDataSourceClient(gcs_channel)

    """
    Test actor
    """
    result = await client.get_all_actor_info()
    assert isinstance(result, GetAllActorInfoReply)

    """
    Test placement group
    """
    result = await client.get_all_placement_group_info()
    assert isinstance(result, GetAllPlacementGroupReply)

    """
    Test node
    """
    result = await client.get_all_node_info()
    assert isinstance(result, GetAllNodeInfoReply)

    """
    Test worker info
    """
    result = await client.get_all_worker_info()
    assert isinstance(result, GetAllWorkerInfoReply)

    """
    Test job
    """
    job_client = JobSubmissionClient(
        f"http://{ray.worker.global_worker.node.address_info['webui_url']}")
    job_id = job_client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls",
    )
    result = client.get_job_info()
    assert list(result.keys())[0] == job_id
    assert isinstance(result, dict)

    """
    Test tasks
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_task_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_task_info(node_id)
        assert isinstance(result, GetTasksInfoReply)
    assert len(client.get_all_registered_raylet_ids()) == 2

    """
    Test objects
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_object_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_object_info(node_id)
        assert isinstance(result, GetNodeStatsReply)

    """
    Test the exception is raised when the RPC error occurs.
    """
    cluster.remove_node(worker)
    # Wait until the dead node information is propagated.
    wait_for_condition(lambda: len(
        list(filter(lambda node: node["Alive"], ray.nodes()))) == 1)
    for node in ray.nodes():
        node_id = node["NodeID"]
        if node["Alive"]:
            continue
        # Querying the dead node raises a gRPC error, which should be
        # translated into `StateSourceNetworkException`.
        with pytest.raises(StateSourceNetworkException):
            result = await client.get_object_info(node_id)
        # Make sure the unregister API works as expected.
        client.unregister_raylet_client(node_id)
        assert len(client.get_all_registered_raylet_ids()) == 1
        # Since the node_id is unregistered, the API should raise ValueError.
        with pytest.raises(ValueError):
            result = await client.get_object_info(node_id)

async def test_state_data_source_client(ray_start_cluster):
    cluster = ray_start_cluster
    # head
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    # worker
    worker = cluster.add_node(num_cpus=2)

    GRPC_CHANNEL_OPTIONS = (
        *ray_constants.GLOBAL_GRPC_OPTIONS,
        ("grpc.max_send_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
    )
    gcs_channel = ray._private.utils.init_grpc_channel(cluster.address,
                                                       GRPC_CHANNEL_OPTIONS,
                                                       asynchronous=True)
    client = StateDataSourceClient(gcs_channel)

    """
    Test actor
    """
    result = await client.get_all_actor_info()
    assert isinstance(result, GetAllActorInfoReply)

    """
    Test placement group
    """
    result = await client.get_all_placement_group_info()
    assert isinstance(result, GetAllPlacementGroupReply)

    """
    Test node
    """
    result = await client.get_all_node_info()
    assert isinstance(result, GetAllNodeInfoReply)

    """
    Test worker info
    """
    result = await client.get_all_worker_info()
    assert isinstance(result, GetAllWorkerInfoReply)

    """
    Test job
    """
    job_client = JobSubmissionClient(
        f"http://{ray._private.worker.global_worker.node.address_info['webui_url']}"
    )
    job_id = job_client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls",
    )
    result = client.get_job_info()
    assert list(result.keys())[0] == job_id
    assert isinstance(result, dict)

    """
    Test tasks
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_task_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_task_info(node_id)
        assert isinstance(result, GetTasksInfoReply)
    assert len(client.get_all_registered_raylet_ids()) == 2

    """
    Test objects
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_object_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_object_info(node_id)
        assert isinstance(result, GetNodeStatsReply)

    """
    Test runtime env
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_runtime_envs_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}"

        def get_port():
            return ray.experimental.internal_kv._internal_kv_get(
                key, namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        wait_for_condition(lambda: get_port() is not None)
        # The second index is the gRPC port.
        port = json.loads(get_port())[1]
        ip = node["NodeManagerAddress"]
        client.register_agent_client(node_id, ip, port)
        result = await client.get_runtime_envs_info(node_id)
        assert isinstance(result, GetRuntimeEnvsInfoReply)

    """
    Test logs
    """
    with pytest.raises(ValueError):
        result = await client.list_logs("1234", "*")
    with pytest.raises(ValueError):
        result = await client.stream_log("1234", "raylet.out", True, 100, 1, 5)

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    # The node information should've been registered in the previous section.
    for node in ray.nodes():
        node_id = node["NodeID"]
        result = await client.list_logs(node_id, timeout=30, glob_filter="*")
        assert isinstance(result, ListLogsReply)

        stream = await client.stream_log(node_id, "raylet.out", False, 10, 1, 5)
        async for logs in stream:
            log_lines = len(logs.data.decode().split("\n"))
            assert isinstance(logs, StreamLogReply)
            assert log_lines >= 10
            assert log_lines <= 11

    """
    Test the exception is raised when the RPC error occurs.
    """
    cluster.remove_node(worker)
    # Wait until the dead node information is propagated.
    wait_for_condition(lambda: len(
        list(filter(lambda node: node["Alive"], ray.nodes()))) == 1)
    for node in ray.nodes():
        node_id = node["NodeID"]
        if node["Alive"]:
            continue
        # Querying the dead node raises a gRPC error, which should surface as
        # DataSourceUnavailable.
        with pytest.raises(DataSourceUnavailable):
            await client.get_object_info(node_id)
        # Make sure the unregister API works as expected.
        client.unregister_raylet_client(node_id)
        assert len(client.get_all_registered_raylet_ids()) == 1
        # Since the node_id is unregistered, the API should raise ValueError.
        with pytest.raises(ValueError):
            result = await client.get_object_info(node_id)

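# For reference, a hedged sketch of how a caller could collect the streamed
# chunks from the same StateDataSourceClient.stream_log call exercised above
# into a single string (same positional arguments; StreamLogReply.data is
# assumed to be raw bytes, as in the assertions above).
async def read_raylet_log_tail(client, node_id: str) -> str:
    chunks = []
    stream = await client.stream_log(node_id, "raylet.out", False, 10, 1, 5)
    async for reply in stream:
        chunks.append(reply.data.decode())
    return "".join(chunks)
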
def test_successful_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                               enable_test_module, address_suffix):
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    entrypoint = ('python -c"'
                  "import ray;"
                  "ray.init();"
                  "import time;"
                  f"time.sleep({job_sleep_time_s});"
                  '"')

    client = JobSubmissionClient(address + address_suffix)
    start_time_s = int(time.time())
    runtime_env = {"env_vars": {"RAY_TEST_123": "123"}}
    metadata = {"ray_test_456": "456"}
    job_id = client.submit_job(entrypoint=entrypoint,
                               metadata=metadata,
                               runtime_env=runtime_env)

    def wait_for_job_to_succeed():
        data = _get_snapshot(address)
        legacy_job_succeeded = False
        job_succeeded = False

        # Test legacy job snapshot (one driver per job).
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {
                    "PENDING", "RUNNING", "SUCCEEDED"
                }
                assert job_entry["statusMessage"] is not None
                legacy_job_succeeded = job_entry["status"] == "SUCCEEDED"

        # Test new jobs snapshot (0 to N drivers per job).
        assert data["data"]["snapshot"]["jobSubmission"]
        for job_submission_id, entry in data["data"]["snapshot"][
                "jobSubmission"].items():
            if entry["status"] is not None:
                assert entry["jobSubmissionId"] == job_id
                assert entry["entrypoint"] == entrypoint
                assert entry["status"] in {"PENDING", "RUNNING", "SUCCEEDED"}
                assert entry["message"] is not None
                # TODO(architkulkarni): Disable automatic camelcase.
                assert entry["runtimeEnv"] == {"envVars": {"RAYTest123": "123"}}
                assert entry["metadata"] == {"rayTest456": "456"}
                assert entry["errorType"] is None
                assert abs(entry["startTime"] - start_time_s * 1000) <= 2000
                if entry["status"] == "SUCCEEDED":
                    job_succeeded = True
                    assert (entry["endTime"] >=
                            entry["startTime"] + job_sleep_time_s * 1000)

        print(f"Legacy job submission succeeded: {legacy_job_succeeded}")
        print(f"Job submission succeeded: {job_succeeded}")
        return legacy_job_succeeded and job_succeeded

    wait_for_condition(wait_for_job_to_succeed, timeout=45)