def test_basic(ray_start_with_dashboard):
    """Exercise the dashboard process, its agent, and its Redis keys.

    Starting from a Ray cluster that runs a dashboard server, this test
    checks that the dashboard process is healthy, that a killed agent is
    replaced by new agent processes among the raylet's children, that a
    freshly started agent keeps the same pid for several seconds, and that
    the dashboard-related keys are present in Redis.
    """
    cluster_info = ray_start_with_dashboard
    assert wait_until_server_available(cluster_info["webui_url"]) is True
    node_id = cluster_info["node_id"]
    redis_endpoint = cluster_info["redis_address"].split(":")
    assert len(redis_endpoint) == 2
    redis_host, redis_port = redis_endpoint

    redis_client = redis.StrictRedis(
        host=redis_host,
        port=int(redis_port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)

    processes_by_type = ray.worker._global_node.all_processes
    assert ray_constants.PROCESS_TYPE_DASHBOARD in processes_by_type
    assert ray_constants.PROCESS_TYPE_REPORTER not in processes_by_type

    healthy_states = [psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING]
    dashboard_info = processes_by_type[ray_constants.PROCESS_TYPE_DASHBOARD][0]
    dashboard_proc = psutil.Process(dashboard_info.process.pid)
    assert dashboard_proc.status() in healthy_states
    raylet_info = processes_by_type[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_info.process.pid)

    def _find_agent(candidates):
        # Return the first process whose command line mentions the agent
        # script; None when no candidate matches.
        for candidate in candidates:
            try:
                if any("new_dashboard/agent.py" in arg
                       for arg in candidate.cmdline()):
                    return candidate
            except Exception:
                # Best effort: skip a process whose command line cannot be
                # read and keep scanning the remaining candidates.
                continue
        return None

    def _current_agent():
        # Look the agent up among the raylet's current child processes.
        return _find_agent(raylet_proc.children())

    # Test for bad imports, the agent should be restarted.
    logger.info("Test for bad imports.")
    agent_proc = _current_agent()
    prepare_test_files()
    seen_agent_pids = set()
    try:
        assert agent_proc is not None
        agent_proc.kill()
        agent_proc.wait()
        # The agent will be restarted for imports failure; sample the agent
        # pid 40 times at 0.1 second intervals and collect the distinct pids.
        for _ in range(40):
            agent_proc = _current_agent()
            if agent_proc:
                seen_agent_pids.add(agent_proc.pid)
            time.sleep(0.1)
    finally:
        cleanup_test_files()
    assert len(seen_agent_pids) > 1, seen_agent_pids

    # Kill whichever agent is left over from the bad-imports phase.
    agent_proc = _current_agent()
    if agent_proc:
        agent_proc.kill()
        agent_proc.wait()

    logger.info("Test agent register is OK.")
    wait_for_condition(lambda: _current_agent())
    assert dashboard_proc.status() in healthy_states
    expected_agent_pid = _current_agent().pid

    # Check if agent register is OK: the pid must stay the same while polled.
    for _ in range(5):
        logger.info("Check agent is alive.")
        agent_proc = _current_agent()
        assert agent_proc.pid == expected_agent_pid
        time.sleep(1)

    # Check redis keys are set.
    logger.info("Check redis keys are set.")
    assert redis_client.get(dashboard_consts.REDIS_KEY_DASHBOARD) is not None
    assert redis_client.get(
        dashboard_consts.REDIS_KEY_DASHBOARD_RPC) is not None
    agent_port_key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}"
    assert redis_client.get(agent_port_key) is not None
def test_raylet_info_endpoint(shutdown_only):
    """Check `/api/raylet_info` and the profiling endpoints of the dashboard.

    Creates one parent actor (ActorC) that owns two child actors and a set
    of object references, polls `/api/raylet_info` until the reported actor
    tree matches the expected counts, then launches profiling for the parent
    actor's pid and polls `/api/check_profiling_status` until it reports
    "finished" or "error".

    Raises:
        Exception: if the dashboard or the expected actor info does not
            become available within 30 seconds.
        RayTestTimeoutException: if profiling does not finish within
            30 seconds.
    """
    addresses = ray.init(include_webui=True, num_cpus=6)

    @ray.remote
    def f():
        return "test"

    @ray.remote(num_cpus=1)
    class ActorA:
        def __init__(self):
            pass

    @ray.remote(resources={"CustomResource": 1})
    class ActorB:
        def __init__(self):
            pass

    @ray.remote(num_cpus=2)
    class ActorC:
        def __init__(self):
            self.children = [ActorA.remote(), ActorB.remote()]

        def local_store(self):
            self.local_storage = [f.remote() for _ in range(10)]

        def remote_store(self):
            self.remote_storage = ray.put("test")

        def getpid(self):
            return os.getpid()

    c = ActorC.remote()
    actor_pid = ray.get(c.getpid.remote())
    c.local_store.remote()
    c.remote_store.remote()

    assert wait_until_server_available(addresses["webui_url"]) is True

    # The URL does not change between attempts, so compute it once.
    webui_url = addresses["webui_url"].replace("localhost",
                                               "http://127.0.0.1")

    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/api/raylet_info")
            response.raise_for_status()
            try:
                raylet_info = response.json()
            except Exception:
                # Show the undecodable payload, then re-raise unchanged.
                print("failed response: {}".format(response.text))
                raise
            actor_info = raylet_info["result"]["actors"]
            try:
                assert len(actor_info) == 1
                _, parent_actor_info = actor_info.popitem()
                assert parent_actor_info["numObjectIdsInScope"] == 13
                assert parent_actor_info["numLocalObjects"] == 10
                children = parent_actor_info["children"]
                assert len(children) == 2
                break
            except AssertionError:
                # The info may simply not be up to date yet; retry until the
                # 30 second budget is exhausted.
                if time.time() > start_time + 30:
                    raise Exception(
                        "Timed out while waiting for actor info "
                        "or object store info update.")
        except requests.exceptions.ConnectionError:
            if time.time() > start_time + 30:
                raise Exception(
                    "Timed out while waiting for dashboard to start.")

    assert parent_actor_info["usedResources"]["CPU"] == 2
    assert parent_actor_info["numExecutedTasks"] == 4
    for child_actor_info in children.values():
        if child_actor_info["state"] == -1:
            # State -1 entries are only checked for their resource request.
            assert child_actor_info["requiredResources"][
                "CustomResource"] == 1
        else:
            assert child_actor_info["state"] == 1
            assert len(child_actor_info["children"]) == 0
            assert child_actor_info["usedResources"]["CPU"] == 1

    profiling_id = requests.get(
        webui_url + "/api/launch_profiling",
        params={
            "node_id": ray.nodes()[0]["NodeID"],
            "pid": actor_pid,
            "duration": 5
        }).json()["result"]
    start_time = time.time()
    while True:
        # Sometimes some startup time is required
        if time.time() - start_time > 30:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats.")
        profiling_info = requests.get(
            webui_url + "/api/check_profiling_status",
            params={
                "profiling_id": profiling_id,
            }).json()
        status = profiling_info["result"]["status"]
        assert status in ("finished", "pending", "error")
        if status in ("finished", "error"):
            break
        time.sleep(1)
def test_node_info(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the `/nodes` endpoints (host name list, node detail, summary).

    Two actors are started so that the node detail can be compared against
    their pids. The three endpoints are polled once per second until every
    assertion passes.

    Raises:
        Exception: if an attempt fails after `timeout_seconds` have elapsed;
            the message carries the traceback of that last failure.
    """

    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote(), Actor.remote()]
    actor_pids = set(ray.get([actor.getpid.remote() for actor in actors]))

    assert wait_until_server_available(
        ray_start_with_dashboard["webui_url"]) is True
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    deadline = time.time() + timeout_seconds
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/nodes?view=hostnamelist")
            response.raise_for_status()
            hostname_resp = response.json()
            assert hostname_resp["result"] is True, hostname_resp["msg"]
            hostname_list = hostname_resp["data"]["hostNameList"]
            assert len(hostname_list) == 1

            hostname = hostname_list[0]
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            detail = response.json()
            assert detail["result"] is True, detail["msg"]
            detail = detail["data"]["detail"]
            assert detail["hostname"] == hostname
            assert detail["raylet"]["state"] == "ALIVE"
            assert "raylet" in detail["cmdline"][0]
            assert len(detail["workers"]) >= 2
            assert len(detail["actors"]) == 2, detail["actors"]
            assert len(detail["raylet"]["viewData"]) > 0

            # The workers whose command line is tagged "ray::Actor" must be
            # exactly the two actors created above.
            actor_worker_pids = {
                worker["pid"]
                for worker in detail["workers"]
                if "ray::Actor" in worker["cmdline"][0]
            }
            assert actor_worker_pids == actor_pids

            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            assert len(summary["data"]["summary"]) == 1
            summary = summary["data"]["summary"][0]
            assert summary["hostname"] == hostname
            assert summary["raylet"]["state"] == "ALIVE"
            assert "raylet" in summary["cmdline"][0]
            assert "workers" not in summary
            assert "actors" not in summary
            assert "viewData" not in summary["raylet"]
            break
        except Exception as ex:
            # Only a *failed* attempt can time out. Checking the deadline in
            # a `finally` clause would override the `break` of an attempt
            # that succeeded just after the deadline.
            if time.time() > deadline:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the actor table messages published on the Redis pubsub channel.

    Subscribes to the actor pubsub pattern, creates and then kills a dummy
    actor, and verifies both the number of published messages and the set
    of fields each message carries once converted to a dict.
    """
    wait_budget = 5
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    redis_endpoint = ray_start_with_dashboard["redis_address"].split(":")
    assert len(redis_endpoint) == 2
    redis_host, redis_port = redis_endpoint

    redis_client = redis.StrictRedis(
        host=redis_host,
        port=int(redis_port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)

    subscription = redis_client.pubsub(ignore_subscribe_messages=True)
    subscription.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN)

    @ray.remote
    class DummyActor:
        def __init__(self):
            pass

    # Create a dummy actor.
    a = DummyActor.remote()

    def handle_pub_messages(client, msgs, timeout, expect_num):
        # Append decoded actor table messages to `msgs` until it holds
        # `expect_num` entries or `timeout` seconds have passed.
        began = time.time()
        while time.time() - began < timeout and len(msgs) < expect_num:
            raw = client.get_message()
            if raw is not None:
                envelope = ray.gcs_utils.PubSubMessage.FromString(raw["data"])
                msgs.append(
                    ray.gcs_utils.ActorTableData.FromString(envelope.data))
            else:
                # Nothing available yet; back off briefly before polling.
                time.sleep(0.01)

    received = []
    handle_pub_messages(subscription, received, wait_budget, 2)
    # Assert we received published actor messages with state
    # DEPENDENCIES_UNREADY and ALIVE.
    assert len(received) == 2

    # Kill actor.
    ray.kill(a)
    handle_pub_messages(subscription, received, wait_budget, 3)

    # Assert we received published actor messages with state DEAD.
    assert len(received) == 3

    id_fields = {
        "actorId", "parentId", "jobId", "workerId", "rayletId",
        "actorCreationDummyObjectId", "callerId", "taskId", "parentTaskId",
        "sourceActorId", "placementGroupId"
    }

    def actor_table_data_to_dict(message):
        return dashboard_utils.message_to_dict(
            message, id_fields, including_default_value_fields=False)

    non_state_keys = ("actorId", "jobId", "taskSpec")
    state_only_keys = {"state", "address", "timestamp", "pid"}
    for actor_msg in received:
        as_dict = actor_table_data_to_dict(actor_msg)
        # DEPENDENCIES_UNREADY is 0, which would not be kept in the dict, so
        # the original protobuf value is checked instead.
        if actor_msg.state == 0:
            assert len(as_dict) > 5
            assert all(key in as_dict for key in non_state_keys)
            continue
        # For a state other than DEPENDENCIES_UNREADY, only the state fields
        # are published.
        if as_dict["state"] in ("ALIVE", "DEAD"):
            assert as_dict.keys() == state_only_keys
            continue
        raise Exception("Unknown state: {}".format(as_dict["state"]))
def test_memory_dashboard(shutdown_only):
    """Test the dashboard memory table.

    These tests verify the examples in this document:
    https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory

    Each nested ``test_*`` function creates a particular kind of object
    reference, fetches ``/api/memory_table`` from the dashboard, checks the
    counters in the table's "summary", and finally requests
    ``/api/stop_memory_table``.
    """
    addresses = ray.init(num_cpus=2)
    webui_url = addresses["webui_url"].replace("localhost", "http://127.0.0.1")
    assert (wait_until_server_available(addresses["webui_url"]) is True)

    def get_memory_table():
        # Fetch the memory table and return the "result" field of the JSON
        # reply.
        memory_table = requests.get(webui_url + "/api/memory_table").json()
        return memory_table["result"]

    def memory_table_ready():
        """Return True when the table's "group" differs from the last one seen.

        The previously seen "group" is kept in the module-level name
        ``prev_memory_table`` and is overwritten on every call.
        """
        # NOTE(review): `prev_memory_table` is not assigned anywhere in this
        # function's enclosing scope; presumably it is initialised at module
        # level elsewhere in the file -- confirm, otherwise the comparison
        # below raises NameError on the first call.
        global prev_memory_table
        memory_table = get_memory_table()
        is_ready = memory_table["group"] != prev_memory_table
        prev_memory_table = memory_table["group"]
        return is_ready

    def stop_memory_table():
        # Hit the stop endpoint; the decoded reply is discarded.
        requests.get(webui_url + "/api/stop_memory_table").json()

    def test_local_reference():
        @ray.remote
        def f(arg):
            return arg

        # a and b are local references. They are intentionally unused: they
        # only need to stay alive while the memory table is inspected.
        a = ray.put(None)  # Noqa F841
        b = f.remote(None)  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 2
        # Every entry of every group must be a local reference.
        for table in group.values():
            for entry in table["entries"]:
                assert (
                    entry["reference_type"] == ReferenceType.LOCAL_REFERENCE)
        stop_memory_table()
        return True

    def test_object_pineed_in_memory():
        import numpy as np
        a = ray.put(np.zeros(1))
        # Keep the fetched value in `b` and drop the reference `a`; the
        # summary below is then expected to count one pinned object and no
        # local references.
        b = ray.get(a)  # Noqa F841
        del a

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 1
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 0
        # Every entry of every group must be pinned in memory.
        for table in group.values():
            for entry in table["entries"]:
                assert (
                    entry["reference_type"] == ReferenceType.PINNED_IN_MEMORY)
        stop_memory_table()
        return True

    def test_pending_task_references():
        @ray.remote
        def f(arg):
            # Sleep so that the task is still running when the table is read.
            time.sleep(1)

        # `a` is passed directly as a task argument.
        a = ray.put(None)  # Noqa F841
        b = f.remote(a)  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 1
        assert summary["total_used_by_pending_task"] == 1
        assert summary["total_local_ref_count"] == 1
        # Make sure the function f is done before going to the next test.
        # Otherwise, the memory table will be corrupted because the
        # task f won't be done when the next test is running.
        ray.get(b)
        stop_memory_table()
        return True

    def test_serialized_object_id_reference():
        @ray.remote
        def f(arg):
            # Sleep so that the task is still running when the table is read.
            time.sleep(1)

        # Unlike the previous test, `a` is passed wrapped in a list.
        a = ray.put(None)  # Noqa F841
        b = f.remote([a])  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 1
        assert summary["total_local_ref_count"] == 2
        # Make sure the function f is done before going to the next test.
        # Otherwise, the memory table will be corrupted because the
        # task f won't be done when the next test is running.
        ray.get(b)
        stop_memory_table()
        return True

    def test_captured_object_id_reference():
        a = ray.put(None)
        # `b` is an object that contains the reference `a`; the local name
        # `a` is then dropped so only `b` remains as a local reference.
        b = ray.put([a])  # Noqa F841
        del a

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 1
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 1
        stop_memory_table()
        return True

    def test_actor_handle_reference():
        @ray.remote
        class Actor:
            pass

        # Three handles, intentionally unused, kept alive for the check.
        a = Actor.remote()  # Noqa F841
        b = Actor.remote()  # Noqa F841
        c = Actor.remote()  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 0
        assert summary["total_actor_handles"] == 3
        # Every entry of every group must be an actor handle.
        for table in group.values():
            for entry in table["entries"]:
                assert (entry["reference_type"] == ReferenceType.ACTOR_HANDLE)
        stop_memory_table()
        return True

    # These tests should be retried because it takes at least one second
    # to get a fresh memory table: the memory table is updated whenever
    # raylet and node info is renewed, which takes 1 second.
    # NOTE(review): an AssertionError raised inside a nested test function is
    # not a falsy return value; whether wait_for_condition retries in that
    # case depends on its implementation, which is not visible here.
    # NOTE(review): `timeout=30000` next to `retry_interval_ms=1000` looks
    # like it was meant as milliseconds; if wait_for_condition takes the
    # timeout in seconds this allows more than 8 hours -- confirm the units.
    assert (wait_for_condition(
        test_local_reference, timeout=30000, retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_object_pineed_in_memory,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_pending_task_references,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_serialized_object_id_reference,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_captured_object_id_reference,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_actor_handle_reference,
                               timeout=30000,
                               retry_interval_ms=1000) is True)
def test_aiohttp_cache(enable_test_module, ray_start_with_dashboard):
    """Check the response cache of the dashboard's test HTTP routes.

    Every test route answers with a JSON body carrying a "timestamp", so the
    number of distinct timestamps across a series of requests tells how many
    of those requests produced a fresh response.

    Raises:
        Exception: if the first check still fails after `timeout_seconds`.
    """
    assert wait_until_server_available(
        ray_start_with_dashboard["webui_url"]) is True
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    def _get_timestamp(path):
        # GET `path` on the dashboard and return the response's timestamp.
        response = requests.get(webui_url + path)
        response.raise_for_status()
        return response.json()["data"]["timestamp"]

    timeout_seconds = 5
    deadline = time.time() + timeout_seconds
    value1_timestamps = []
    while True:
        time.sleep(1)
        try:
            for _ in range(10):
                value1_timestamps.append(
                    _get_timestamp("/test/aiohttp_cache/t1?value=1"))
            # Timestamps accumulate across attempts; more than one distinct
            # value means the same URL was answered freshly at least twice.
            assert len(set(value1_timestamps)) > 1
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
            # Only a failed attempt can time out. Checking the deadline in a
            # `finally` clause would override the `break` of an attempt that
            # succeeded just after the deadline.
            if time.time() > deadline:
                raise Exception("Timed out while testing.") from e

    # Ten different sub paths must yield ten distinct timestamps.
    sub_path_timestamps = [
        _get_timestamp(f"/test/aiohttp_cache/tt{x}?value=1")
        for x in range(10)
    ]
    assert len(set(sub_path_timestamps)) == 10

    # The same path with ten different query values must also yield ten
    # distinct timestamps.
    volatile_value_timestamps = [
        _get_timestamp(f"/test/aiohttp_cache/tt?value={x}")
        for x in range(10)
    ]
    assert len(set(volatile_value_timestamps)) == 10

    # A handler that raises is reported as a failed result, not an HTTP
    # error status.
    response = requests.get(webui_url + "/test/aiohttp_cache/raise_exception")
    response.raise_for_status()
    result = response.json()
    assert result["result"] is False
    assert "KeyError" in result["msg"]

    # Four distinct paths requested ten times in total must yield exactly
    # four distinct timestamps.
    volatile_value_timestamps = [
        _get_timestamp(f"/test/aiohttp_cache_lru/tt{x % 4}")
        for x in range(10)
    ]
    assert len(set(volatile_value_timestamps)) == 4

    # Expected outcome for this request order: eight distinct timestamps
    # overall, with values 0 and 3 each answered freshly twice.
    volatile_value_timestamps = []
    data = collections.defaultdict(set)
    for x in [0, 1, 2, 3, 4, 5, 2, 1, 0, 3]:
        timestamp = _get_timestamp(f"/test/aiohttp_cache_lru/t1?value={x}")
        data[x].add(timestamp)
        volatile_value_timestamps.append(timestamp)
    assert len(set(volatile_value_timestamps)) == 8
    assert len(data[3]) == 2
    assert len(data[0]) == 2
def test_worker_stats(shutdown_only):
    """Check the raylet's GetNodeStats reply and the kill_actor endpoint.

    Queries the raylet over gRPC for node stats, verifies the driver entry
    and the `show_in_webui` message for both a remote function and an actor,
    waits for the expected number of worker processes, and finally asks the
    dashboard to kill every non-driver worker and waits for their pids to
    disappear.

    Raises:
        RayTestTimeoutException: if the workers do not start, or are not
            killed, within 20 seconds.
    """
    addresses = ray.init(num_cpus=1, include_webui=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        # Call GetNodeStats, retrying on gRPC errors up to `num_retry` times.
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    expected_display = '{"message": "test", "dtype": "text"}'

    def _assert_message_shown_by(node_stats, expected_pid):
        # Exactly the worker with `expected_pid` may show the test message;
        # every other worker must have an empty display entry.
        target_worker_present = False
        for worker in node_stats.workers_stats:
            display = worker.core_worker_stats.webui_display[""]
            if display == expected_display:
                target_worker_present = True
                assert worker.pid == expected_pid
            else:
                assert display == ""  # Empty proto
        assert target_worker_present

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    _assert_message_shown_by(reply, worker_pid)

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    _assert_message_shown_by(reply, worker_pid)

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = {worker.pid for worker in reply.workers_stats}
        processes = [
            p.info["name"]
            for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "conda" in process
                    or "travis" in process or "runner" in process
                    or "ray" in process)
        break

    # Test kill_actor.
    def actor_killed(pid):
        """Return True when signalling `pid` with signal 0 raises OSError."""
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        return False

    assert wait_until_server_available(addresses["webui_url"]) is True

    webui_url = addresses["webui_url"]
    webui_url = webui_url.replace("localhost", "http://127.0.0.1")
    for worker in reply.workers_stats:
        if worker.is_driver:
            continue
        requests.get(
            webui_url + "/api/kill_actor",
            params={
                "actor_id": ray.utils.binary_to_hex(
                    worker.core_worker_stats.actor_id),
                "ip_address": worker.core_worker_stats.ip_address,
                "port": worker.core_worker_stats.port
            })
    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException("Timed out while killing actors")
        if all(
                actor_killed(worker.pid) for worker in reply.workers_stats
                if not worker.is_driver):
            break
        # Pause between polls instead of spinning on os.kill at full speed.
        time.sleep(0.1)
def test_get_job_info(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the `/jobs` summary and the `/jobs/{job_id}` detail endpoints.

    One actor is created so that the job detail contains exactly one actor
    and one worker. The checks are retried once per second while they fail
    with AssertionError, KeyError or IndexError.

    Raises:
        Exception: if the checks still fail after `timeout_seconds`; the
            message carries the traceback of that last failure.
    """

    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actor = Actor.remote()
    actor_pid = ray.get(actor.getpid.remote())
    actor_id = actor._actor_id.hex()

    assert wait_until_server_available(
        ray_start_with_dashboard["webui_url"]) is True
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    ip = ray._private.services.get_node_ip_address()

    def _check():
        # Run every assertion once against the current dashboard state.
        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_summary = result["data"]["summary"]
        assert len(job_summary) == 1, resp.text
        one_job = job_summary[0]
        assert "jobId" in one_job
        job_id = one_job["jobId"]
        assert ray._raylet.JobID(hex_to_binary(one_job["jobId"]))
        assert "driverIpAddress" in one_job
        assert one_job["driverIpAddress"] == ip
        assert "driverPid" in one_job
        assert one_job["driverPid"] == str(os.getpid())
        assert "config" in one_job
        assert isinstance(one_job["config"], dict)
        assert "isDead" in one_job
        assert one_job["isDead"] is False
        assert "timestamp" in one_job

        one_job_summary_keys = one_job.keys()

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_detail = result["data"]["detail"]
        assert "jobInfo" in job_detail
        # Every key of the summary entry must also appear in the detail.
        assert len(one_job_summary_keys - job_detail["jobInfo"].keys()) == 0
        assert "jobActors" in job_detail
        job_actors = job_detail["jobActors"]
        assert len(job_actors) == 1, resp.text
        one_job_actor = job_actors[actor_id]
        assert "taskSpec" in one_job_actor
        assert isinstance(one_job_actor["taskSpec"], dict)
        assert "functionDescriptor" in one_job_actor["taskSpec"]
        assert isinstance(one_job_actor["taskSpec"]["functionDescriptor"],
                          dict)
        assert "pid" in one_job_actor
        assert one_job_actor["pid"] == actor_pid
        check_actor_keys = [
            "name", "timestamp", "address", "actorId", "jobId", "state"
        ]
        for k in check_actor_keys:
            assert k in one_job_actor
        assert "jobWorkers" in job_detail
        job_workers = job_detail["jobWorkers"]
        assert len(job_workers) == 1, resp.text
        one_job_worker = job_workers[0]
        check_worker_keys = [
            "cmdline", "pid", "cpuTimes", "memoryInfo", "cpuPercent",
            "coreWorkerStats", "language", "jobId"
        ]
        for k in check_worker_keys:
            assert k in one_job_worker

    timeout_seconds = 10
    deadline = time.time() + timeout_seconds
    while True:
        time.sleep(1)
        try:
            _check()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            # Only a failed attempt can time out. Checking the deadline in a
            # `finally` clause would override the `break` of an attempt that
            # succeeded just after the deadline.
            if time.time() > deadline:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
def test_actor_groups(ray_start_with_dashboard):
    """Check the `/logical/actor_groups` endpoint.

    Two `Foo` actors each run one task, and one actor requesting a GPU is
    created as well. The endpoint is polled once per second until the "Foo"
    and "InfeasibleActor" groups carry the expected summary and entries.

    Raises:
        Exception: if an attempt fails after `timeout_seconds` have elapsed;
            the message carries the traceback of that last failure.
    """

    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [actor.do_task.remote() for actor in foo_actors]  # noqa
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    deadline = time.time() + timeout_seconds
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/logical/actor_groups")
            response.raise_for_status()
            actor_groups_resp = response.json()
            assert actor_groups_resp["result"] is True, actor_groups_resp[
                "msg"]
            actor_groups = actor_groups_resp["data"]["actorGroups"]
            assert "Foo" in actor_groups
            summary = actor_groups["Foo"]["summary"]
            # 2 __init__ tasks and 2 do_task tasks
            assert summary["numExecutedTasks"] == 4
            assert summary["stateToCount"]["ALIVE"] == 2

            entries = actor_groups["Foo"]["entries"]
            foo_entry = entries[0]
            assert isinstance(foo_entry["gpus"], list)
            assert "timestamp" in foo_entry
            assert "actorConstructor" in foo_entry
            assert "actorClass" in foo_entry
            assert "actorId" in foo_entry
            assert "ipAddress" in foo_entry
            assert len(entries) == 2
            assert "InfeasibleActor" in actor_groups

            entries = actor_groups["InfeasibleActor"]["entries"]
            assert "requiredResources" in entries[0]
            assert "GPU" in entries[0]["requiredResources"]
            break
        except Exception as ex:
            # Only a failed attempt can time out. Checking the deadline in a
            # `finally` clause would override the `break` of an attempt that
            # succeeded just after the deadline.
            if time.time() > deadline:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
def test_submit_job_validation(ray_start_with_dashboard):
    """Check that malformed job submissions are rejected with clear messages.

    Four invalid payloads are posted to `/jobs`: an unsupported language, a
    missing required field, a value of the wrong type, and an unexpected
    key. Each reply must report a false "result" and a "msg" that names the
    offending part of the payload.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    jobs_url = f"{webui_url}/jobs"

    # Start from a clean job directory next to the session directory.
    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    def _ensure_available_nodes():
        # An empty submission is always rejected; the endpoint counts as
        # usable once the rejection is no longer about missing nodes.
        reply = requests.post(jobs_url)
        reply.raise_for_status()
        body = reply.json()
        assert body["result"] is False
        return "no nodes available" not in body["msg"]

    wait_for_condition(_ensure_available_nodes, timeout=5)

    def _submit_rejected(payload):
        # Post `payload`, require a rejection, and hand back the response
        # together with the rejection message.
        reply = requests.post(jobs_url, json=payload)
        reply.raise_for_status()
        body = reply.json()
        assert body["result"] is False
        return reply, body["msg"]

    # Invalid value.
    resp, msg = _submit_rejected({
        "language": "Unsupported",
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
        "driver_entry": "python_file_name_without_ext",
    })
    assert all(p in msg for p in ["language", "Unsupported"]), resp.text

    # Missing required field.
    resp, msg = _submit_rejected({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
    })
    assert all(p in msg for p in ["missing", "driver_entry"]), resp.text

    # Incorrect value type.
    resp, msg = _submit_rejected({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": ["http://xxx/yyy.zip"]
        },
        "driver_entry": "python_file_name_without_ext",
    })
    assert all(p in msg for p in ["working_dir", "str"]), resp.text

    # Invalid key.
    resp, msg = _submit_rejected({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
        "driver_entry": "python_file_name_without_ext",
        "invalid_key": 1,
    })
    assert all(p in msg for p in ["unexpected", "invalid_key"]), resp.text
def test_log(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the dashboard's log index, log files, and node `logUrl`.

    A remote task prints a marker string. The test then follows the links
    found on `/log_index` and on the node's log page, looks for the marker
    in the worker logs, issues a byte-range request against
    `/logs/dashboard.log`, and checks the `logUrl` field of the node detail.
    Everything is retried once per second until it passes.

    Raises:
        Exception: if an attempt fails after `timeout_seconds` have elapsed;
            the message carries the traceback of that last failure.
    """

    @ray.remote
    def write_log(s):
        print(s)

    test_log_text = "test_log_text"
    ray.get(write_log.remote(test_log_text))
    assert wait_until_server_available(
        ray_start_with_dashboard["webui_url"]) is True
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    deadline = time.time() + timeout_seconds
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/log_index")
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)
            all_nodes_log_urls = parser.get_urls()
            assert len(all_nodes_log_urls) == 1

            response = requests.get(all_nodes_log_urls[0])
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)

            # Search test_log_text from all worker logs. The parsed links
            # are used as paths on the node's log URL.
            parsed_url = urllib.parse.urlparse(all_nodes_log_urls[0])
            urls = [
                parsed_url._replace(path=p).geturl()
                for p in parser.get_urls() if "worker" in p
            ]

            for u in urls:
                response = requests.get(u)
                response.raise_for_status()
                if test_log_text in response.text:
                    break
            else:
                # No worker log contains the marker (yet); this is caught
                # below and retried like any other failure.
                raise Exception(f"Can't find {test_log_text} from {urls}")

            # Test range request.
            response = requests.get(
                webui_url + "/logs/dashboard.log",
                headers={"Range": "bytes=43-51"})
            response.raise_for_status()
            assert response.text == "Dashboard"

            # Test logUrl in node info.
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            node_info = response.json()
            assert node_info["result"] is True
            node_info = node_info["data"]["detail"]
            assert "logUrl" in node_info
            assert node_info["logUrl"] in all_nodes_log_urls
            break
        except Exception as ex:
            # Only a failed attempt can time out. Checking the deadline in a
            # `finally` clause would override the `break` of an attempt that
            # succeeded just after the deadline.
            if time.time() > deadline:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex