def test_multi_node_metrics_export_port_discovery(ray_start_cluster):
    """Match every reported node to the metrics export port it was given.

    Adds three nodes to the cluster, looks each entry of ``ray.nodes()`` up
    by its ``MetricsExportPort`` and compares the raylet socket names, then
    issues an HTTP GET against that port on localhost. The outcome of the
    retry helper is not asserted here.
    """
    NUM_NODES = 3
    cluster = ray_start_cluster

    added_nodes = [cluster.add_node() for _ in range(NUM_NODES)]
    # Index the address info of every added node by its metrics export port.
    info_by_port = {}
    for added in added_nodes:
        export_port = added.address_info["metrics_export_port"]
        info_by_port[export_port] = added.address_info

    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    for reported in ray.nodes():
        metrics_export_port = reported["MetricsExportPort"]
        expected = info_by_port[metrics_export_port]
        assert expected["raylet_socket_name"] == reported["RayletSocketName"]

        # Make sure we can ping Prometheus endpoints.
        def ping_prometheus():
            url = "http://localhost:{}".format(metrics_export_port)
            return requests.get(url).status_code == 200

        wait_until_succeeded_without_exception(
            ping_prometheus, (requests.exceptions.ConnectionError, ))
def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    """Poll the dashboard memory table for the expected reference counts.

    Creates one driver-side object and two actors that each hold an object,
    switches memory fetching on through ``/memory/set_fetch`` and then
    retries the summary checks for up to one second. The outcome of the
    retry helper is not asserted here.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    # These locals are never read again, hence the noqa markers.
    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    pending = [actor.get_obj.remote() for actor in actors]
    results = ray.get(pending)  # noqa

    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    fetch_resp = requests.get(
        webui_url + "/memory/set_fetch", params={"shouldFetch": "true"})
    fetch_resp.raise_for_status()

    def check_mem_table():
        payload = requests.get(f"{webui_url}/memory/memory_table").json()
        assert payload["result"]
        summary = payload["data"]["memoryTable"]["summary"]
        # 1 ref per handle and per object the actor has a ref to
        assert summary["totalActorHandles"] == len(actors) * 2
        # 1 ref for my_obj
        assert summary["totalLocalRefCount"] == 1

    wait_until_succeeded_without_exception(
        check_mem_table, (AssertionError, ), timeout_ms=1000)
def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    """Submit a job through the dashboard and wait until it is running.

    The job is POSTed once; the summary and detail endpoints are then
    retried for up to 30 seconds until the job reports actors and workers.
    The last failure is re-raised by the retry helper on timeout.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    job = _prepare_job_for_test(webui_url)
    # Remove the "job" directory next to the session directory, if present.
    session_parent = os.path.dirname(ray_start_with_dashboard["session_dir"])
    job_root_dir = os.path.join(session_parent, "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_id = None
    job_submitted = False

    def _get_ok(url):
        # GET `url` and return the decoded body once it reports success.
        resp = requests.get(url)
        resp.raise_for_status()
        body = resp.json()
        assert body["result"] is True, resp.text
        return body

    def _submit_then_check():
        nonlocal job_id
        nonlocal job_submitted

        # Submit only once, even when the checks below are retried.
        if not job_submitted:
            post_resp = requests.post(f"{webui_url}/jobs", json=job)
            post_resp.raise_for_status()
            posted = post_resp.json()
            assert posted["result"] is True, post_resp.text
            job_submitted = True

        summary = _get_ok(f"{webui_url}/jobs?view=summary")["data"]["summary"]
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id when POST /jobs
        # The larger job id is the one we submitted.
        ordered_ids = sorted(entry["jobId"] for entry in summary)
        job_id = ordered_ids[1]

        detail = _get_ok(f"{webui_url}/jobs/{job_id}")["data"]["detail"]
        assert detail["jobInfo"]["jobId"] == job_id

        # The detail endpoint is queried a second time for the liveness,
        # actor and worker checks.
        detail = _get_ok(f"{webui_url}/jobs/{job_id}")["data"]["detail"]
        assert detail["jobInfo"]["isDead"] is False
        job_actors = detail["jobActors"]
        job_workers = detail["jobWorkers"]
        assert len(job_actors) > 0
        assert len(job_workers) > 0

    wait_until_succeeded_without_exception(
        _submit_then_check,
        exceptions=(AssertionError, KeyError, IndexError),
        timeout_ms=30 * 1000,
        raise_last_ex=True)
def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    """Check that actor stdout shows up in the dashboard node logs.

    Two actors print 4 and 1 lines respectively. The ``/node_logs`` endpoint
    is then queried for the whole node and for the first actor's pid only,
    and the number of log entries per pid is compared with what was printed.
    """
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)

    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            for i in range(n):
                print(f"On number {i}")

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    # Pids are kept as strings because they are used as JSON object keys.
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert isinstance(node_logs["data"]["logs"], dict)
        assert all(
            pid in node_logs["data"]["logs"] for pid in (la_pid, la2_pid))
        assert len(node_logs["data"]["logs"][la2_pid]) == 1

        actor_one_logs_response = requests.get(
            f"{webui_url}/node_logs", params={
                "ip": node_ip,
                "pid": la_pid
            })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert isinstance(actor_one_logs["data"]["logs"], dict)
        assert len(actor_one_logs["data"]["logs"][la_pid]) == 4

    # The exception filter must be a real tuple: `(AssertionError)` is just
    # the class itself, unlike every other call to this helper in the file.
    # NOTE(review): the helper's result is not asserted, so a timeout passes
    # silently -- confirm whether that is intended.
    wait_until_succeeded_without_exception(
        check_logs, (AssertionError, ), timeout_ms=1000)
def test_pending_actor(ray_addresses):
    """Query ``/api/raylet_info`` and expect exactly one reported actor.

    NOTE(review): this block looks like an incomplete fragment. The original
    text does not show whether the trailing retry call belongs inside this
    function or after it; it is laid out inside the body here -- confirm.
    """
    # NOTE(review): `addresses` is not defined anywhere in this block while
    # the parameter is called `ray_addresses` -- presumably one of the two
    # names is wrong or an enclosing scope is missing; verify.
    assert (wait_until_server_available(addresses["webui_url"]) is True)
    # Rewrite a "localhost" address into an explicit http:// URL.
    webui_url = ray_addresses["webui_url"].replace("localhost",
                                                   "http://127.0.0.1")
    raylet_info = requests.get(webui_url + "/api/raylet_info").json()
    actor_info = raylet_info["result"]["actors"]
    assert len(actor_info) == 1
    # The popped entry is bound but not inspected further in this block.
    _, infeasible_actor_info = actor_info.popitem()
    # Hands this very function to the retry helper, with `addresses` as its
    # positional argument.
    wait_until_succeeded_without_exception(
        test_pending_actor,
        (AssertionError, requests.exceptions.ConnectionError),
        addresses,
        timeout_ms=30000,
        retry_interval_ms=1000)
def test_raylet_infeasible_tasks(shutdown_only):
    """Check that an actor which can never be scheduled is infeasible.

    This test creates an actor that requires 5 GPUs while the Ray cluster
    only has 3 GPUs. As a result, the new actor should be reported as an
    infeasible actor by ``/api/raylet_info``.
    """
    addresses = ray.init(num_gpus=3)

    @ray.remote(num_gpus=5)
    class ActorRequiringGPU:
        def __init__(self):
            pass

    ActorRequiringGPU.remote()

    def test_infeasible_actor(ray_addresses):
        # Read everything from the argument supplied by the retry helper
        # instead of mixing it with the enclosing `addresses` variable.
        raw_url = ray_addresses["webui_url"]
        assert wait_until_server_available(raw_url) is True
        webui_url = raw_url.replace("localhost", "http://127.0.0.1")
        raylet_info = requests.get(webui_url + "/api/raylet_info").json()
        actor_info = raylet_info["result"]["actors"]
        assert len(actor_info) == 1

        _, infeasible_actor_info = actor_info.popitem()
        assert infeasible_actor_info["state"] == -1
        assert infeasible_actor_info["invalidStateType"] == "infeasibleActor"

    # Retry for up to 30 seconds, once per second, while the check still
    # fails or the dashboard is not reachable yet.
    assert (wait_until_succeeded_without_exception(
        test_infeasible_actor,
        (AssertionError, requests.exceptions.ConnectionError),
        addresses,
        timeout_ms=30000,
        retry_interval_ms=1000) is True)
def test_get_cluster_status(ray_start_with_dashboard):
    """Exercise ``/api/cluster_status`` before and after Redis is populated.

    First the endpoint must answer with empty autoscaling fields; then the
    two debug keys are written to Redis and the endpoint must echo them.
    """
    address_info = ray_start_with_dashboard
    assert (wait_until_server_available(address_info["webui_url"]) is True)
    webui_url = format_web_url(address_info["webui_url"])
    status_url = f"{webui_url}/api/cluster_status"

    # Check that the cluster_status endpoint works without the underlying
    # data from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(status_url)
        response.raise_for_status()
        payload = response.json()
        print(payload)
        assert payload["result"]
        data = payload["data"]
        assert "autoscalingStatus" in data
        assert data["autoscalingStatus"] is None
        assert "autoscalingError" in data
        assert data["autoscalingError"] is None
        assert "clusterStatus" in data
        assert "loadMetricsReport" in data["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    host_and_port = address_info["redis_address"].split(":")
    assert len(host_and_port) == 2
    redis_host, redis_port = host_and_port
    client = redis.StrictRedis(
        host=redis_host,
        port=int(redis_port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)
    client.hset(DEBUG_AUTOSCALING_STATUS_LEGACY, "value", "hello")
    client.hset(DEBUG_AUTOSCALING_ERROR, "value", "world")

    response = requests.get(status_url)
    response.raise_for_status()
    payload = response.json()
    assert payload["result"]
    data = payload["data"]
    assert "autoscalingStatus" in data
    assert data["autoscalingStatus"] == "hello"
    assert "autoscalingError" in data
    assert data["autoscalingError"] == "world"
    assert "clusterStatus" in data
    assert "loadMetricsReport" in data["clusterStatus"]
def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    """Check that an actor's exception shows up in the dashboard errors.

    One actor raises a ``ValueError``. The ``/node_logs`` endpoint is then
    queried for the whole node and for that actor's pid only, and the
    ``errors`` section of each response is inspected.
    """
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)

    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor:
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    # Resolve the remote call to the actual pid. The unresolved object ref
    # returned by `.remote()` can never match a pid key in the response.
    # The pid is kept as a string because it is used as a JSON object key.
    ea_pid = str(ray.get(ea.get_pid.remote()))
    # Fire and forget: the call is only made for the error it produces.
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        assert isinstance(node_errs["data"]["errors"], dict)
        assert ea_pid in node_errs["data"]["errors"]
        assert len(node_errs["data"]["errors"][ea_pid]) == 1

        actor_err_response = requests.get(
            f"{webui_url}/node_logs", params={
                "ip": node_ip,
                "pid": ea_pid
            })
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        assert isinstance(actor_errs["data"]["errors"], dict)
        # NOTE(review): 4 entries are expected here although only one error
        # is raised and the unfiltered query above expects 1 for the same
        # pid -- presumably copied from the log test; confirm the count.
        assert len(actor_errs["data"]["errors"][ea_pid]) == 4

    # The exception filter must be a real tuple: `(AssertionError)` is just
    # the class itself, unlike every other call to this helper in the file.
    # NOTE(review): the helper's result is not asserted, so a timeout passes
    # silently -- confirm whether that is intended.
    wait_until_succeeded_without_exception(
        check_errs, (AssertionError, ), timeout_ms=1000)
def test_raylet_pending_tasks(shutdown_only):
    """Check that an actor waiting for a GPU is reported as pending.

    A parent actor spawns four actors that need one GPU each while only
    three GPUs exist, so exactly one child is expected to be reported with
    the ``pendingActor`` state by ``/api/raylet_info``.
    """
    # Make sure to specify num_cpus. Otherwise, the test can be broken
    # when the number of cores is less than the number of spawned actors.
    addresses = ray.init(num_gpus=3, num_cpus=4)

    @ray.remote(num_gpus=1)
    class ActorRequiringGPU:
        def __init__(self):
            pass

    @ray.remote
    class ParentActor:
        def __init__(self):
            self.a = [ActorRequiringGPU.remote() for i in range(4)]

    # If we do not get ParentActor actor handler, reference counter will
    # terminate ParentActor.
    parent_actor = ParentActor.remote()
    assert parent_actor is not None

    def test_pending_actor(ray_addresses):
        # Read everything from the argument supplied by the retry helper
        # instead of mixing it with the enclosing `addresses` variable.
        raw_url = ray_addresses["webui_url"]
        assert wait_until_server_available(raw_url) is True
        webui_url = raw_url.replace("localhost", "http://127.0.0.1")
        raylet_info = requests.get(webui_url + "/api/raylet_info").json()
        actor_info = raylet_info["result"]["actors"]
        assert len(actor_info) == 1
        _, parent_actor_info = actor_info.popitem()

        # Verify there are 4 spawned actors.
        children = parent_actor_info["children"]
        assert len(children) == 4

        pending_actor_detected = sum(
            1 for child in children.values()
            if child.get("invalidStateType") == "pendingActor")
        # 4 GPUActors are spawned although there are only 3 GPUs.
        # One actor should be in the pending state.
        assert pending_actor_detected == 1

    # Retry for up to 30 seconds, once per second, while the check still
    # fails or the dashboard is not reachable yet.
    assert (wait_until_succeeded_without_exception(
        test_pending_actor,
        (AssertionError, requests.exceptions.ConnectionError),
        addresses,
        timeout_ms=30000,
        retry_interval_ms=1000) is True)