def node_stats(node_manager_address=None,
               node_manager_port=None,
               include_memory_info=True):
    """Returns NodeStats object describing memory usage in the cluster."""
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    assert node_manager_address is not None and node_manager_port is not None
    raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
    channel = utils.init_grpc_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )

    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    node_stats = stub.GetNodeStats(
        node_manager_pb2.GetNodeStatsRequest(
            include_memory_info=include_memory_info),
        timeout=30.0,
    )
    return node_stats
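A minimal usage sketch for node_stats above. The address and port are placeholders, and iterating core_workers_stats on the reply mirrors the GetNodeStats usage in the snippets below.

# Hedged example: "127.0.0.1" and 55555 stand in for a live raylet's
# NodeManagerAddress and NodeManagerPort (e.g. taken from ray.nodes()[0]).
stats = node_stats(node_manager_address="127.0.0.1", node_manager_port=55555)
for worker in stats.core_workers_stats:
    print(worker.worker_type, worker.pid)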
async def _connect_to_dashboard(self):
    """Connect to the dashboard. If the dashboard is not started, then
    this method will never return.

    Returns:
        The ReportEventServiceStub object.
    """
    while True:
        try:
            # TODO: Use async version if performance is an issue
            dashboard_rpc_address = internal_kv._internal_kv_get(
                dashboard_consts.DASHBOARD_RPC_ADDRESS,
                namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
            )
            if dashboard_rpc_address:
                logger.info("Report events to %s", dashboard_rpc_address)
                options = ray_constants.GLOBAL_GRPC_OPTIONS
                channel = utils.init_grpc_channel(
                    dashboard_rpc_address, options=options, asynchronous=True)
                return event_pb2_grpc.ReportEventServiceStub(channel)
        except Exception:
            logger.exception("Connect to dashboard failed.")
        await asyncio.sleep(
            event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS)
def __init__(self, address):
    from ray._private.utils import init_grpc_channel
    logger.debug(f"Connecting to gcs address: {address}")
    options = [
        ("grpc.enable_http_proxy", 0),
        ("grpc.max_send_message_length", GcsClient.MAX_MESSAGE_LENGTH),
        ("grpc.max_receive_message_length", GcsClient.MAX_MESSAGE_LENGTH),
    ]
    channel = init_grpc_channel(address, options=options)
    self._kv_stub = gcs_service_pb2_grpc.InternalKVGcsServiceStub(channel)
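A hedged sketch of constructing this client. The address is a placeholder, and MAX_MESSAGE_LENGTH is assumed to be defined as a class attribute on GcsClient.

# Hypothetical GCS address; a real one comes from the running cluster.
client = GcsClient("127.0.0.1:6379")
# The internal KV stub is now ready to issue InternalKV RPCs.
assert client._kv_stub is not None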
def get_workers():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = init_grpc_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return [
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).core_workers_stats
        if worker.worker_type != common_pb2.DRIVER
    ]
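For illustration, a trivial call to get_workers; by construction the returned list should contain no driver entries.

workers = get_workers()
assert all(w.worker_type != common_pb2.DRIVER for w in workers)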
def create_gcs_channel(address: str, aio=False):
    """Returns a GRPC channel to GCS.

    Args:
        address: GCS address string, e.g. "ip:port".
        aio: Whether to use grpc.aio.

    Returns:
        grpc.Channel or grpc.aio.Channel to GCS.
    """
    from ray._private.utils import init_grpc_channel
    return init_grpc_channel(address, options=_GRPC_OPTIONS, asynchronous=aio)
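A short sketch of both modes of create_gcs_channel. The address is a placeholder, and wrapping the channel in InternalKVGcsServiceStub follows the pattern in the GcsClient constructor above.

# Synchronous channel (grpc.Channel).
channel = create_gcs_channel("127.0.0.1:6379")
# Async channel (grpc.aio.Channel), for use inside coroutines.
aio_channel = create_gcs_channel("127.0.0.1:6379", aio=True)
stub = gcs_service_pb2_grpc.InternalKVGcsServiceStub(channel)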
async def _update_stubs(self, change):
    if change.old:
        node_id, _ = change.old
        ip = DataSource.node_id_to_ip[node_id]
        self._stubs.pop(node_id)
        self._ip_to_node_id.pop(ip)
    if change.new:
        node_id, ports = change.new
        ip = DataSource.node_id_to_ip[node_id]
        options = ray_constants.GLOBAL_GRPC_OPTIONS
        channel = init_grpc_channel(
            f"{ip}:{ports[1]}", options=options, asynchronous=True)
        stub = reporter_pb2_grpc.LogServiceStub(channel)
        self._stubs[node_id] = stub
        self._ip_to_node_id[ip] = node_id
async def _update_stubs(self, change):
    if change.old:
        node_id, _ = change.old
        ip = DataSource.node_id_to_ip[node_id]
        self._stubs.pop(node_id)
        self._ip_to_node_id.pop(ip)
    if change.new:
        node_id, ports = change.new
        ip = DataSource.node_id_to_ip[node_id]
        options = (("grpc.enable_http_proxy", 0), )
        channel = init_grpc_channel(
            f"{ip}:{ports[1]}", options=options, asynchronous=True)
        stub = reporter_pb2_grpc.LogServiceStub(channel)
        self._stubs[node_id] = stub
        self._ip_to_node_id[ip] = node_id
async def _connect_to_dashboard(self):
    """Connect to the dashboard. If the dashboard is not started, then
    this method will never return.

    Returns:
        The ReportEventServiceStub object.
    """
    while True:
        try:
            aioredis = self._dashboard_agent.aioredis_client
            dashboard_rpc_address = await aioredis.get(
                dashboard_consts.REDIS_KEY_DASHBOARD_RPC)
            if dashboard_rpc_address:
                logger.info("Report events to %s", dashboard_rpc_address)
                options = (("grpc.enable_http_proxy", 0), )
                channel = utils.init_grpc_channel(
                    dashboard_rpc_address, options=options, asynchronous=True)
                return event_pb2_grpc.ReportEventServiceStub(channel)
        except Exception:
            logger.exception("Connect to dashboard failed.")
        await asyncio.sleep(
            event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS)
def get_store_stats(state, node_manager_address=None, node_manager_port=None):
    """Returns a formatted string describing memory usage in the cluster."""
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info; that Raylet internally
    # asks all nodes in the cluster for memory stats.
    if node_manager_address is None or node_manager_port is None:
        # We should ask a raylet that is alive.
        raylet = None
        for node in state.node_table():
            if node["Alive"]:
                raylet = node
                break
        assert raylet is not None, "Every raylet is dead"
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)
    channel = utils.init_grpc_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )

    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(
            include_memory_info=False),
        timeout=30.0,
    )
    return store_stats_summary(reply)
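A hedged invocation sketch for get_store_stats; `state` is assumed to be the object whose node_table() the function consults, and the explicit address/port variant bypasses that lookup with placeholder values.

# Either let the function pick a live raylet from `state`...
summary = get_store_stats(state)
# ...or point it at a specific raylet (placeholder address/port).
summary = get_store_stats(state, node_manager_address="127.0.0.1",
                          node_manager_port=55555)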
def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_dashboard=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = init_grpc_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [
        worker for worker in reply.core_workers_stats
        if worker.worker_type == common_pb2.DRIVER
    ]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.worker.show_in_dashboard("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    # Test show_in_dashboard for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert stats.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_dashboard for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    if _WIN32:
        timeout_seconds = 40
    else:
        timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.core_workers_stats) < num_cpus + 2:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.core_workers_stats) == num_cpus + 2

        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.core_workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "mini" in process
                    or "conda" in process or "travis" in process
                    or "runner" in process or "pytest" in process
                    or "ray" in process), process
        break