def run(self):
    """Poll GetNodeStats from every known raylet, once per second, forever.

    Each reply is converted to a dict (with the node id attached) and
    cached in ``self._raylet_stats`` keyed by the node's
    NodeManagerAddress. Every 10th iteration the node list is refreshed
    so raylets that joined the cluster after startup are picked up.
    """
    counter = 0
    while True:
        time.sleep(1.0)
        replies = {}
        try:
            for node in self.nodes:
                node_id = node["NodeID"]
                stub = self.stubs[node_id]
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(
                        include_memory_info=self.include_memory_info),
                    timeout=2)
                reply_dict = MessageToDict(reply)
                reply_dict["nodeId"] = node_id
                replies[node["NodeManagerAddress"]] = reply_dict
            # Publish all replies under the lock so readers never see a
            # partially updated snapshot.
            with self._raylet_stats_lock:
                for address, reply_dict in replies.items():
                    self._raylet_stats[address] = reply_dict
        except Exception:
            # Best-effort polling loop: log the failure (logger.exception
            # already appends the current traceback) and keep running.
            logger.exception("Error collecting raylet stats.")
        finally:
            counter += 1
            # From time to time, check if new nodes have joined the cluster
            # and update self.nodes. BUGFIX: the original `if counter % 10:`
            # refreshed on 9 out of every 10 iterations.
            if counter % 10 == 0:
                self._update_nodes()
def node_stats(node_manager_address=None,
               node_manager_port=None,
               include_memory_info=True):
    """Returns NodeStats object describing memory usage in the cluster."""
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    assert (node_manager_address is not None
            and node_manager_port is not None)
    raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
    grpc_options = [
        ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
        ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
    ]
    channel = grpc.insecure_channel(raylet_address, options=grpc_options)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    request = node_manager_pb2.GetNodeStatsRequest(
        include_memory_info=include_memory_info)
    # Long timeout: collecting memory info cluster-wide can be slow.
    return stub.GetNodeStats(request, timeout=30.0)
def try_get_node_stats(num_retry=5, timeout=2):
    """Fetch node stats, tolerating up to ``num_retry`` transient RPC errors."""
    reply = None
    attempts = 0
    while reply is None and attempts < num_retry:
        attempts += 1
        try:
            reply = stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
        except grpc.RpcError:
            # Transient failure; retry until the attempt budget runs out.
            pass
    assert reply is not None
    return reply
def get_workers():
    """Return core-worker stats for all non-driver workers on the first raylet."""
    head = ray.nodes()[0]
    raylet_address = "{}:{}".format(head["NodeManagerAddress"],
                                    head["NodeManagerPort"])
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
        grpc.insecure_channel(raylet_address))
    stats = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
    workers = []
    for worker in stats.core_workers_stats:
        # Exclude the driver process itself.
        if worker.worker_type != common_pb2.DRIVER:
            workers.append(worker)
    return workers
def get_num_workers():
    """Return the number of non-driver workers on the first raylet."""
    head = ray.nodes()[0]
    raylet_address = "{}:{}".format(head["NodeManagerAddress"],
                                    head["NodeManagerPort"])
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
        grpc.insecure_channel(raylet_address))
    stats = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
    # Count every worker that is not the driver.
    return sum(1 for worker in stats.workers_stats if not worker.is_driver)
async def _update_node_stats(self):
    """Refresh ``DataSource.node_stats`` for every alive node with a stub.

    Nodes that are unknown to ``DataSource.nodes`` or not in the ALIVE
    state are skipped. RPC failures are logged and do not abort the loop.
    """
    for ip, stub in self._stubs.items():
        node_info = DataSource.nodes.get(ip)
        # BUGFIX: .get() returns None for unknown nodes; subscripting None
        # here raised TypeError outside the try block below.
        if node_info is None or node_info["state"] != "ALIVE":
            continue
        try:
            reply = await stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest(), timeout=2)
            reply_dict = node_stats_to_dict(reply)
            DataSource.node_stats[ip] = reply_dict
        except Exception:
            # Best-effort: keep polling the remaining nodes.
            logger.exception(f"Error updating node stats of {ip}.")
async def _update_node_stats(self):
    """Refresh ``DataSource.node_stats`` for every alive node with a stub.

    Memory info is included in the request when
    ``self._collect_memory_info`` is set. Nodes unknown to
    ``DataSource.nodes`` or not ALIVE are skipped; RPC failures are
    logged and do not abort the loop.
    """
    for node_id, stub in self._stubs.items():
        node_info = DataSource.nodes.get(node_id)
        # BUGFIX: .get() returns None for unknown nodes; subscripting None
        # here raised TypeError outside the try block below.
        if node_info is None or node_info["state"] != "ALIVE":
            continue
        try:
            reply = await stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest(
                    include_memory_info=self._collect_memory_info),
                timeout=2)
            reply_dict = node_stats_to_dict(reply)
            DataSource.node_stats[node_id] = reply_dict
        except Exception:
            # Best-effort: keep polling the remaining nodes.
            logger.exception(f"Error updating node stats of {node_id}.")
def test_initial_workers(shutdown_only):
    """Check that exactly one non-driver worker starts with num_cpus=1."""
    # `num_cpus` should be <=2 because a Travis CI machine only has 2 CPU cores
    ray.init(
        num_cpus=1,
        include_dashboard=True,
        _internal_config=json.dumps({"enable_multi_tenancy": True}))
    head = ray.nodes()[0]
    raylet_address = "{}:{}".format(head["NodeManagerAddress"],
                                    head["NodeManagerPort"])
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
        grpc.insecure_channel(raylet_address))

    def _num_non_driver_workers():
        stats = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
        return len(
            [w for w in stats.workers_stats if not w.is_driver])

    wait_for_condition(lambda: _num_non_driver_workers() == 1, timeout=10)
def run(self):
    """Poll GetNodeStats from every known raylet, once per second, forever.

    Replies are cached (as dicts) in ``self._raylet_stats`` keyed by the
    node's NodeManagerAddress. Every 10th iteration the node list is
    refreshed so raylets that joined after startup are picked up.
    """
    counter = 0
    while True:
        time.sleep(1.0)
        replies = {}
        for node in self.nodes:
            node_id = node["NodeID"]
            stub = self.stubs[node_id]
            reply = stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest(), timeout=2)
            replies[node["NodeManagerAddress"]] = reply
        # Publish all replies under the lock so readers never see a
        # partially updated snapshot.
        with self._raylet_stats_lock:
            for address, reply in replies.items():
                self._raylet_stats[address] = MessageToDict(reply)
        counter += 1
        # From time to time, check if new nodes have joined the cluster
        # and update self.nodes. BUGFIX: the original `if counter % 10:`
        # refreshed on 9 out of every 10 iterations.
        if counter % 10 == 0:
            self.update_nodes()
def stat(address):
    """Print the GetNodeStats reply from every raylet in the cluster.

    Args:
        address: Redis address of the cluster; auto-discovered when falsy.
    """
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc
    for raylet in ray.nodes():
        # BUGFIX: use this raylet's own port; the original used
        # ray.nodes()[0]["NodeManagerPort"], querying every node on the
        # first node's port.
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))
        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest(), timeout=2.0)
        print(reply)
def stat(address):
    """Get the current metrics protobuf from a Ray cluster (developer tool)."""
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc
    for raylet in ray.nodes():
        # BUGFIX: use this raylet's own port; the original used
        # ray.nodes()[0]["NodeManagerPort"], querying every node on the
        # first node's port.
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))
        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        # Memory info is skipped here to keep the developer query fast.
        reply = stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
            timeout=2.0)
        print(reply)