示例#1
0
 def run(self):
     """Poll every known raylet for its node stats, once per second.

     Each pass sleeps one second, calls ``GetNodeStats`` (2 second
     timeout) on the stub of every node in ``self.nodes`` and, only if
     all of those calls succeeded, stores the replies in
     ``self._raylet_stats`` while holding ``self._raylet_stats_lock``.
     An exception raised while collecting or storing replies is logged
     and the loop moves on to the next pass. The loop has no exit
     condition.
     """
     counter = 0
     while True:
         time.sleep(1.0)
         # Collect into a local dict first so the lock is only held for
         # the short copy below, not for the duration of the RPCs.
         replies = {}
         try:
             for node in self.nodes:
                 node_id = node["NodeID"]
                 stub = self.stubs[node_id]
                 reply = stub.GetNodeStats(
                     node_manager_pb2.GetNodeStatsRequest(
                         include_memory_info=self.include_memory_info),
                     timeout=2)
                 reply_dict = MessageToDict(reply)
                 reply_dict["nodeId"] = node_id
                 replies[node["NodeManagerAddress"]] = reply_dict
             with self._raylet_stats_lock:
                 for address, reply_dict in replies.items():
                     self._raylet_stats[address] = reply_dict
         except Exception:
             # logger.exception() already appends the active traceback, so
             # the message itself does not need to contain it.
             logger.exception("Error updating raylet stats.")
         finally:
             counter += 1
             # From time to time, check if new nodes have joined the cluster
             # and update self.nodes. The previous `if counter % 10:` was
             # truthy on nine out of every ten passes; refresh on every
             # tenth pass instead.
             # NOTE(review): this assumes self.nodes is already populated
             # before run() starts - confirm against the constructor.
             if counter % 10 == 0:
                 self._update_nodes()
示例#2
0
def node_stats(node_manager_address=None,
               node_manager_port=None,
               include_memory_info=True):
    """Returns NodeStats object describing memory usage in the cluster.

    Args:
        node_manager_address: Host of the raylet to query. Required, even
            though the default is None.
        node_manager_port: Port of the raylet to query. Required, even
            though the default is None.
        include_memory_info: Forwarded as ``include_memory_info`` on the
            ``GetNodeStatsRequest``.

    Returns:
        The reply of the raylet's ``GetNodeStats`` RPC.

    Raises:
        AssertionError: If the address or the port is None.
    """

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    assert (node_manager_address is not None and node_manager_port is not None)
    raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
    # Use the channel as a context manager so that it is closed once the
    # reply has been received instead of being leaked on every call.
    with grpc.insecure_channel(
            raylet_address,
            options=[
                ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
                ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
            ],
    ) as channel:
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        # The RPC is given up to 30 seconds to complete.
        reply = stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest(
                include_memory_info=include_memory_info),
            timeout=30.0)
    return reply
示例#3
0
 def try_get_node_stats(num_retry=5, timeout=2):
     """Call ``stub.GetNodeStats`` until it succeeds, then return the reply.

     At most ``num_retry`` attempts are made, each with the given
     ``timeout``. A ``grpc.RpcError`` triggers an immediate retry (there
     is no pause between attempts); any other exception propagates.

     Raises:
         AssertionError: If no attempt produced a reply.
     """
     stats = None
     for _attempt in range(num_retry):
         try:
             stats = stub.GetNodeStats(
                 node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
         except grpc.RpcError:
             # This attempt failed; fall through to the next one.
             pass
         else:
             # First successful call wins.
             break
     assert stats is not None
     return stats
示例#4
0
def get_workers():
    """Return the non-driver entries of the first node's core worker stats.

    Connects to the raylet of the first node listed by ``ray.nodes()``,
    issues a ``GetNodeStats`` request and keeps every element of
    ``core_workers_stats`` whose ``worker_type`` is not
    ``common_pb2.DRIVER``.
    """
    first_node = ray.nodes()[0]
    address = "{}:{}".format(first_node["NodeManagerAddress"],
                             first_node["NodeManagerPort"])
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
        grpc.insecure_channel(address))
    reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
    return [
        stats for stats in reply.core_workers_stats
        if stats.worker_type != common_pb2.DRIVER
    ]
示例#5
0
def get_num_workers():
    """Count the non-driver workers reported by the first node's raylet.

    Connects to the raylet of the first node listed by ``ray.nodes()``,
    issues a ``GetNodeStats`` request and counts the elements of
    ``workers_stats`` whose ``is_driver`` flag is falsy.
    """
    first_node = ray.nodes()[0]
    address = "{}:{}".format(first_node["NodeManagerAddress"],
                             first_node["NodeManagerPort"])
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
        grpc.insecure_channel(address))
    reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
    return sum(1 for stats in reply.workers_stats if not stats.is_driver)
示例#6
0
 async def _update_node_stats(self):
     """Refresh ``DataSource.node_stats`` for every node that is ALIVE.

     For each ``(ip, stub)`` pair in ``self._stubs`` whose entry in
     ``DataSource.nodes`` has state ``"ALIVE"``, awaits ``GetNodeStats``
     (2 second timeout), converts the reply with ``node_stats_to_dict``
     and stores it under the same key. A failure for one node is logged
     and does not stop the remaining nodes from being updated.
     """
     # Iterate over a snapshot: the await below yields to the event loop,
     # and a dict that is resized while it is being iterated raises
     # RuntimeError.
     for ip, stub in list(self._stubs.items()):
         node_info = DataSource.nodes.get(ip)
         # .get() yields None when there is no entry for this key; skip it
         # rather than failing on the subscript and aborting the whole
         # pass.
         if node_info is None or node_info["state"] != "ALIVE":
             continue
         try:
             reply = await stub.GetNodeStats(
                 node_manager_pb2.GetNodeStatsRequest(), timeout=2)
             reply_dict = node_stats_to_dict(reply)
             DataSource.node_stats[ip] = reply_dict
         except Exception:
             logger.exception(f"Error updating node stats of {ip}.")
示例#7
0
 async def _update_node_stats(self):
     """Refresh ``DataSource.node_stats`` for every node that is ALIVE.

     For each ``(node_id, stub)`` pair in ``self._stubs`` whose entry in
     ``DataSource.nodes`` has state ``"ALIVE"``, awaits ``GetNodeStats``
     (2 second timeout, ``include_memory_info`` taken from
     ``self._collect_memory_info``), converts the reply with
     ``node_stats_to_dict`` and stores it under the same key. A failure
     for one node is logged and does not stop the remaining nodes from
     being updated.
     """
     # Iterate over a snapshot: the await below yields to the event loop,
     # and a dict that is resized while it is being iterated raises
     # RuntimeError.
     for node_id, stub in list(self._stubs.items()):
         node_info = DataSource.nodes.get(node_id)
         # .get() yields None when there is no entry for this key; skip it
         # rather than failing on the subscript and aborting the whole
         # pass.
         if node_info is None or node_info["state"] != "ALIVE":
             continue
         try:
             reply = await stub.GetNodeStats(
                 node_manager_pb2.GetNodeStatsRequest(
                     include_memory_info=self._collect_memory_info),
                 timeout=2)
             reply_dict = node_stats_to_dict(reply)
             DataSource.node_stats[node_id] = reply_dict
         except Exception:
             logger.exception(f"Error updating node stats of {node_id}.")
def test_initial_workers(shutdown_only):
    """Wait until the first raylet reports exactly one non-driver worker.

    Starts Ray with one CPU, the dashboard and the
    ``enable_multi_tenancy`` internal config flag, then hands a check on
    the raylet's ``GetNodeStats`` reply to ``wait_for_condition`` with
    ``timeout=10``.
    """
    # `num_cpus` should be <=2 because a Travis CI machine only has 2 CPU cores
    internal_config = json.dumps({"enable_multi_tenancy": True})
    ray.init(
        num_cpus=1,
        include_dashboard=True,
        _internal_config=internal_config)

    first_node = ray.nodes()[0]
    address = "{}:{}".format(first_node["NodeManagerAddress"],
                             first_node["NodeManagerPort"])
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
        grpc.insecure_channel(address))

    def has_single_non_driver_worker():
        # A fresh request is sent on every evaluation of the condition.
        reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
        non_drivers = [
            stats for stats in reply.workers_stats if not stats.is_driver
        ]
        return len(non_drivers) == 1

    wait_for_condition(has_single_non_driver_worker, timeout=10)
示例#9
0
 def run(self):
     """Poll every known raylet for its node stats, once per second.

     Each pass sleeps one second, calls ``GetNodeStats`` (2 second
     timeout) on the stub of every node in ``self.nodes`` and then
     stores the replies, converted with ``MessageToDict``, in
     ``self._raylet_stats`` while holding ``self._raylet_stats_lock``.
     Nothing here catches exceptions, so a failing RPC propagates out of
     the loop. The loop has no exit condition otherwise.
     """
     counter = 0
     while True:
         time.sleep(1.0)
         # Collect into a local dict first so the lock is only held for
         # the short conversion/copy below, not for the RPCs.
         replies = {}
         for node in self.nodes:
             node_id = node["NodeID"]
             stub = self.stubs[node_id]
             reply = stub.GetNodeStats(
                 node_manager_pb2.GetNodeStatsRequest(), timeout=2)
             replies[node["NodeManagerAddress"]] = reply
         with self._raylet_stats_lock:
             for address, reply in replies.items():
                 self._raylet_stats[address] = MessageToDict(reply)
         counter += 1
         # From time to time, check if new nodes have joined the cluster
         # and update self.nodes. The previous `if counter % 10:` was
         # truthy on nine out of every ten passes; refresh on every tenth
         # pass instead.
         # NOTE(review): this assumes self.nodes is already populated
         # before run() starts - confirm against the constructor.
         if counter % 10 == 0:
             self.update_nodes()
def stat(address):
    """Get the current metrics protobuf from a Ray cluster (developer tool)."""
    # Fall back to services.find_redis_address_or_die() when no address
    # was supplied.
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        # Pair each raylet's address with its *own* port. The port used to
        # be read from ray.nodes()[0] for every raylet, which targets the
        # wrong endpoint whenever the ports differ between nodes.
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))

        # Close each channel once its reply has arrived instead of leaking
        # one channel per raylet.
        with grpc.insecure_channel(raylet_address) as channel:
            stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
            reply = stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest(), timeout=2.0)
        print(reply)
示例#11
0
def stat(address):
    """Get the current metrics protobuf from a Ray cluster (developer tool)."""
    # Fall back to services.find_redis_address_or_die() when no address
    # was supplied.
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        # Pair each raylet's address with its *own* port. The port used to
        # be read from ray.nodes()[0] for every raylet, which targets the
        # wrong endpoint whenever the ports differ between nodes.
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))

        # Close each channel once its reply has arrived instead of leaking
        # one channel per raylet.
        with grpc.insecure_channel(raylet_address) as channel:
            stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
            # Memory info is explicitly left out of this request.
            reply = stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest(
                    include_memory_info=False),
                timeout=2.0)
        print(reply)