示例#1
0
def test_worker_stats(ray_start_regular):
    """Check the worker stats a raylet reports through ``GetNodeStats``.

    The test connects to the first node returned by ``ray.nodes()``, verifies
    that exactly one driver is reported and that it is the current process,
    waits up to 20 seconds for at least one worker per CPU to show up, and
    then checks the reported views and the names of the reported processes.

    Args:
        ray_start_regular: Not used in the body; presumably a pytest fixture
            that starts the Ray cluster -- confirm against conftest.

    Raises:
        RayTestTimeoutException: If too few workers are reported before the
            timeout expires.
    """
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    # Take the port from the node entry we already hold instead of querying
    # ray.nodes() a second time, so address and port always belong together.
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    timeout_seconds = 20
    start_time = time.time()
    # Wait for the workers to start: one per CPU plus the driver.
    while len(reply.workers_stats) < num_cpus + 1:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")
        time.sleep(1)
        reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())

    # Printed so the full reply shows up in the test output.
    print(reply)
    views = [view.view_name for view in reply.view_data]
    assert "redis_latency" in views
    assert "local_available_resource" in views

    # Check that all processes are Python.
    pids = {worker.pid for worker in reply.workers_stats}
    processes = [
        p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
        if p.info["pid"] in pids
    ]
    for process in processes:
        # TODO(ekl): what is with travis/mi when running in Travis?
        assert ("python" in process or "ray" in process
                or "travis/mi" in process)
示例#2
0
 def try_get_node_stats(num_retry=5, timeout=2):
     """Fetch node stats from ``stub``, retrying on gRPC errors.

     Args:
         num_retry: Maximum number of ``GetNodeStats`` attempts.
         timeout: Timeout passed to each ``GetNodeStats`` call.

     Returns:
         The reply of the first call that does not raise ``grpc.RpcError``.
         An assertion fails if every attempt raised.
     """
     stats = None
     for _attempt in range(num_retry):
         try:
             stats = stub.GetNodeStats(
                 node_manager_pb2.NodeStatsRequest(), timeout=timeout)
         except grpc.RpcError:
             # This attempt failed; fall through to the next one.
             pass
         else:
             break
     assert stats is not None
     return stats
示例#3
0
文件: dashboard.py 项目: sytelus/ray
 def run(self):
     """Poll every raylet for its node stats once per second, forever.

     Each reply is converted with ``MessageToDict`` and stored in
     ``self._raylet_stats`` under the node's ``NodeManagerAddress`` while
     ``self._raylet_stats_lock`` is held. Every tenth iteration
     ``self.update_nodes()`` is called. The loop never returns.
     """
     counter = 0
     while True:
         time.sleep(1.0)
         with self._raylet_stats_lock:
             for node, stub in zip(self.nodes, self.stubs):
                 reply = stub.GetNodeStats(
                     node_manager_pb2.NodeStatsRequest())
                 self._raylet_stats[
                     node["NodeManagerAddress"]] = MessageToDict(reply)
         counter += 1
         # From time to time, check if new nodes have joined the cluster
         # and update self.nodes. The explicit ``== 0`` matters: a bare
         # ``counter % 10`` is truthy on nine iterations out of ten.
         if counter % 10 == 0:
             self.update_nodes()
示例#4
0
 def run(self):
     """Poll every raylet for its node stats once per second, forever.

     The replies of one round are first collected without holding the
     lock; afterwards they are converted with ``MessageToDict`` and stored
     in ``self._raylet_stats`` (keyed by ``NodeManagerAddress``) under
     ``self._raylet_stats_lock``. Every tenth iteration
     ``self.update_nodes()`` is called. The loop never returns.
     """
     counter = 0
     while True:
         time.sleep(1.0)
         replies = {}
         for node in self.nodes:
             node_id = node["NodeID"]
             stub = self.stubs[node_id]
             reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
             replies[node["NodeManagerAddress"]] = reply
         # Only the cheap dictionary update happens while the lock is held.
         with self._raylet_stats_lock:
             for address, reply in replies.items():
                 self._raylet_stats[address] = MessageToDict(reply)
         counter += 1
         # From time to time, check if new nodes have joined the cluster
         # and update self.nodes. The explicit ``== 0`` matters: a bare
         # ``counter % 10`` is truthy on nine iterations out of ten.
         if counter % 10 == 0:
             self.update_nodes()
示例#5
0
文件: scripts.py 项目: moazmagdy/ray
def stat(address):
    """Print the ``GetNodeStats`` reply of every raylet in a Ray cluster.

    Args:
        address: Address of the Ray instance to connect to. When falsy, the
            address is obtained from
            ``services.find_redis_address_or_die()``.
    """
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        # Use this raylet's own port. Taking the port from ray.nodes()[0]
        # would query every node on the first node's port.
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))

        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest(),
                                  timeout=2.0)
        print(reply)