Example #1
def node_stats(node_manager_address=None,
               node_manager_port=None,
               include_memory_info=True):
    """Returns NodeStats object describing memory usage in the cluster."""

    from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    assert node_manager_address is not None and node_manager_port is not None
    raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
    channel = utils.init_grpc_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )

    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    node_stats = stub.GetNodeStats(
        node_manager_pb2.GetNodeStatsRequest(
            include_memory_info=include_memory_info),
        timeout=30.0,
    )
    return node_stats
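
A minimal usage sketch (added here, not part of the original source): it assumes a running local cluster and that node_stats() above, together with its module-level dependencies, is in scope. The node manager address and port are read from ray.nodes(), the same fields Examples #4 and #10 use.

import ray

ray.init()
raylet = ray.nodes()[0]
# Query the raylet; core_workers_stats is the field the later examples iterate over.
stats = node_stats(
    node_manager_address=raylet["NodeManagerAddress"],
    node_manager_port=raylet["NodeManagerPort"],
)
print(len(stats.core_workers_stats))
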
Example #2
    async def _connect_to_dashboard(self):
        """Connect to the dashboard. If the dashboard is not started, then
        this method will never returns.

        Returns:
            The ReportEventServiceStub object.
        """
        while True:
            try:
                # TODO: Use async version if performance is an issue
                dashboard_rpc_address = internal_kv._internal_kv_get(
                    dashboard_consts.DASHBOARD_RPC_ADDRESS,
                    namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
                )
                if dashboard_rpc_address:
                    logger.info("Report events to %s", dashboard_rpc_address)
                    options = ray_constants.GLOBAL_GRPC_OPTIONS
                    channel = utils.init_grpc_channel(dashboard_rpc_address,
                                                      options=options,
                                                      asynchronous=True)
                    return event_pb2_grpc.ReportEventServiceStub(channel)
            except Exception:
                logger.exception("Connect to dashboard failed.")
            await asyncio.sleep(
                event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS)
Example #3
    def __init__(self, address):
        from ray._private.utils import init_grpc_channel
        logger.debug(f"Connecting to gcs address: {address}")
        options = [
            ("grpc.enable_http_proxy", 0),
            ("grpc.max_send_message_length", GcsClient.MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", GcsClient.MAX_MESSAGE_LENGTH),
        ]
        channel = init_grpc_channel(address, options=options)
        self._kv_stub = gcs_service_pb2_grpc.InternalKVGcsServiceStub(channel)
Example #4
def get_workers():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = init_grpc_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return [
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).core_workers_stats
        if worker.worker_type != common_pb2.DRIVER
    ]
Example #5
def create_gcs_channel(address: str, aio=False):
    """Returns a GRPC channel to GCS.

    Args:
        address: GCS address string, e.g. ip:port
        aio: Whether to use grpc.aio.

    Returns:
        grpc.Channel or grpc.aio.Channel to GCS
    """
    from ray._private.utils import init_grpc_channel
    return init_grpc_channel(address, options=_GRPC_OPTIONS, asynchronous=aio)
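
A brief usage sketch (an assumption, not from the original file): the returned channel can be wrapped in a generated stub, mirroring what Example #3 does with InternalKVGcsServiceStub. The address below is a placeholder.

from ray.core.generated import gcs_service_pb2_grpc

gcs_address = "127.0.0.1:6379"  # hypothetical ip:port; use the cluster's real GCS address
channel = create_gcs_channel(gcs_address)
kv_stub = gcs_service_pb2_grpc.InternalKVGcsServiceStub(channel)
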
Example #6
File: log_head.py  Project: tchordia/ray
    async def _update_stubs(self, change):
        if change.old:
            node_id, _ = change.old
            ip = DataSource.node_id_to_ip[node_id]
            self._stubs.pop(node_id)
            self._ip_to_node_id.pop(ip)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]

            options = ray_constants.GLOBAL_GRPC_OPTIONS
            channel = init_grpc_channel(f"{ip}:{ports[1]}",
                                        options=options,
                                        asynchronous=True)
            stub = reporter_pb2_grpc.LogServiceStub(channel)
            self._stubs[node_id] = stub
            self._ip_to_node_id[ip] = node_id
Example #7
    async def _update_stubs(self, change):
        if change.old:
            node_id, _ = change.old
            ip = DataSource.node_id_to_ip[node_id]
            self._stubs.pop(node_id)
            self._ip_to_node_id.pop(ip)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]

            options = (("grpc.enable_http_proxy", 0), )
            channel = init_grpc_channel(f"{ip}:{ports[1]}",
                                        options=options,
                                        asynchronous=True)
            stub = reporter_pb2_grpc.LogServiceStub(channel)
            self._stubs[node_id] = stub
            self._ip_to_node_id[ip] = node_id
Example #8
    async def _connect_to_dashboard(self):
        """ Connect to the dashboard. If the dashboard is not started, then
        this method will never returns.

        Returns:
            The ReportEventServiceStub object.
        """
        while True:
            try:
                aioredis = self._dashboard_agent.aioredis_client
                dashboard_rpc_address = await aioredis.get(
                    dashboard_consts.REDIS_KEY_DASHBOARD_RPC)
                if dashboard_rpc_address:
                    logger.info("Report events to %s", dashboard_rpc_address)
                    options = (("grpc.enable_http_proxy", 0), )
                    channel = utils.init_grpc_channel(dashboard_rpc_address,
                                                      options=options,
                                                      asynchronous=True)
                    return event_pb2_grpc.ReportEventServiceStub(channel)
            except Exception:
                logger.exception("Connect to dashboard failed.")
            await asyncio.sleep(
                event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS)
Example #9
def get_store_stats(state, node_manager_address=None, node_manager_port=None):
    """Returns a formatted string describing memory usage in the cluster."""

    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info; that Raylet internally
    # asks all nodes in the cluster for memory stats.
    if node_manager_address is None or node_manager_port is None:
        # We should ask for a raylet that is alive.
        raylet = None
        for node in state.node_table():
            if node["Alive"]:
                raylet = node
                break
        assert raylet is not None, "Every raylet is dead"
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)

    channel = utils.init_grpc_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )

    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(
            include_memory_info=False),
        timeout=30.0,
    )
    return store_stats_summary(reply)
Example #10
def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_dashboard=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    ray.nodes()[0]["NodeManagerPort"])

    channel = init_grpc_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [
        worker for worker in reply.core_workers_stats
        if worker.worker_type == common_pb2.DRIVER
    ]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.worker.show_in_dashboard("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    # Test show_in_dashboard for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert stats.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_dashboard for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    if _WIN32:
        timeout_seconds = 40
    else:
        timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.core_workers_stats) < num_cpus + 2:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.core_workers_stats) == num_cpus + 2
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.core_workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "mini" in process
                    or "conda" in process or "travis" in process
                    or "runner" in process or "pytest" in process
                    or "ray" in process), process
        break