예제 #1
0
def test_prometheus_file_based_service_discovery(ray_start_cluster):
    """Check that Prometheus discovery content follows cluster membership.

    The discovery content is read once after the initial nodes have been
    added and once more after extra nodes are added. Each time, the targets
    of its first entry must equal the metrics export addresses of every node
    added so far.
    """
    initial_node_count = 5
    extra_node_count = 3
    cluster = ray_start_cluster

    nodes = []
    for _ in range(initial_node_count):
        nodes.append(cluster.add_node())
    cluster.wait_for_nodes()

    connection_info = ray.init(address=cluster.address)
    writer = PrometheusServiceDiscoveryWriter(
        connection_info["redis_address"],
        ray.ray_constants.REDIS_DEFAULT_PASSWORD,
        "/tmp/ray",
    )

    def discovered_targets():
        # Only the first entry of the discovery content is inspected.
        first_entry = json.loads(writer.get_file_discovery_content())[0]
        return set(first_entry["targets"])

    def expected_targets():
        # One "<ip>:<metrics export port>" string per node added so far.
        return {
            f"{node.node_ip_address}:{node.metrics_export_port}"
            for node in nodes
        }

    # Initial cluster: fetch the content first, then compare.
    found = discovered_targets()
    assert expected_targets() == found

    # Grow the cluster.
    for _ in range(extra_node_count):
        nodes.append(cluster.add_node())

    # The discovery content must now include the newly added nodes.
    found = discovered_targets()
    assert expected_targets() == found
예제 #2
0
def test_prometheus_file_based_service_discovery(ray_start_cluster):
    """Check that Prometheus discovery content follows cluster membership.

    The expected targets are the metrics export address of every node added
    so far plus the autoscaler metrics address on the head node. The
    discovery content is checked after the initial nodes are added and again
    after extra nodes are added.
    """
    initial_node_count = 5
    extra_node_count = 3
    cluster = ray_start_cluster

    nodes = []
    for _ in range(initial_node_count):
        nodes.append(cluster.add_node())
    cluster.wait_for_nodes()

    connection_info = ray.init(address=cluster.address)
    writer = PrometheusServiceDiscoveryWriter(
        connection_info["gcs_address"], "/tmp/ray")

    def discovered_targets():
        # Only the first entry of the discovery content is inspected.
        first_entry = json.loads(writer.get_file_discovery_content())[0]
        return set(first_entry["targets"])

    def expected_targets():
        per_node = {
            f"{node.node_ip_address}:{node.metrics_export_port}"
            for node in nodes
        }
        # monitor should be run on head node for `ray_start_cluster` fixture
        head_ip = cluster.head_node.node_ip_address
        autoscaler = f"{head_ip}:{AUTOSCALER_METRIC_PORT}"
        return per_node | {autoscaler}

    # Initial cluster: fetch the content first, then compare.
    found = discovered_targets()
    assert expected_targets() == found

    # Grow the cluster.
    for _ in range(extra_node_count):
        nodes.append(cluster.add_node())

    # The discovery content must now include the newly added nodes.
    found = discovered_targets()
    assert expected_targets() == found
예제 #3
0
 def __init__(self, dashboard_head):
     """Initialise module state and create the service discovery writer.

     Args:
         dashboard_head: The dashboard head object. Its ``gcs_address``,
             ``redis_address`` and ``temp_dir`` attributes are read here.
     """
     super().__init__(dashboard_head)
     # Stub storage starts empty; the cached ray config starts unset.
     self._stubs = {}
     self._ray_config = None
     # Register the stub updater on the agents signal.
     DataSource.agents.signal.append(self._update_stubs)
     # TODO(fyrestone): Avoid using ray.state in the dashboard. It is not
     # asynchronous and leads to low performance, and ray disconnect()
     # hangs when ray.state is connected and the GCS has exited.
     # Please refer to: https://github.com/ray-project/ray/issues/16328
     assert dashboard_head.gcs_address or dashboard_head.redis_address
     self.service_discovery = PrometheusServiceDiscoveryWriter(
         dashboard_head.gcs_address, dashboard_head.temp_dir)
예제 #4
0
class ReportHead(dashboard_utils.DashboardHeadModule):
    """Dashboard head module exposing reporter-related HTTP endpoints.

    It keeps one ``ReporterServiceStub`` per agent IP, serves the
    ``/api/launch_profiling``, ``/api/ray_config`` and
    ``/api/cluster_status`` routes, and in ``run`` stores the polled node
    physical stats into ``DataSource.node_physical_stats``.
    """

    def __init__(self, dashboard_head):
        """Initialise module state and create the service discovery writer.

        Args:
            dashboard_head: The dashboard head object. Its ``gcs_address``,
                ``redis_address`` and ``temp_dir`` attributes are read here.
        """
        super().__init__(dashboard_head)
        # Maps agent IP -> ReporterServiceStub (filled by _update_stubs).
        self._stubs = {}
        # Cached payload for /api/ray_config; None until first loaded.
        self._ray_config = None
        DataSource.agents.signal.append(self._update_stubs)
        # TODO(fyrestone): Avoid using ray.state in dashboard, it's not
        # asynchronous and will lead to low performance. ray disconnect()
        # will be hang when the ray.state is connected and the GCS is exit.
        # Please refer to: https://github.com/ray-project/ray/issues/16328
        assert dashboard_head.gcs_address or dashboard_head.redis_address
        gcs_address = dashboard_head.gcs_address
        temp_dir = dashboard_head.temp_dir
        self.service_discovery = PrometheusServiceDiscoveryWriter(
            gcs_address, temp_dir)

    async def _update_stubs(self, change):
        """Keep ``self._stubs`` in sync with a change to the agents data.

        Args:
            change: Object with ``old`` and ``new`` attributes. When set,
                each unpacks into ``(node_id, ports)``.
        """
        if change.old:
            # The old ports value is not needed to drop the stub.
            node_id, _port = change.old
            ip = DataSource.node_id_to_ip[node_id]
            self._stubs.pop(ip)
        if change.new:
            node_id, ports = change.new
            ip = DataSource.node_id_to_ip[node_id]
            options = GLOBAL_GRPC_OPTIONS
            # NOTE(review): ports[1] is presumably the agent's gRPC port;
            # confirm against the producer of DataSource.agents.
            channel = ray._private.utils.init_grpc_channel(f"{ip}:{ports[1]}",
                                                           options=options,
                                                           asynchronous=True)
            stub = reporter_pb2_grpc.ReporterServiceStub(channel)
            self._stubs[ip] = stub

    @routes.get("/api/launch_profiling")
    async def launch_profiling(self, req) -> aiohttp.web.Response:
        """Request profiling stats from the reporter stub of a given IP.

        Reads ``ip``, ``pid`` and ``duration`` from the request query. The
        response carries the JSON-decoded ``profiling_stats`` when the reply
        has any, otherwise the reply's ``std_out``.
        """
        ip = req.query["ip"]
        pid = int(req.query["pid"])
        duration = int(req.query["duration"])
        reporter_stub = self._stubs[ip]
        reply = await reporter_stub.GetProfilingStats(
            reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
        profiling_info = (json.loads(reply.profiling_stats)
                          if reply.profiling_stats else reply.std_out)
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Profiling success.",
            profiling_info=profiling_info)

    @routes.get("/api/ray_config")
    async def get_ray_config(self, req) -> aiohttp.web.Response:
        """Return worker limits and instance types from the bootstrap config.

        The config is read from ``~/ray_bootstrap_config.yaml`` on the first
        successful call and the resulting payload is cached in
        ``self._ray_config``. A missing file or unparsable YAML produces a
        ``success=False`` response and nothing is cached.
        """
        if self._ray_config is None:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            try:
                with open(config_path) as f:
                    cfg = yaml.safe_load(f)
            except FileNotFoundError:
                # The file does not exist: report the path we looked at.
                return dashboard_optional_utils.rest_response(
                    success=False,
                    message=f"No config found at {config_path}.",
                )
            except yaml.YAMLError:
                # The file exists but could not be parsed as YAML.
                return dashboard_optional_utils.rest_response(
                    success=False,
                    message="Invalid config, could not load YAML.")

            payload = {
                "min_workers": cfg.get("min_workers", "unspecified"),
                "max_workers": cfg.get("max_workers", "unspecified"),
            }

            try:
                payload["head_type"] = cfg["head_node"]["InstanceType"]
            except KeyError:
                payload["head_type"] = "unknown"

            try:
                payload["worker_type"] = cfg["worker_nodes"]["InstanceType"]
            except KeyError:
                payload["worker_type"] = "unknown"

            self._ray_config = payload

        return dashboard_optional_utils.rest_response(
            success=True,
            message="Fetched ray config.",
            **self._ray_config,
        )

    @routes.get("/api/cluster_status")
    async def get_cluster_status(self, req):
        """Returns status information about the cluster.

        The response contains three fields:
            autoscaling_status (str)-- a status message from the autoscaler
                (read from the legacy status key).
            autoscaling_error (str)-- an error message from the autoscaler if
                anything has gone wrong during autoscaling.
            cluster_status (dict)-- the JSON-decoded formatted status, or
                None when it is absent or empty.

        These fields are read from the GCS, it's expected that the
        autoscaler writes them there.
        """

        assert ray.experimental.internal_kv._internal_kv_initialized()
        legacy_status = internal_kv._internal_kv_get(
            DEBUG_AUTOSCALING_STATUS_LEGACY)
        formatted_status_string = internal_kv._internal_kv_get(
            DEBUG_AUTOSCALING_STATUS)
        formatted_status = (json.loads(formatted_status_string.decode())
                            if formatted_status_string else {})
        error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Got cluster status.",
            autoscaling_status=legacy_status.decode()
            if legacy_status else None,
            autoscaling_error=error.decode() if error else None,
            cluster_status=formatted_status if formatted_status else None,
        )

    async def run(self, server):
        """Start service discovery and consume node physical stats forever.

        Each polled ``(key, data)`` pair is JSON-decoded and stored in
        ``DataSource.node_physical_stats`` under the node id taken from the
        key. Errors for a single message are logged and the loop continues.
        """
        # Need daemon True to avoid dashboard hangs at exit.
        self.service_discovery.daemon = True
        self.service_discovery.start()
        gcs_addr = self._dashboard_head.gcs_address
        subscriber = GcsAioResourceUsageSubscriber(gcs_addr)
        await subscriber.subscribe()

        while True:
            try:
                # The key is b'RAY_REPORTER:{node id hex}',
                # e.g. b'RAY_REPORTER:2b4fbd...'
                # NOTE(review): split(":") takes a str separator, so the key
                # is presumably a str here rather than bytes; confirm.
                key, data = await subscriber.poll()
                if key is None:
                    continue
                data = json.loads(data)
                node_id = key.split(":")[-1]
                DataSource.node_physical_stats[node_id] = data
            except Exception:
                logger.exception(
                    "Error receiving node physical stats from reporter agent.")

    @staticmethod
    def is_minimal_module():
        """Return False: this module is not a minimal module."""
        return False
예제 #5
0
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count)

        dashboard = Dashboard(
            args.host,
            args.port,
            args.port_retries,
            args.redis_address,
            redis_password=args.redis_password,
            log_dir=args.log_dir)
        # TODO(fyrestone): Avoid using ray.state in dashboard, it's not
        # asynchronous and will lead to low performance. ray disconnect()
        # will be hang when the ray.state is connected and the GCS is exit.
        # Please refer to: https://github.com/ray-project/ray/issues/16328
        service_discovery = PrometheusServiceDiscoveryWriter(
            args.redis_address, args.redis_password, args.temp_dir)
        # Need daemon True to avoid dashboard hangs at exit.
        service_discovery.daemon = True
        service_discovery.start()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(dashboard.run())
    except Exception as e:
        traceback_str = ray._private.utils.format_error_message(
            traceback.format_exc())
        message = f"The dashboard on node {platform.uname()[1]} " \
                  f"failed with the following " \
                  f"error:\n{traceback_str}"
        if isinstance(e, FrontendNotFoundError):
            logger.warning(message)
        else:
            logger.error(message)