# Example 1
class MetricsService(Service):
    """
    Provide a :class:`HostMetricsRegistry` where metrics instruments can be
    registered and looked up, and continuously push every collected metric to
    the configured InfluxDB instance.
    """

    logger = get_extended_debug_logger(
        'trinity.components.builtin.metrics.MetricsService')

    def __init__(self,
                 influx_server: str,
                 influx_user: str,
                 influx_password: str,
                 influx_database: str,
                 host: str,
                 reporting_frequency: int = 10):
        self._influx_server = influx_server
        self._reporting_frequency = reporting_frequency
        self._registry = HostMetricsRegistry(host)
        # Reporting always goes over HTTPS on the standard TLS port.
        self._reporter = InfluxReporter(
            registry=self._registry,
            protocol='https',
            port=443,
            database=influx_database,
            username=influx_user,
            password=influx_password,
            server=influx_server,
        )

    @property
    def registry(self) -> HostMetricsRegistry:
        """
        Return the :class:`trinity.components.builtin.metrics.registry.HostMetricsRegistry` at which
        metrics instruments can be registered and retrieved.
        """
        return self._registry

    async def run(self) -> None:
        self.logger.info("Reporting metrics to %s", self._influx_server)
        self.manager.run_daemon_task(self._continuously_report)
        await self.manager.wait_finished()

    async def _continuously_report(self) -> None:
        # Push the current metrics snapshot once per reporting interval.
        async for _ in trio_utils.every(self._reporting_frequency):
            self._reporter.report_now()
# Example 2
class BaseMetricsService(Service, MetricsServiceAPI):
    """
    A service to provide a registry where metrics instruments can be registered
    and retrieved from.  It continuously reports metrics to the specified
    InfluxDB instance, rate-limiting warnings so a flaky endpoint does not
    flood the log.
    """

    # Minimum interval between two logged reporting-failure warnings.
    MIN_SECONDS_BETWEEN_ERROR_LOGS = 60

    def __init__(self, influx_server: str, influx_user: str,
                 influx_password: str, influx_database: str, host: str,
                 port: int, protocol: str, reporting_frequency: int):
        # Most recent reporting error that has not been logged yet.  ``None``
        # is the initial (and a valid) value, so the annotation must be
        # optional — the previous plain ``Exception`` annotation was wrong.
        # A string annotation keeps this safe on any interpreter version.
        self._unreported_error: 'Exception | None' = None
        # Monotonic timestamp of the last warning we emitted (0.0 == never).
        self._last_time_reported: float = 0.0
        self._influx_server = influx_server
        self._reporting_frequency = reporting_frequency
        self._registry = HostMetricsRegistry(host)
        self._reporter = InfluxReporter(registry=self._registry,
                                        database=influx_database,
                                        username=influx_user,
                                        password=influx_password,
                                        protocol=protocol,
                                        port=port,
                                        server=influx_server)

    logger = get_logger('trinity.components.builtin.metrics.MetricsService')

    @property
    def registry(self) -> HostMetricsRegistry:
        """
        Return the :class:`trinity.components.builtin.metrics.registry.HostMetricsRegistry` at which
        metrics instruments can be registered and retrieved.
        """
        return self._registry

    async def run(self) -> None:
        self.logger.info("Reporting metrics to %s", self._influx_server)
        self.manager.run_daemon_task(self.continuously_report)
        await self.manager.wait_finished()

    def report_now(self) -> None:
        """
        Report the current metrics snapshot, demoting connection problems to
        rate-limited warnings.

        This method is usually called every few seconds.  If there's an issue
        with the connection we do not want to flood the log, so:

        1. We log the first instance of an exception immediately.
        2. We log follow-up exceptions only after a minimum time has elapsed.

        This means that we also might overwrite exceptions for different
        errors.
        """
        try:
            self._reporter.report_now()
        except (HTTPException, ConnectionError) as exc:
            if self._is_justified_to_log_error():
                self._log_and_clear(exc)
            else:
                self._unreported_error = exc
        else:
            # If errors disappear, we want to make sure we eventually report
            # the last instance.
            if self._unreported_error is not None and self._is_justified_to_log_error():
                self._log_and_clear(self._unreported_error)

    def _log_and_clear(self, error: Exception) -> None:
        # Emit the warning and reset the rate-limit window.
        self.logger.warning("Unable to report metrics: %s", error)
        self._unreported_error = None
        self._last_time_reported = time.monotonic()

    def _is_justified_to_log_error(self) -> bool:
        # Either we never logged before, or the rate-limit window has elapsed.
        return (self._last_time_reported == 0.0
                or time.monotonic() - self._last_time_reported >
                self.MIN_SECONDS_BETWEEN_ERROR_LOGS)

    @abstractmethod
    async def continuously_report(self) -> None:
        """Report metrics on a schedule; implemented by concrete subclasses."""
        ...
# Example 3
class Metrics(Service):
    """
    Collect metrics about the client, the DHT routing table, the content
    manager and the host system, and continuously report them to an InfluxDB
    instance.
    """

    logger = logging.getLogger('alexandria.metrics.Metrics')

    def __init__(self,
                 host: str,
                 client: ClientAPI,
                 kademlia: KademliaAPI,
                 influx_server: str,
                 influx_user: str,
                 influx_password: str,
                 influx_database: str,
                 influx_port: int = 443,
                 influx_protocol: str = 'https',
                 reporting_frequency: int = 10,
                 process_collection_frequency: int = 3):
        self._influx_server = influx_server

        # Seconds between pushes to InfluxDB / system-stat collections.
        self._reporting_frequency = reporting_frequency
        self._process_collection_frequency = process_collection_frequency

        self._registry = HostMetricsRegistry(host)

        self._reporter = InfluxReporter(
            registry=self._registry,
            protocol=influx_protocol,
            port=influx_port,
            database=influx_database,
            username=influx_user,
            password=influx_password,
            server=influx_server
        )

        self.client = client
        self.kademlia = kademlia

    @classmethod
    def from_cli_args(cls,
                      args: Namespace,
                      client: ClientAPI,
                      kademlia: KademliaAPI,
                      ) -> 'Metrics':
        """Build a :class:`Metrics` service from parsed CLI arguments."""
        # NOTE(review): ``process_collection_frequency`` is not forwarded and
        # therefore always uses the constructor default — confirm intended.
        return cls(
            host=args.metrics_host,
            client=client,
            kademlia=kademlia,
            influx_server=args.metrics_influx_server,
            influx_user=args.metrics_influx_user,
            influx_password=args.metrics_influx_password,
            influx_database=args.metrics_influx_database,
            influx_port=args.metrics_influx_port,
            influx_protocol=args.metrics_influx_protocol,
            reporting_frequency=args.metrics_reporting_frequency,
        )

    async def run(self) -> None:
        """Spawn all collection/reporting daemon tasks and run until stopped."""
        # Periodic jobs: push to InfluxDB, sample the host system, and poll
        # routing-table / content-manager statistics.
        self.manager.run_daemon_task(
            self._continuously_report,
            self._reporting_frequency,
        )
        self.manager.run_daemon_task(
            self._collect_system_metrics,
            self._process_collection_frequency,
        )
        self.manager.run_daemon_task(
            self._report_routing_table_stats,
            10,
        )
        self.manager.run_daemon_task(
            self._report_content_manager_stats,
            10,
        )
        self.logger.info('Metrics started')
        # One subscription-driven counter/meter pair per inbound payload type.
        for payload_type in PAYLOAD_TYPES:
            self.manager.run_daemon_task(self._report_inbound_message_stats, payload_type)

        # Session / handshake lifecycle events.
        self.manager.run_daemon_task(self._report_event, self.client.events.session_created, 'events/session-created')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.session_idle, 'events/session-idle')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.handshake_complete, 'events/handshake-complete')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.handshake_timeout, 'events/handshake-timeout')  # noqa: E501

        # Raw datagram traffic.
        self.manager.run_daemon_task(self._report_event, self.client.events.datagram_received, 'datagram/inbound')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.datagram_sent, 'datagram/outbound')  # noqa: E501

        # Outbound protocol messages, one metric per message type.
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_ping, 'messages/outbound/Ping')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_pong, 'messages/outbound/Pong')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_find_nodes, 'messages/outbound/FindNodes')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_found_nodes, 'messages/outbound/FoundNodes')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_advertise, 'messages/outbound/Advertise')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_ack, 'messages/outbound/Ack')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_locate, 'messages/outbound/Locate')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_locations, 'messages/outbound/Locations')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_retrieve, 'messages/outbound/Retrieve')  # noqa: E501
        self.manager.run_daemon_task(self._report_event, self.client.events.sent_chunk, 'messages/outbound/Chunk')  # noqa: E501

        await self.manager.wait_finished()

    async def _continuously_report(self, frequency: int) -> None:
        # Push the current metrics snapshot every ``frequency`` seconds.
        async for _ in every(frequency):
            self._reporter.report_now()

    async def _report_event(self, event: EventAPI[Any], suffix: str) -> None:
        """Count and meter every occurrence of ``event`` under ``suffix``."""
        counter = self._registry.counter(f'alexandria.{suffix}.counter')
        meter = self._registry.meter(f'alexandria.{suffix}.meter')

        async with event.subscribe() as subscription:
            async for _ in subscription:
                counter.inc()
                meter.mark()

    async def _report_routing_table_stats(self, frequency: int) -> None:
        """Periodically record the routing table's total node count."""
        size_gauge = self._registry.gauge('alexandria.dht/routing-table/total-nodes.gauge')
        async for _ in every(frequency):
            stats = self.kademlia.routing_table.get_stats()
            size_gauge.set_value(stats.total_nodes)

    async def _report_inbound_message_stats(self, payload_type: Type[ssz.Serializable]) -> None:
        """Count and meter inbound messages of the given payload type."""
        name = payload_type.__name__
        counter = self._registry.counter(f'alexandria.messages/inbound/{name}.counter')
        meter = self._registry.meter(f'alexandria.messages/inbound/{name}.meter')

        async with self.client.message_dispatcher.subscribe(payload_type) as subscription:
            # Only the arrival matters here, not the payload itself.
            async for _ in subscription:
                counter.inc()
                meter.mark()

    async def _report_content_manager_stats(self, frequency: int) -> None:
        """Periodically record item counts, capacities and sizes for each
        content store (durable / ephemeral / cache DBs and indices)."""
        gauge = self._registry.gauge

        durable_db_item_count_gauge = gauge('alexandria.content/durable-db/item-count.gauge')

        ephemeral_db_item_count_gauge = gauge('alexandria.content/ephemeral-db/item-count.gauge')
        ephemeral_db_capacity_gauge = gauge('alexandria.content/ephemeral-db/capacity.gauge')
        ephemeral_db_size_gauge = gauge('alexandria.content/ephemeral-db/size.gauge')

        ephemeral_index_capacity_gauge = gauge('alexandria.content/ephemeral-index/capacity.gauge')
        ephemeral_index_size_gauge = gauge('alexandria.content/ephemeral-index/size.gauge')

        cache_db_item_count_gauge = gauge('alexandria.content/cache-db/item-count.gauge')
        cache_db_capacity_gauge = gauge('alexandria.content/cache-db/capacity.gauge')
        cache_db_size_gauge = gauge('alexandria.content/cache-db/size.gauge')

        cache_index_capacity_gauge = gauge('alexandria.content/cache-index/capacity.gauge')
        cache_index_size_gauge = gauge('alexandria.content/cache-index/size.gauge')

        async for _ in every(frequency):
            stats = self.kademlia.content_manager.get_stats()

            durable_db_item_count_gauge.set_value(stats.durable_item_count)

            ephemeral_db_item_count_gauge.set_value(stats.ephemeral_db_count)
            ephemeral_db_capacity_gauge.set_value(stats.ephemeral_db_capacity)
            # "size" is derived: total capacity minus remaining capacity.
            ephemeral_db_size_gauge.set_value(
                stats.ephemeral_db_total_capacity - stats.ephemeral_db_capacity
            )

            ephemeral_index_capacity_gauge.set_value(stats.ephemeral_index_capacity)
            ephemeral_index_size_gauge.set_value(
                stats.ephemeral_index_total_capacity - stats.ephemeral_index_capacity
            )

            cache_db_item_count_gauge.set_value(stats.cache_db_count)
            cache_db_capacity_gauge.set_value(stats.cache_db_capacity)
            cache_db_size_gauge.set_value(stats.cache_db_total_capacity - stats.cache_db_capacity)

            cache_index_capacity_gauge.set_value(stats.cache_index_capacity)
            cache_index_size_gauge.set_value(
                stats.cache_index_total_capacity - stats.cache_index_capacity
            )

    async def _collect_system_metrics(self, frequency: int) -> None:
        """Periodically sample host CPU, memory, disk and network statistics.

        Rate-style metrics (CPU load, disk and network throughput) are
        computed as deltas between consecutive samples.
        """
        cpu_sysload_gauge = self._registry.gauge('alexandria.system/cpu/sysload.gauge')
        cpu_syswait_gauge = self._registry.gauge('alexandria.system/cpu/syswait.gauge')

        memory_used_gauge = self._registry.gauge('alexandria.system/memory/used.gauge')
        memory_free_gauge = self._registry.gauge('alexandria.system/memory/free.gauge')

        disk_readdata_meter = self._registry.meter('alexandria.system/disk/readdata.meter')
        disk_writedata_meter = self._registry.meter('alexandria.system/disk/writedata.meter')

        network_in_packets_meter = self._registry.meter('alexandria.network/in/packets/total.meter')
        network_out_packets_meter = self._registry.meter('alexandria.network/out/packets/total.meter')  # noqa: E501

        # The first sample only seeds the deltas; delay the loop one interval
        # so the initial delta covers a full period.
        previous = read_system_stats()
        async for _ in every(frequency, initial_delay=frequency):
            current = read_system_stats()

            global_time = current.cpu_stats.global_time - previous.cpu_stats.global_time
            cpu_sysload_gauge.set_value(global_time / frequency)
            global_wait = current.cpu_stats.global_wait_io - previous.cpu_stats.global_wait_io
            cpu_syswait_gauge.set_value(global_wait / frequency)

            memory_used_gauge.set_value(current.memory_stats.used)
            memory_free_gauge.set_value(current.memory_stats.free)

            read_bytes = current.disk_stats.read_bytes - previous.disk_stats.read_bytes
            disk_readdata_meter.mark(read_bytes)

            write_bytes = current.disk_stats.write_bytes - previous.disk_stats.write_bytes
            disk_writedata_meter.mark(write_bytes)

            in_packets = current.network_stats.in_packets - previous.network_stats.in_packets
            network_in_packets_meter.mark(in_packets)
            out_packets = current.network_stats.out_packets - previous.network_stats.out_packets
            network_out_packets_meter.mark(out_packets)

            previous = current