class MetricsService(Service):
    """
    A service to provide a registry where metrics instruments can be registered
    and retrieved from. It continuously reports metrics to the specified
    InfluxDB instance.
    """
    # Bound at class level: ``run()`` reads ``self.logger``, so binding this as
    # a local inside ``__init__`` would make ``self.logger`` raise
    # ``AttributeError`` — the logger must live on the class (or instance).
    logger = get_extended_debug_logger('trinity.components.builtin.metrics.MetricsService')

    def __init__(self,
                 influx_server: str,
                 influx_user: str,
                 influx_password: str,
                 influx_database: str,
                 host: str,
                 reporting_frequency: int = 10):
        self._influx_server = influx_server
        self._reporting_frequency = reporting_frequency
        self._registry = HostMetricsRegistry(host)
        # This service is hard-wired to report over HTTPS on port 443.
        self._reporter = InfluxReporter(
            registry=self._registry,
            protocol='https',
            port=443,
            database=influx_database,
            username=influx_user,
            password=influx_password,
            server=influx_server,
        )

    @property
    def registry(self) -> HostMetricsRegistry:
        """
        Return the :class:`trinity.components.builtin.metrics.registry.HostMetricsRegistry`
        at which metrics instruments can be registered and retrieved.
        """
        return self._registry

    async def run(self) -> None:
        self.logger.info("Reporting metrics to %s", self._influx_server)
        self.manager.run_daemon_task(self._continuously_report)
        await self.manager.wait_finished()

    async def _continuously_report(self) -> None:
        # Push the current registry contents to InfluxDB on a fixed cadence.
        async for _ in trio_utils.every(self._reporting_frequency):
            self._reporter.report_now()
class BaseMetricsService(Service, MetricsServiceAPI):
    """
    A service to provide a registry where metrics instruments can be registered
    and retrieved from. It continuously reports metrics to the specified
    InfluxDB instance.
    """
    # Minimum number of seconds between two logged reporting errors, so a dead
    # InfluxDB connection does not flood the log (see ``report_now()``).
    MIN_SECONDS_BETWEEN_ERROR_LOGS = 60

    # Bound at class level: ``run()`` reads ``self.logger``, so binding this as
    # a local inside ``__init__`` would make ``self.logger`` raise
    # ``AttributeError``.
    logger = get_logger('trinity.components.builtin.metrics.MetricsService')

    def __init__(self,
                 influx_server: str,
                 influx_user: str,
                 influx_password: str,
                 influx_database: str,
                 host: str,
                 port: int,
                 protocol: str,
                 reporting_frequency: int):
        # Last reporting error that has not been logged yet. Starts as ``None``
        # so the annotation must be Optional (the original annotated it as a
        # bare ``Exception``, which ``None`` does not satisfy).
        # NOTE(review): assumes ``Optional`` is imported from ``typing`` at the
        # top of this module — confirm.
        self._unreported_error: Optional[Exception] = None
        # Monotonic timestamp of the last logged error; 0.0 means "never".
        self._last_time_reported: float = 0.0
        self._influx_server = influx_server
        self._reporting_frequency = reporting_frequency
        self._registry = HostMetricsRegistry(host)
        self._reporter = InfluxReporter(
            registry=self._registry,
            database=influx_database,
            username=influx_user,
            password=influx_password,
            protocol=protocol,
            port=port,
            server=influx_server,
        )

    @property
    def registry(self) -> HostMetricsRegistry:
        """
        Return the :class:`trinity.components.builtin.metrics.registry.HostMetricsRegistry`
        at which metrics instruments can be registered and retrieved.
        """
        return self._registry

    async def run(self) -> None:
        self.logger.info("Reporting metrics to %s", self._influx_server)
        self.manager.run_daemon_task(self.continuously_report)
        await self.manager.wait_finished()

    def report_now(self) -> None:
        """
        Report all metrics once, rate-limiting connection-error warnings.

        This method is usually called every few seconds. If there's an issue
        with the connection we do not want to flood the log, so:

        1. The first instance of an exception is logged immediately.
        2. Follow-up exceptions are logged only after a minimum time has
           elapsed — which means a later exception may overwrite an earlier,
           different one.
        3. If errors disappear, the last unlogged instance is still eventually
           reported.
        """
        try:
            self._reporter.report_now()
        except (HTTPException, ConnectionError) as exc:
            if self._is_justified_to_log_error():
                self._log_and_clear(exc)
            else:
                self._unreported_error = exc
        else:
            # If errors disappear, we want to make sure we eventually report
            # the last instance.
            if self._unreported_error is not None and self._is_justified_to_log_error():
                self._log_and_clear(self._unreported_error)

    def _log_and_clear(self, error: Exception) -> None:
        # Log the error, forget it, and restart the rate-limit window.
        self.logger.warning("Unable to report metrics: %s", error)
        self._unreported_error = None
        self._last_time_reported = time.monotonic()

    def _is_justified_to_log_error(self) -> bool:
        # True on the very first error, or once the rate-limit window elapsed.
        return (
            self._last_time_reported == 0.0
            or time.monotonic() - self._last_time_reported > self.MIN_SECONDS_BETWEEN_ERROR_LOGS
        )

    @abstractmethod
    async def continuously_report(self) -> None:
        """Subclasses implement the periodic reporting loop here."""
        ...
class Metrics(Service):
    """
    Collect runtime metrics from an Alexandria node (message traffic, routing
    table, content manager, and host system stats) and continuously report
    them to an InfluxDB instance.
    """
    logger = logging.getLogger('alexandria.metrics.Metrics')

    def __init__(self,
                 host: str,
                 client: ClientAPI,
                 kademlia: KademliaAPI,
                 influx_server: str,
                 influx_user: str,
                 influx_password: str,
                 influx_database: str,
                 influx_port: int = 443,
                 influx_protocol: str = 'https',
                 reporting_frequency: int = 10,
                 process_collection_frequency: int = 3):
        self._influx_server = influx_server
        self._reporting_frequency = reporting_frequency
        self._process_collection_frequency = process_collection_frequency
        self._registry = HostMetricsRegistry(host)
        self._reporter = InfluxReporter(
            registry=self._registry,
            protocol=influx_protocol,
            port=influx_port,
            database=influx_database,
            username=influx_user,
            password=influx_password,
            server=influx_server
        )
        self.client = client
        self.kademlia = kademlia

    @classmethod
    def from_cli_args(cls,
                      args: Namespace,
                      client: ClientAPI,
                      kademlia: KademliaAPI,
                      ) -> 'Metrics':
        """Build a :class:`Metrics` service from parsed CLI arguments."""
        # NOTE(review): ``process_collection_frequency`` is not forwarded from
        # ``args`` and silently falls back to its default — confirm intentional.
        return cls(
            host=args.metrics_host,
            client=client,
            kademlia=kademlia,
            influx_server=args.metrics_influx_server,
            influx_user=args.metrics_influx_user,
            influx_password=args.metrics_influx_password,
            influx_database=args.metrics_influx_database,
            influx_port=args.metrics_influx_port,
            influx_protocol=args.metrics_influx_protocol,
            reporting_frequency=args.metrics_reporting_frequency,
        )

    async def run(self) -> None:
        self.manager.run_daemon_task(
            self._continuously_report, self._reporting_frequency,
        )
        self.manager.run_daemon_task(
            self._collect_system_metrics, self._process_collection_frequency,
        )
        self.manager.run_daemon_task(
            self._report_routing_table_stats, 10,
        )
        self.manager.run_daemon_task(
            self._report_content_manager_stats, 10,
        )
        self.logger.info('Metrics started')

        for payload_type in PAYLOAD_TYPES:
            self.manager.run_daemon_task(self._report_inbound_message_stats, payload_type)

        # Data-driven registration replaces sixteen near-identical
        # ``run_daemon_task`` calls; registration order matches the original
        # call sequence and the suffix strings are unchanged.
        events = self.client.events
        event_suffixes = (
            (events.session_created, 'events/session-created'),
            (events.session_idle, 'events/session-idle'),
            (events.handshake_complete, 'events/handshake-complete'),
            (events.handshake_timeout, 'events/handshake-timeout'),
            (events.datagram_received, 'datagram/inbound'),
            (events.datagram_sent, 'datagram/outbound'),
            (events.sent_ping, 'messages/outbound/Ping'),
            (events.sent_pong, 'messages/outbound/Pong'),
            (events.sent_find_nodes, 'messages/outbound/FindNodes'),
            (events.sent_found_nodes, 'messages/outbound/FoundNodes'),
            (events.sent_advertise, 'messages/outbound/Advertise'),
            (events.sent_ack, 'messages/outbound/Ack'),
            (events.sent_locate, 'messages/outbound/Locate'),
            (events.sent_locations, 'messages/outbound/Locations'),
            (events.sent_retrieve, 'messages/outbound/Retrieve'),
            (events.sent_chunk, 'messages/outbound/Chunk'),
        )
        for event, suffix in event_suffixes:
            self.manager.run_daemon_task(self._report_event, event, suffix)

        await self.manager.wait_finished()

    async def _continuously_report(self, frequency: int) -> None:
        # Push the current registry contents to InfluxDB on a fixed cadence.
        async for _ in every(frequency):
            self._reporter.report_now()

    async def _report_event(self, event: EventAPI[Any], suffix: str) -> None:
        # Count and meter every occurrence of ``event`` under the given suffix.
        counter = self._registry.counter(f'alexandria.{suffix}.counter')
        meter = self._registry.meter(f'alexandria.{suffix}.meter')
        async with event.subscribe() as subscription:
            async for _ in subscription:
                counter.inc()
                meter.mark()

    async def _report_routing_table_stats(self, frequency: int) -> None:
        size_gauge = self._registry.gauge('alexandria.dht/routing-table/total-nodes.gauge')
        async for _ in every(frequency):
            stats = self.kademlia.routing_table.get_stats()
            size_gauge.set_value(stats.total_nodes)

    async def _report_inbound_message_stats(self, payload_type: Type[ssz.Serializable]) -> None:
        # One counter/meter pair per inbound message payload type.
        name = payload_type.__name__
        counter = self._registry.counter(f'alexandria.messages/inbound/{name}.counter')
        meter = self._registry.meter(f'alexandria.messages/inbound/{name}.meter')
        async with self.client.message_dispatcher.subscribe(payload_type) as subscription:
            # The payload itself is not inspected — only its arrival is counted.
            async for _ in subscription:
                counter.inc()
                meter.mark()

    async def _report_content_manager_stats(self, frequency: int) -> None:
        gauge = self._registry.gauge
        durable_db_item_count_gauge = gauge('alexandria.content/durable-db/item-count.gauge')
        ephemeral_db_item_count_gauge = gauge('alexandria.content/ephemeral-db/item-count.gauge')
        ephemeral_db_capacity_gauge = gauge('alexandria.content/ephemeral-db/capacity.gauge')
        ephemeral_db_size_gauge = gauge('alexandria.content/ephemeral-db/size.gauge')
        ephemeral_index_capacity_gauge = gauge('alexandria.content/ephemeral-index/capacity.gauge')
        ephemeral_index_size_gauge = gauge('alexandria.content/ephemeral-index/size.gauge')
        cache_db_item_count_gauge = gauge('alexandria.content/cache-db/item-count.gauge')
        cache_db_capacity_gauge = gauge('alexandria.content/cache-db/capacity.gauge')
        cache_db_size_gauge = gauge('alexandria.content/cache-db/size.gauge')
        cache_index_capacity_gauge = gauge('alexandria.content/cache-index/capacity.gauge')
        cache_index_size_gauge = gauge('alexandria.content/cache-index/size.gauge')
        async for _ in every(frequency):
            stats = self.kademlia.content_manager.get_stats()
            durable_db_item_count_gauge.set_value(stats.durable_item_count)
            ephemeral_db_item_count_gauge.set_value(stats.ephemeral_db_count)
            ephemeral_db_capacity_gauge.set_value(stats.ephemeral_db_capacity)
            # "size" gauges report used space: total capacity minus remaining.
            ephemeral_db_size_gauge.set_value(
                stats.ephemeral_db_total_capacity - stats.ephemeral_db_capacity
            )
            ephemeral_index_capacity_gauge.set_value(stats.ephemeral_index_capacity)
            ephemeral_index_size_gauge.set_value(
                stats.ephemeral_index_total_capacity - stats.ephemeral_index_capacity
            )
            cache_db_item_count_gauge.set_value(stats.cache_db_count)
            cache_db_capacity_gauge.set_value(stats.cache_db_capacity)
            cache_db_size_gauge.set_value(stats.cache_db_total_capacity - stats.cache_db_capacity)
            cache_index_capacity_gauge.set_value(stats.cache_index_capacity)
            cache_index_size_gauge.set_value(
                stats.cache_index_total_capacity - stats.cache_index_capacity
            )

    async def _collect_system_metrics(self, frequency: int) -> None:
        cpu_sysload_gauge = self._registry.gauge('alexandria.system/cpu/sysload.gauge')
        cpu_syswait_gauge = self._registry.gauge('alexandria.system/cpu/syswait.gauge')
        memory_used_gauge = self._registry.gauge('alexandria.system/memory/used.gauge')
        memory_free_gauge = self._registry.gauge('alexandria.system/memory/free.gauge')
        disk_readdata_meter = self._registry.meter('alexandria.system/disk/readdata.meter')
        disk_writedata_meter = self._registry.meter('alexandria.system/disk/writedata.meter')
        network_in_packets_meter = self._registry.meter('alexandria.network/in/packets/total.meter')
        network_out_packets_meter = self._registry.meter('alexandria.network/out/packets/total.meter')  # noqa: E501

        # Rates are derived from deltas between consecutive snapshots, so an
        # initial snapshot is taken before the first reporting tick.
        previous = read_system_stats()
        async for _ in every(frequency, initial_delay=frequency):
            current = read_system_stats()

            global_time = current.cpu_stats.global_time - previous.cpu_stats.global_time
            cpu_sysload_gauge.set_value(global_time / frequency)
            global_wait = current.cpu_stats.global_wait_io - previous.cpu_stats.global_wait_io
            cpu_syswait_gauge.set_value(global_wait / frequency)

            memory_used_gauge.set_value(current.memory_stats.used)
            memory_free_gauge.set_value(current.memory_stats.free)

            read_bytes = current.disk_stats.read_bytes - previous.disk_stats.read_bytes
            disk_readdata_meter.mark(read_bytes)
            write_bytes = current.disk_stats.write_bytes - previous.disk_stats.write_bytes
            disk_writedata_meter.mark(write_bytes)

            in_packets = current.network_stats.in_packets - previous.network_stats.in_packets
            network_in_packets_meter.mark(in_packets)
            out_packets = current.network_stats.out_packets - previous.network_stats.out_packets
            network_out_packets_meter.mark(out_packets)

            previous = current