async def _on_data_chunk(self, metric: str, data_chunk):
    # Fast path if there are no value checks: do not decode the whole data
    # chunk, only extract the last timestamp and bump the timeout checks.
    if not self._has_value_checks:
        if len(data_chunk.time_delta) > 0:
            last_timestamp = Timestamp(sum(data_chunk.time_delta))
            self._bump_timeout_checks(metric, last_timestamp)
        return

    tv_pairs = [
        TvPair(timestamp=Timestamp(t), value=v)
        for t, v in zip(accumulate(data_chunk.time_delta), data_chunk.value)
        if not isnan(v)
    ]

    if len(tv_pairs) == 0:
        logger.debug(f"No non-NaN values in DataChunk for metric {metric!r}")
        return

    # Check that all values in this data chunk are within the desired
    # thresholds.
    for check in self._checks.values():
        check.check(metric, tv_pairs)

    # "Bump" all timeout checks with the last timestamp for which we
    # received values, i.e. reset the asynchronous timers that would
    # fire if we do not receive values for too long.
    last_timestamp = tv_pairs[-1].timestamp
    self._bump_timeout_checks(metric, last_timestamp)
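# Aside: DataChunk timestamps are delta-encoded, so the absolute timestamp of
# sample i is the prefix sum of time_delta[:i + 1]. That is why the fast path
# above can take sum(time_delta) for the last timestamp, while the full path
# zips accumulate(time_delta) with the values. A minimal, self-contained
# sketch of that decoding (plain ints instead of Timestamp objects):
from itertools import accumulate
from math import isnan

time_delta = [100, 50, 50]            # nanosecond offsets between samples
values = [1.0, float("nan"), 3.0]

# Fast path: the last absolute timestamp is simply the sum of all deltas.
last_timestamp_ns = sum(time_delta)   # 200

# Full path: prefix sums give every absolute timestamp; NaN samples are dropped.
tv_pairs = [(t, v) for t, v in zip(accumulate(time_delta), values) if not isnan(v)]
assert tv_pairs == [(100, 1.0), (200, 3.0)]
assert tv_pairs[-1][0] == last_timestamp_ns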
def create_source_plugin(
    self,
    source_id: str,
    source_config: Dict,
    rpc_function: PluginRPCFunctionType,
) -> Optional[SourcePlugin]:
    source_type = source_config["type"].replace("-", "_")
    if source_id not in self._source_plugins:
        full_module_name = f"metricq_wizard_plugin_{source_type}"
        if importlib.util.find_spec(full_module_name):
            plugin_module = importlib.import_module(full_module_name)
            entry_point: EntryPointType = plugin_module.get_plugin
            self._source_plugins[source_id] = entry_point(source_config, rpc_function)
            self._source_config_revision[source_id] = source_config.get("_rev")
            self._source_plugin_creation_time[source_id] = Timestamp.now()
            self._source_plugin_initial_configured_metrics[source_id] = (
                self._source_plugins[source_id].get_configured_metrics()
            )
            logger.debug(
                "Currently configured metrics: "
                f"{self._source_plugin_initial_configured_metrics[source_id]}"
            )
        else:
            logger.error(
                f"Plugin {full_module_name} for source {source_id} not found."
            )

    if source_id in self._source_plugins:
        return self._source_plugins[source_id]

    logger.error(f"Plugin instance for source {source_id} not found.")
    return None
def test_check_internal_chained_exception_report(check):
    errmsg_toplevel = "top-level exception"
    errmsg_cause = "chained exception"

    def check_metric_raising(*args, **kwargs):
        try:
            raise RuntimeError(errmsg_cause)
        except RuntimeError as e:
            raise RuntimeError(errmsg_toplevel) from e

    check.check_metric = check_metric_raising
    check.check("foo", tv_pairs=[TvPair(Timestamp(0), 0.0)])

    report: Report = check._report_queue._queue.get_nowait()
    assert report.service == check._name
    assert report.state == State.CRITICAL

    header, *causes = report.message.splitlines()
    assert errmsg_toplevel in header
    assert len(causes) == 1
    assert "caused by:" in causes[0]
    assert errmsg_cause in causes[0]
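# Aside: the assertions above imply a report message layout with the top-level
# error in the first line and one "caused by:" line per chained exception. A
# hedged sketch of how such a message could be built by walking __cause__ (an
# illustration only, not necessarily the project's actual formatter):
def format_exception_chain(exc: BaseException) -> str:
    lines = [str(exc)]
    cause = exc.__cause__
    while cause is not None:
        lines.append(f"caused by: {cause}")
        cause = cause.__cause__
    return "\n".join(lines)


try:
    try:
        raise RuntimeError("chained exception")
    except RuntimeError as e:
        raise RuntimeError("top-level exception") from e
except RuntimeError as toplevel:
    assert format_exception_chain(toplevel).splitlines() == [
        "top-level exception",
        "caused by: chained exception",
    ]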
def __init__(
    self,
    session_key: str,
):
    self.session_key: str = session_key
    self._source_plugins: Dict[str, SourcePlugin] = {}
    self._source_config_revision: Dict[str, str] = {}
    self._source_plugin_creation_time: Dict[str, Timestamp] = {}
    self._source_plugin_initial_configured_metrics: Dict[str, Sequence[str]] = {}
    self.creation_time = Timestamp.now()
def _bacnet_reader_put_result_in_source_queue(
    self, device_name: str, device_address_string: str, result_values: Dict
):
    fut = asyncio.run_coroutine_threadsafe(
        self._result_queue.put(
            (Timestamp.now(), device_name, device_address_string, result_values)
        ),
        loop=self.event_loop,
    )
    try:
        fut.result()
    except Exception:
        logger.exception("Can't put BACnet result in queue!")
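# Aside: the helper above runs in the BACnet reader's worker thread, so it
# cannot await the asyncio.Queue directly; it hands the coroutine to the event
# loop with asyncio.run_coroutine_threadsafe and waits on the returned
# concurrent.futures.Future. A self-contained sketch of that pattern (all
# names here are illustrative, not from the source):
import asyncio
import threading


def producer_thread(loop: asyncio.AbstractEventLoop, queue: "asyncio.Queue[str]") -> None:
    # Schedule queue.put() on the loop owned by the main thread; exceptions
    # raised there surface via fut.result() in this thread.
    fut = asyncio.run_coroutine_threadsafe(queue.put("reading"), loop)
    fut.result()


async def demo_main() -> None:
    loop = asyncio.get_running_loop()
    queue: "asyncio.Queue[str]" = asyncio.Queue()
    thread = threading.Thread(target=producer_thread, args=(loop, queue))
    thread.start()
    print(await queue.get())  # prints "reading"
    thread.join()


# asyncio.run(demo_main())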
async def _on_data_chunk(self, metric: str, data_chunk):
    # Check that all values in this data chunk are within the desired
    # thresholds.
    await self._check_values(metric, data_chunk.value)

    # "Bump" all timeout checks with the last timestamp for which we
    # received values, i.e. reset the asynchronous timers that would
    # fire if we do not receive values for too long.
    last_timestamp = Timestamp(sum(data_chunk.time_delta))
    await self._bump_timeout_checks(metric, last_timestamp)

    # Flush all reports to the NSCA host.
    await self._nsca_client.flush()
async def test_history_last_value(
    history_client: HistoryClient, mocker: MockerFixture
):
    TIME = Timestamp(0)
    VALUE = mocker.sentinel.VALUE

    response = mock_history_response(
        time_delta=[TIME.posix_ns],
        value=[VALUE],
    )
    patch_history_data_request(mocker, response)

    assert await history_client.history_last_value(DEFAULT_METRIC) == TimeValue(
        timestamp=TIME, value=VALUE
    )
def test_check_internal_exception_log_entry(check, caplog):
    errmsg = "check_metric_raising"

    def check_metric_raising(*args, **kwargs):
        raise ValueError(errmsg)

    check.check_metric = check_metric_raising

    with caplog.at_level(logging.ERROR):
        check.check("foo", tv_pairs=[TvPair(Timestamp(0), 0.0)])

    assert errmsg in caplog.text
    assert any(
        "Unhandled exception" in message
        for logger, level, message in caplog.record_tuples
    )
def test_check_internal_exception_report(check):
    errmsg = "check_metric_raising"

    def check_metric_raising(*args, **kwargs):
        raise ValueError(errmsg)

    check.check_metric = check_metric_raising
    check.check("foo", tv_pairs=[TvPair(Timestamp(0), 0.0)])

    report: Report = check._report_queue._queue.get_nowait()
    assert report.service == check._name
    assert report.state == State.CRITICAL
    assert errmsg in report.message
async def test_history_aggregate(
    history_client: HistoryClient, mocker: MockerFixture
):
    TIME = Timestamp(0)
    AGGREGATE = create_autospec(history_pb2.HistoryResponse.Aggregate, spec_set=True)

    response = mock_history_response(
        time_delta=[TIME.posix_ns],
        aggregate=[AGGREGATE],
    )
    patch_history_data_request(mocker, response)

    assert await history_client.history_aggregate(
        DEFAULT_METRIC
    ) == TimeAggregate.from_proto(timestamp=TIME, proto=AGGREGATE)
async def update(self):
    assert self.prev_timestamp is not None, "update() called before _on_config()"
    now = Timestamp.now()
    send_metrics = list()

    send_metrics.append(
        self.send("cpu.usage", now, sum(psutil.cpu_percent(percpu=True)))
    )

    for mem_name, mem_value in psutil.virtual_memory()._asdict().items():
        send_metrics.append(self.send(f"mem.{mem_name}", now, mem_value))

    for swap_name, swap_value in psutil.swap_memory()._asdict().items():
        send_metrics.append(self.send(f"swap.{swap_name}", now, swap_value))

    net_io = psutil.net_io_counters(pernic=True, nowrap=True)
    duration_s = (now - self.prev_timestamp).s
    for nic_name, net_values in net_io.items():
        prev_net_values = self.prev_net_io[nic_name]
        send_metrics.extend(
            [
                self.send(
                    f"net.{nic_name}.sent.bytes",
                    now,
                    (net_values.bytes_sent - prev_net_values.bytes_sent) / duration_s,
                ),
                self.send(
                    f"net.{nic_name}.sent.packets",
                    now,
                    (net_values.packets_sent - prev_net_values.packets_sent)
                    / duration_s,
                ),
                self.send(
                    f"net.{nic_name}.recv.bytes",
                    now,
                    (net_values.bytes_recv - prev_net_values.bytes_recv) / duration_s,
                ),
                self.send(
                    f"net.{nic_name}.recv.packets",
                    now,
                    (net_values.packets_recv - prev_net_values.packets_recv)
                    / duration_s,
                ),
            ]
        )

    disk_io = psutil.disk_io_counters(perdisk=True, nowrap=True)
    for disk_name, disk_values in disk_io.items():
        prev_disk_values = self.prev_disk_io[disk_name]
        send_metrics.extend(
            [
                self.send(
                    f"disk.{disk_name}.written.bytes",
                    now,
                    (disk_values.write_bytes - prev_disk_values.write_bytes)
                    / duration_s,
                ),
                self.send(
                    f"disk.{disk_name}.written.count",
                    now,
                    (disk_values.write_count - prev_disk_values.write_count)
                    / duration_s,
                ),
                self.send(
                    f"disk.{disk_name}.read.bytes",
                    now,
                    (disk_values.read_bytes - prev_disk_values.read_bytes)
                    / duration_s,
                ),
                self.send(
                    f"disk.{disk_name}.read.count",
                    now,
                    (disk_values.read_count - prev_disk_values.read_count)
                    / duration_s,
                ),
            ]
        )

    self.prev_disk_io = disk_io
    self.prev_net_io = net_io
    self.prev_timestamp = now

    await asyncio.gather(*send_metrics)
async def _on_config(self, **config):
    logger.info("config: {}", config)
    rate = config["rate"]
    self.period = Timedelta.from_s(1 / rate)

    try:
        self.prefix = config["prefix"]
        if self.prefix != "" and not self.prefix.endswith("."):
            self.prefix = self.prefix + "."
    except KeyError:
        logger.info("No explicit prefix given, using hostname")
        self.prefix = socket.gethostname() + "."

    meta = dict()

    # Initialize CPU usage: the first psutil.cpu_percent() call primes the counters.
    psutil.cpu_percent(percpu=True)
    meta["cpu.usage"] = {
        "rate": rate,
        "description": "CPU usage (100% = 1 logical CPU busy)",
        "unit": "%",
    }

    # Initialize memory
    for mem_name in psutil.virtual_memory()._fields:
        meta[f"mem.{mem_name}"] = {
            "rate": rate,
            "description": "See https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory",
            "unit": "%" if mem_name == "percent" else "B",
        }
    for swap_name in psutil.swap_memory()._fields:
        meta[f"swap.{swap_name}"] = {
            "rate": rate,
            "description": "See https://psutil.readthedocs.io/en/latest/#psutil.swap_memory",
            "unit": "%" if swap_name == "percent" else "B",
        }

    # Network
    self.prev_net_io = psutil.net_io_counters(pernic=True, nowrap=True)
    self.prev_timestamp = Timestamp.now()
    for nic_name in self.prev_net_io.keys():
        for sr in "sent", "recv":
            meta[f"net.{nic_name}.{sr}.bytes"] = {
                "rate": rate,
                "description": f"Total data {sr} on nic {nic_name}",
                "unit": "B/s",
            }
            meta[f"net.{nic_name}.{sr}.packets"] = {
                "rate": rate,
                "description": f"Number of packets {sr} on nic {nic_name}",
                "unit": "Hz",
            }

    # Disk
    self.prev_disk_io = psutil.disk_io_counters(perdisk=True, nowrap=True)
    for disk_name in self.prev_disk_io.keys():
        for rw in "read", "written":
            meta[f"disk.{disk_name}.{rw}.count"] = {
                "rate": rate,
                "description": f"Number of {rw}s on partition {disk_name}",
                "unit": "Hz",
            }
            meta[f"disk.{disk_name}.{rw}.bytes"] = {
                "rate": rate,
                "description": f"Total data {rw} on partition {disk_name}",
                "unit": "B/s",
            }

    await self.declare_metrics(
        {self.prefix + key: value for key, value in meta.items()}
    )
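# Aside: a sketch of a configuration dict that _on_config() above accepts.
# "rate" is required (samples per second, period = 1 / rate); "prefix" is
# optional and falls back to the hostname. The values shown are illustrative.
example_config = {
    "rate": 1.0,          # one sample per second
    "prefix": "myhost",   # a trailing "." is appended automatically
}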
async def _worker_task(self, object_group, worker_task_stop_future):
    start_time = Timestamp.now()
    interval = object_group["interval"]
    device_address_str = object_group["device_address_str"]
    object_type = object_group["object_type"]
    objects = [
        (object_type, instance) for instance in object_group["object_instances"]
    ]
    chunk_size = object_group.get("chunk_size")

    logger.debug(
        f"starting BACnetSource worker task for device {device_address_str}"
    )
    logger.debug(
        "This is {} the main thread.",
        "" if threading.current_thread() == threading.main_thread() else "not",
    )

    # Wait for a random time between 10 ms and 10.01 s to spread out the start
    # of the worker tasks.
    random_wait_time = random.random() * 10 + 0.01
    await asyncio.sleep(random_wait_time)

    self._worker_tasks_count_starting += 1

    await self.event_loop.run_in_executor(
        None,
        functools.partial(
            self._bacnet_reader.request_device_properties,
            device_address_str,
            skip_when_cached=True,
            request_timeout=Timedelta.from_s(30),
        ),
    )
    await self.event_loop.run_in_executor(
        None,
        functools.partial(
            self._bacnet_reader.request_object_properties,
            device_address_str,
            objects,
            skip_when_cached=True,
            chunk_size=chunk_size,
            request_timeout=Timedelta.from_s(30),
        ),
    )

    device_info = self._bacnet_reader.get_device_info(
        device_address_str,
        device_identifier=object_group.get("device_identifier"),
    )
    if device_info is None:
        logger.error(
            "Missing device info for {}. Stopping worker task!", device_address_str
        )
        self._worker_tasks_count_failed += 1
        return

    device_name = self._object_name_vendor_specific_mapping.get(
        device_info["objectName"], device_info["objectName"]
    )
    device_name = substitute_all(
        device_name, self._object_name_vendor_specific_substitutions
    )

    metrics = {}
    missing_metrics = 0
    for object_instance in object_group["object_instances"]:
        metadata = {
            "rate": 1.0 / interval,
            "device": device_address_str,
            "objectType": object_type,
            "objectInstance": object_instance,
        }
        object_info = self._bacnet_reader.get_object_info(
            device_address_str, object_type, object_instance
        )
        if (
            object_info is None
            or "objectName" not in object_info
            or "description" not in object_info
        ):
            logger.error(
                "No object info for ({}, {}) of {} available!",
                object_type,
                object_instance,
                device_address_str,
            )
            missing_metrics += 1
            continue

        # Get the vendor-specific address from the object cache.
        object_name = object_info.get("3000", object_info["objectName"])
        object_name = self._object_name_vendor_specific_mapping.get(
            object_name, object_name
        )
        object_name = substitute_all(
            object_name, self._object_name_vendor_specific_substitutions
        )

        metric_id = (
            Template(object_group["metric_id"])
            .safe_substitute({"objectName": object_name, "deviceName": device_name})
            .replace("'", ".")
            .replace("`", ".")
            .replace("´", ".")
            .replace(" ", "")
        )

        if "description" in object_group:
            description = (
                Template(object_group["description"])
                .safe_substitute(
                    {
                        "objectName": object_name,
                        "objectDescription": object_info["description"],
                        "deviceName": device_name,
                        "deviceDescription": device_info["description"],
                    }
                )
                .replace("'", ".")
                .replace("`", ".")
                .replace("´", ".")
            )
            metadata["description"] = substitute_all(
                description, self._object_description_vendor_specific_substitutions
            )

        if "units" in object_info:
            metadata["unit"] = object_info["units"]

        metrics[metric_id] = metadata

    try:
        await self.declare_metrics(metrics)
    except RPCError:
        logger.exception(
            f"Can't declare metadata for device {device_address_str}. "
            "Stopping worker task!"
        )
        self._worker_tasks_count_failed += 1
        return

    segmentationSupport = "unknown"
    device_address = Address(device_address_str)
    device_info = self._bacnet_reader.deviceInfoCache.get_device_info(device_address)
    if device_info:
        segmentationSupport = device_info.segmentationSupported

    start_duration = Timestamp.now() - start_time
    logger.info(
        f"Started BACnetSource worker task for device {device_address_str}! "
        f"Took {start_duration.s - random_wait_time:.2f} s "
        f"(waited {random_wait_time:.2f} s), "
        f"{missing_metrics} metrics have no object info"
    )
    self._worker_tasks_count_running += 1

    deadline = Timestamp.now()
    while True:
        self._bacnet_reader.request_values(
            device_address_str, objects, chunk_size=chunk_size
        )

        if object_group.get("nan_at_timeout"):
            for metric_id in metrics:
                now = Timestamp.now()
                last_timestamp = self._last_time_send_by_metric.get(metric_id, now)
                if now - last_timestamp >= Timedelta.from_s(6 * interval):
                    timestamp_nan = last_timestamp + Timedelta.from_s(5 * interval)
                    await self.send(metric_id, timestamp_nan, float("nan"))
                    self._last_time_send_by_metric[metric_id] = timestamp_nan
                    logger.warning(
                        "Timeout for metric {} reached. Sending NaN! Device: {}",
                        metric_id,
                        device_address_str,
                    )

        try:
            deadline += Timedelta.from_s(interval)
            now = Timestamp.now()
            while now >= deadline:
                logger.warning(
                    "Missed deadline {}, it is now {}. Device: {}, {}, chunk size: {}",
                    deadline,
                    now,
                    device_address_str,
                    segmentationSupport,
                    chunk_size,
                )
                deadline += Timedelta.from_s(interval)

            timeout = (deadline - now).s
            await asyncio.wait_for(
                asyncio.shield(worker_task_stop_future), timeout=timeout
            )
            worker_task_stop_future.result()
            logger.info("stopping BACnetSource worker task")
            break
        except asyncio.TimeoutError:
            # This is the normal case, just continue with the loop.
            continue
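# Aside: the send loop above uses a fixed-deadline schedule: each iteration
# advances the deadline by the interval, skips any deadlines that were missed
# instead of bursting to catch up, and sleeps via asyncio.wait_for() on a
# shield()-ed stop future so the wait ends early when the task is told to
# stop. A self-contained sketch of that pattern (illustrative names only):
import asyncio


async def periodic(interval_s: float, stop_future: "asyncio.Future[None]") -> None:
    loop = asyncio.get_running_loop()
    deadline = loop.time()
    while True:
        print("tick")  # the actual work would happen here
        deadline += interval_s
        now = loop.time()
        while now >= deadline:  # fell behind: drop missed deadlines
            deadline += interval_s
        try:
            # shield() keeps the shared stop future alive when the timeout
            # cancels the inner wait; TimeoutError is the normal "next tick" case.
            await asyncio.wait_for(asyncio.shield(stop_future), timeout=deadline - now)
            break  # stop future resolved: leave the loop
        except asyncio.TimeoutError:
            continue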
async def task(self):
    self._main_task_stop_future = self.event_loop.create_future()
    logger.info(
        f"Current worker count (expected/starting/running/failed): "
        f"({self._worker_tasks_count_expected}/{self._worker_tasks_count_starting}/"
        f"{self._worker_tasks_count_running}/{self._worker_tasks_count_failed})"
    )
    last_state_log = Timestamp.now()

    while True:
        queue_get_task = asyncio.create_task(self._result_queue.get())
        done, pending = await asyncio.wait(
            {queue_get_task, self._main_task_stop_future},
            return_when=asyncio.FIRST_COMPLETED,
        )

        if queue_get_task in done:
            result: Tuple[Timestamp, str, str, Dict] = queue_get_task.result()
            timestamp, device_name, device_address_string, result_values = result
            device_config = self._device_config[device_address_string]

            device_name = self._object_name_vendor_specific_mapping.get(
                device_name, device_name
            )
            device_name = substitute_all(
                device_name, self._object_name_vendor_specific_substitutions
            )

            for object_name, object_result in result_values.items():
                object_name = self._object_name_vendor_specific_mapping.get(
                    object_name, object_name
                )
                object_name = substitute_all(
                    object_name, self._object_name_vendor_specific_substitutions
                )

                # TODO maybe support more placeholders
                metric_id = (
                    Template(device_config["metric_id"])
                    .safe_substitute(
                        {"objectName": object_name, "deviceName": device_name}
                    )
                    .replace("'", ".")
                    .replace("`", ".")
                    .replace("´", ".")
                    .replace(" ", "")
                )

                if "presentValue" in object_result and isinstance(
                    object_result["presentValue"], (int, float)
                ):
                    await self.send(
                        metric_id, timestamp, object_result["presentValue"]
                    )
                    self._last_time_send_by_metric[metric_id] = timestamp

            self._result_queue.task_done()

            if Timestamp.now() - last_state_log > Timedelta.from_string("5min"):
                logger.info(
                    f"Current worker count (expected/starting/running/failed): "
                    f"({self._worker_tasks_count_expected}/{self._worker_tasks_count_starting}/"
                    f"{self._worker_tasks_count_running}/{self._worker_tasks_count_failed})"
                )
                last_state_log = Timestamp.now()

        if self._main_task_stop_future in done:
            logger.info("stopping BACnetSource main task")
            break
        self.value = value

    async def task(self):
        await self.send(self.metric, time=self.timestamp, value=self.value)
        await self.stop()


@click.command()
@click_log.simple_verbosity_option(logger, default="warning")
@click.version_option(version=client_version)
@metricq_server_option()
@metricq_token_option(default="source-send")
@click.option(
    "--timestamp",
    type=TIMESTAMP,
    default=Timestamp.now(),
    show_default="now",
    help="Timestamp to send.",
)
@click.argument("metric", required=True)
@click.argument("value", required=True, type=float)
def main(server: str, token: str, timestamp: Timestamp, metric: Metric, value: float):
    """Send a single time-value pair for the given metric."""
    send = MetricQSend(
        token=token,
        management_url=server,
        metric=metric,
        timestamp=timestamp,
        value=value,
    )
    # Run the source; it sends the value in task() and then stops itself.
    send.run()