def send_cgroup_telemetry(self):
    """Report cgroup metrics and refresh the tracked cgroups once per period.

    ``self.last_cgroup_telemetry`` is initialised to the current time when it
    is still ``None``. The work below runs only once
    ``MonitorHandler.CGROUP_TELEMETRY_PERIOD`` has elapsed since that
    timestamp, and the timestamp is refreshed at the end of every run.

    Errors raised while collecting metrics or while updating the tracked
    extensions are logged as warnings and are not propagated.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # Gate on this task's own timestamp. Comparing against
    # last_telemetry_heartbeat tied the cadence to a different timer and
    # left last_cgroup_telemetry written but never read.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        collected = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in collected.items():
            for metric_group, metric_name, value in metrics:
                # Only strictly positive samples are reported.
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)
    except Exception as e:
        logger.warn("Failed to collect performance metrics: {0} [{1}]",
                    e, traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        handlers = self.protocol.client.get_current_handlers()
        CGroupsTelemetry.update_tracked(handlers)
    except Exception as e:
        logger.warn("Monitor: updating tracked extensions raised {0}: {1}",
                    e, traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def exercise_telemetry_instantiation(self, test_cgroup):
    """Track ``test_cgroup`` and check the metrics and limits collected for it.

    Asserts that the cgroup exposes 'cpu' and 'memory' entries, that it is
    tracked under its name, that exactly two metrics come back (one "Process"
    and/or "Memory" family entry each validated by name and minimum value),
    and that its limits object is a ``CGroupsLimits`` with positive CPU and
    memory limits.
    """
    extension = test_cgroup.name
    CGroupsTelemetry.track_cgroup(test_cgroup)

    for controller in ('cpu', 'memory'):
        self.assertIn(controller, test_cgroup.cgroups)
    self.assertTrue(CGroupsTelemetry.is_tracked(extension))

    # Presumably generates CPU usage so the sample below is non-zero;
    # verify against consume_cpu_time's definition.
    consume_cpu_time()
    time.sleep(1)

    all_metrics, all_limits = CGroupsTelemetry.collect_all_tracked()
    collected = all_metrics[extension]
    self.assertEqual(len(collected), 2)

    # metric family -> (expected metric name, value must be greater than)
    expectations = {
        "Process": ("% Processor Time", 0.0),
        "Memory": ("Total Memory Usage", 100000),
    }
    for family, name, value in collected:
        spec = expectations.get(family)
        if spec is None:
            self.fail("Unknown metric {0}/{1} value {2}".format(family, name, value))
        else:
            expected_name, lower_bound = spec
            self.assertEqual(name, expected_name)
            self.assertGreater(value, lower_bound)

    extension_limits = all_limits[extension]
    self.assertIsInstance(extension_limits, CGroupsLimits, msg="is not the correct instance")
    for limit in (extension_limits.cpu_limit, extension_limits.memory_limit):
        self.assertGreater(limit, 0.0)
def exercise_telemetry_instantiation(self, test_cgroup):
    """Track ``test_cgroup`` and verify what the telemetry collector returns.

    Checks, in order: the cgroup has 'cpu' and 'memory' entries; it is
    tracked under its own name; two metrics are collected for it, each with
    the expected name and a value above its floor; and the collected limits
    are a ``CGroupsLimits`` instance with positive CPU and memory limits.
    """
    name_under_test = test_cgroup.name
    CGroupsTelemetry.track_cgroup(test_cgroup)

    available = test_cgroup.cgroups
    self.assertIn('cpu', available)
    self.assertIn('memory', available)
    self.assertTrue(CGroupsTelemetry.is_tracked(name_under_test))

    # Presumably burns CPU so the processor metric is non-zero - confirm
    # against consume_cpu_time's definition.
    consume_cpu_time()
    time.sleep(1)

    metrics_by_name, limits_by_name = CGroupsTelemetry.collect_all_tracked()

    samples = metrics_by_name[name_under_test]
    self.assertEqual(len(samples), 2)
    for family, metric, reading in samples:
        if family == "Memory":
            self.assertEqual(metric, "Total Memory Usage")
            self.assertGreater(reading, 100000)
        elif family == "Process":
            self.assertEqual(metric, "% Processor Time")
            self.assertGreater(reading, 0.0)
        else:
            self.fail("Unknown metric {0}/{1} value {2}".format(family, metric, reading))

    limits_under_test = limits_by_name[name_under_test]
    self.assertIsInstance(limits_under_test, CGroupsLimits, msg="is not the correct instance")
    self.assertGreater(limits_under_test.cpu_limit, 0.0)
    self.assertGreater(limits_under_test.memory_limit, 0.0)
def send_cgroup_telemetry(self):
    """Report cgroup metrics, flag crossed thresholds, refresh tracked cgroups.

    ``self.last_cgroup_telemetry`` is initialised to the current time when it
    is still ``None``. The work below runs only once
    ``MonitorHandler.CGROUP_TELEMETRY_PERIOD`` has elapsed since that
    timestamp, and the timestamp is refreshed at the end of every run.

    For every tracked cgroup, each strictly positive metric is reported, and
    a ``CGroupsLimitsCrossed`` event is added when a "Memory" or "Process"
    metric reaches the cgroup's ``"memory"`` / ``"cpu"`` threshold.

    Errors raised while collecting metrics or while updating the tracked
    extensions are logged (warning plus verbose traceback) and are not
    propagated.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # Gate on this task's own timestamp. Comparing against
    # last_telemetry_heartbeat tied the cadence to a different timer and
    # left last_cgroup_telemetry written but never read.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                # Build the message for whichever threshold (if any) this
                # metric crossed; the event itself is emitted once below.
                msg = None
                if metric_group == "Memory":
                    limit = thresholds["memory"]
                    if value >= limit:
                        msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, limit)
                elif metric_group == "Process":
                    limit = thresholds["cpu"]
                    if value >= limit:
                        msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                            cgroup_name, value, limit)

                if msg is not None:
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.CGroupsLimitsCrossed,
                              is_success=True,
                              message=msg,
                              log_event=True)
    except Exception as e:
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        handlers = self.protocol.client.get_current_handlers()
        CGroupsTelemetry.update_tracked(handlers)
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def send_cgroup_telemetry(self):
    """Report cgroup metrics, flag crossed limits, refresh tracked cgroups.

    ``self.last_cgroup_telemetry`` is initialised to the current time when it
    is still ``None``. The work below runs only once
    ``MonitorHandler.CGROUP_TELEMETRY_PERIOD`` has elapsed since that
    timestamp, and the timestamp is refreshed at the end of every run.

    For every tracked cgroup, each strictly positive metric is reported. When
    a "Memory" or "Process" metric reaches the cgroup's ``memory_limit`` /
    ``cpu_limit``, a warning is logged and a ``CGroupsLimitsCrossed`` event
    is added.

    Errors raised while collecting metrics or while updating the tracked
    extensions are logged (warning plus verbose traceback) and are not
    propagated.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # Gate on this task's own timestamp. Comparing against
    # last_telemetry_heartbeat tied the cadence to a different timer and
    # left last_cgroup_telemetry written but never read.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in metric_reported.items():
            thresholds = metric_threshold[cgroup_name]

            for metric_group, metric_name, value in metrics:
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)

                # Build the message for whichever limit (if any) this metric
                # crossed; logging and the event are emitted once below.
                msg = None
                if metric_group == "Memory":
                    # Memory is collected in bytes, and limit is set in megabytes.
                    # NOTE(review): _format_memory_value presumably converts the
                    # megabyte limit to bytes for this comparison - confirm.
                    if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                        msg = ("CGroup {0}: Crossed the Memory Threshold. "
                               "Current Value: {1} bytes, Threshold: {2} megabytes."
                               .format(cgroup_name, value, thresholds.memory_limit))
                elif metric_group == "Process":
                    if value >= thresholds.cpu_limit:
                        msg = ("CGroup {0}: Crossed the Processor Threshold. "
                               "Current Value: {1}, Threshold: {2}."
                               .format(cgroup_name, value, thresholds.cpu_limit))

                if msg is not None:
                    logger.warn(msg)
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.CGroupsLimitsCrossed,
                              is_success=True,
                              message=msg,
                              log_event=True)
    except Exception as e:
        logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        handlers = self.protocol.client.get_current_handlers()
        CGroupsTelemetry.update_tracked(handlers)
    except Exception as e:
        logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
        logger.verbose(traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()
def send_cgroup_telemetry(self):
    """Report cgroup metrics and refresh the tracked cgroups once per period.

    ``self.last_cgroup_telemetry`` is initialised to the current time when it
    is still ``None``. The work below runs only once
    ``MonitorHandler.CGROUP_TELEMETRY_PERIOD`` has elapsed since that
    timestamp, and the timestamp is refreshed at the end of every run.

    Errors raised while collecting metrics or while updating the tracked
    extensions are logged as warnings and are not propagated.
    """
    now = datetime.datetime.utcnow()
    if self.last_cgroup_telemetry is None:
        self.last_cgroup_telemetry = now

    # Gate on this task's own timestamp. Comparing against
    # last_telemetry_heartbeat tied the cadence to a different timer and
    # left last_cgroup_telemetry written but never read.
    if now < self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD:
        return

    try:
        collected = CGroupsTelemetry.collect_all_tracked()
        for cgroup_name, metrics in collected.items():
            for metric_group, metric_name, value in metrics:
                # Only strictly positive samples are reported.
                if value > 0:
                    report_metric(metric_group, metric_name, cgroup_name, value)
    except Exception as e:
        logger.warn("Failed to collect performance metrics: {0} [{1}]",
                    e, traceback.format_exc())

    # Look for extension cgroups we're not already tracking and track them
    try:
        handlers = self.protocol.client.get_current_handlers()
        CGroupsTelemetry.update_tracked(handlers)
    except Exception as e:
        logger.warn("Monitor: updating tracked extensions raised {0}: {1}",
                    e, traceback.format_exc())

    self.last_cgroup_telemetry = datetime.datetime.utcnow()