def run(self):
    """Run the collect / store / detect loop until asked to finish.

    Before looping, two pre-flight checks are made:

    * when ``self.rdt_enabled`` is set, ``check_resctrl()`` must succeed
      (otherwise the method returns immediately) and ``cleanup_resctrl()``
      is called; when it is not set only a warning is logged,
    * ``are_privileges_sufficient()`` must succeed, otherwise a critical
      message is logged and the method returns.

    Each iteration then:

    1. fetches tasks from ``self.node`` and syncs ``self.containers``,
    2. collects platform, internal and per-task metrics, decorates them
       with labels and stores them in ``self.metrics_storage``,
    3. passes the measurements to ``self.detector.detect`` and stores the
       resulting anomaly, extra and statistics metrics in
       ``self.anomalies_storage``,
    4. calls ``self.wait_or_finish()`` and stops when it returns a falsy
       value.

    After the loop every container in ``self.containers`` is cleaned up.
    """
    if not self.rdt_enabled:
        log.warning('Rdt disabled. Skipping collecting measurements '
                    'and resctrl synchronization')
    elif not check_resctrl():
        # RDT was requested but the resctrl check failed - nothing to do.
        return
    else:
        # Resctrl is enabled and available - cleanup previous runs.
        cleanup_resctrl()

    if not are_privileges_sufficient():
        log.critical(
            "Impossible to use perf_event_open. You need to: adjust "
            "/proc/sys/kernel/perf_event_paranoid; or has CAP_DAC_OVERRIDE capability"
            " set. You can run process as root too. See man 2 perf_event_open for "
            "details.")
        return

    keep_running = True
    while keep_running:
        # Discover tasks on the node and align managed containers with them.
        current_tasks = self.node.get_tasks()
        self._sync_containers_state(current_tasks)

        # Owca internal metrics.
        up_metric = Metric(name='owca_up', type=MetricType.COUNTER,
                           value=time.time())
        tasks_count_metric = Metric(name='owca_tasks', type=MetricType.GAUGE,
                                    value=len(current_tasks))
        internal_metrics = [up_metric, tasks_count_metric]

        # Platform information.
        platform_info = platforms.collect_platform_information()
        platform, platform_metrics, platform_labels = platform_info

        # Labels attached to every metric emitted in this iteration.
        common_labels = dict(platform_labels, **self.extra_labels)
        for node_metric in platform_metrics + internal_metrics:
            node_metric.labels.update(common_labels)

        # Per-task aggregates, all keyed by task id.
        tasks_measurements: TasksMeasurements = {}
        tasks_resources: TasksResources = {}
        tasks_labels: TasksLabels = {}
        tasks_metrics: List[Metric] = []

        for task, container in self.containers.items():
            task_id = task.task_id

            # Measurements of a single task and the metrics built from them.
            measurements = container.get_measurements()
            metrics_of_task = create_metrics(measurements)

            # Task labels: sanitized task metadata labels plus the task id.
            labels_of_task = {}
            for raw_key, raw_value in task.labels.items():
                labels_of_task[sanitize_mesos_label(raw_key)] = raw_value
            labels_of_task['task_id'] = task_id

            # Task-scoped decoration: common labels first, then task labels.
            for single_metric in metrics_of_task:
                single_metric.labels.update(common_labels)
                single_metric.labels.update(labels_of_task)

            # Aggregate over all tasks.
            tasks_labels[task_id] = labels_of_task
            tasks_measurements[task_id] = measurements
            tasks_resources[task_id] = task.resources
            tasks_metrics.extend(metrics_of_task)

        collected_metrics = platform_metrics + tasks_metrics + internal_metrics
        self.metrics_storage.store(collected_metrics)

        anomalies, extra_metrics = self.detector.detect(
            platform, tasks_measurements, tasks_resources, tasks_labels)
        log.debug('Anomalies detected: %d', len(anomalies))

        # Note: anomaly metrics include metrics found in ContentionAnomaly.metrics.
        anomaly_metrics = convert_anomalies_to_metrics(anomalies, tasks_labels)

        # Extra anomaly statistics: running total and time of the last hit.
        found_now = len(anomalies)
        if found_now:
            self.anomaly_last_occurence = time.time()
            self.anomaly_counter += found_now

        statistics_metrics = [
            Metric(name='anomaly_count', type=MetricType.COUNTER,
                   value=self.anomaly_counter),
        ]
        if self.anomaly_last_occurence:
            # Emitted only once the attribute holds a truthy timestamp.
            statistics_metrics.append(
                Metric(name='anomaly_last_occurence', type=MetricType.COUNTER,
                       value=self.anomaly_last_occurence))

        # Decorate anomaly, extra and statistics metrics with common labels.
        detection_metrics = anomaly_metrics + extra_metrics + statistics_metrics
        for detection_metric in detection_metrics:
            detection_metric.labels.update(common_labels)
        self.anomalies_storage.store(detection_metrics)

        # A falsy result from wait_or_finish() ends the loop.
        keep_running = self.wait_or_finish()

    # Loop finished - release every managed container.
    for managed_container in self.containers.values():
        managed_container.cleanup()
def test_check_resctrl(*mock):
    """Assert that ``check_resctrl()`` returns a truthy value.

    Any positional arguments are accepted and ignored.
    NOTE(review): presumably these are mocks injected by patch decorators
    that are not visible in this chunk - confirm against the full file.
    """
    resctrl_ok = check_resctrl()
    assert resctrl_ok