Example #1
# Note: the mock arguments are expected to be injected by @patch decorators
# (e.g. for os.path.exists, os.path.isdir, os.rmdir and os.listdir) that are
# omitted from this excerpt.
def test_clean_resctrl(exists_mock, isdir_mock, rmdir_mock, listdir_mock):
    from unittest.mock import call

    from owca.resctrl import cleanup_resctrl
    cleanup_resctrl()
    assert listdir_mock.call_count == 2
    assert isdir_mock.call_count == 6
    assert exists_mock.call_count == 6
    assert rmdir_mock.call_count == 2
    rmdir_mock.assert_has_calls([
        call('/sys/fs/resctrl/mesos-2'),
        call('/sys/fs/resctrl/mon_groups/mesos-2'),
    ])
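For reference, unittest.mock applies stacked @patch decorators bottom-up, so the lowest decorator supplies the first mock argument; that is why exists_mock comes first in the signature above. The following is a minimal, self-contained sketch of that ordering convention only; the patch targets, return value and test name here are assumptions for illustration, not the actual fixture used by the owca test suite.

from unittest.mock import call, patch


@patch('os.listdir', return_value=['mesos-2'])   # assumed return value
@patch('os.rmdir')
@patch('os.path.isdir', return_value=True)
@patch('os.path.exists', return_value=True)
def test_patch_ordering(exists_mock, isdir_mock, rmdir_mock, listdir_mock):
    # The bottom-most @patch ('os.path.exists') maps to the first argument,
    # the top-most ('os.listdir') to the last one.
    import os
    os.rmdir('/sys/fs/resctrl/mesos-2')
    rmdir_mock.assert_has_calls([call('/sys/fs/resctrl/mesos-2')])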
Example #2
    def run(self):
        if self.rdt_enabled and not check_resctrl():
            return
        elif not self.rdt_enabled:
            log.warning('RDT disabled. Skipping measurement collection '
                        'and resctrl synchronization.')
        else:
            # Resctrl is enabled and available - cleanup previous runs.
            cleanup_resctrl()

        if not are_privileges_sufficient():
            log.critical(
                "Cannot use perf_event_open. You need to adjust "
                "/proc/sys/kernel/perf_event_paranoid, have the CAP_DAC_OVERRIDE "
                "capability set, or run the process as root. "
                "See man 2 perf_event_open for details.")
            return

        while True:
            # Collect information about tasks running on node.
            tasks = self.node.get_tasks()

            # Keep sync of found tasks and internally managed containers.
            self._sync_containers_state(tasks)

            # Owca internal metrics.
            internal_metrics = [
                Metric(name='owca_up',
                       type=MetricType.COUNTER,
                       value=time.time()),
                Metric(name='owca_tasks',
                       type=MetricType.GAUGE,
                       value=len(tasks)),
            ]

            # Platform information
            platform, platform_metrics, platform_labels = \
                platforms.collect_platform_information()

            # Common labels
            common_labels = dict(platform_labels, **self.extra_labels)

            # Update platform_metrics with common labels.
            for metric in platform_metrics + internal_metrics:
                metric.labels.update(common_labels)

            # Build labeled tasks_metrics and task_metrics_values.
            tasks_measurements: TasksMeasurements = {}
            tasks_resources: TasksResources = {}
            tasks_labels: TasksLabels = {}
            tasks_metrics: List[Metric] = []
            for task, container in self.containers.items():
                # Single task data
                task_measurements = container.get_measurements()
                task_metrics = create_metrics(task_measurements)
                # Prepare tasks labels based on Mesos tasks metadata labels and task id.
                task_labels = {
                    sanitize_mesos_label(label_key): label_value
                    for label_key, label_value in task.labels.items()
                }
                task_labels['task_id'] = task.task_id

                # Task scoped label decoration.
                for task_metric in task_metrics:
                    task_metric.labels.update(common_labels)
                    task_metric.labels.update(task_labels)

                # Aggregate over all tasks.
                tasks_labels[task.task_id] = task_labels
                tasks_measurements[task.task_id] = task_measurements
                tasks_resources[task.task_id] = task.resources
                tasks_metrics += task_metrics

            self.metrics_storage.store(platform_metrics + tasks_metrics +
                                       internal_metrics)

            anomalies, extra_metrics = self.detector.detect(
                platform, tasks_measurements, tasks_resources, tasks_labels)

            log.debug('Anomalies detected: %d', len(anomalies))

            # Note: anomaly metrics include metrics found in ContentionAnomaly.metrics.
            anomaly_metrics = convert_anomalies_to_metrics(
                anomalies, tasks_labels)

            # Extra anomaly statistics
            if len(anomalies):
                self.anomaly_last_occurence = time.time()
                self.anomaly_counter += len(anomalies)

            statistics_metrics = [
                Metric(name='anomaly_count',
                       type=MetricType.COUNTER,
                       value=self.anomaly_counter),
            ]
            if self.anomaly_last_occurence:
                statistics_metrics.extend([
                    Metric(name='anomaly_last_occurence',
                           type=MetricType.COUNTER,
                           value=self.anomaly_last_occurence),
                ])

            # Update anomaly & extra metrics with common labels.
            for metric in anomaly_metrics + extra_metrics + statistics_metrics:
                metric.labels.update(common_labels)

            self.anomalies_storage.store(anomaly_metrics + extra_metrics +
                                         statistics_metrics)

            if not self.wait_or_finish():
                break

        # cleanup
        for container in self.containers.values():
            container.cleanup()
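The label-decoration pattern in run() (updating each Metric's labels dict with the common labels before storing) can be illustrated with a minimal stand-in. The Metric class and the label values below are simplified assumptions for the sketch, not owca's actual definitions.

import time
from dataclasses import dataclass, field
from typing import Dict


@dataclass
class Metric:
    # Simplified stand-in for owca's Metric type (assumption, not the real class).
    name: str
    value: float
    type: str = 'gauge'
    labels: Dict[str, str] = field(default_factory=dict)


internal_metrics = [
    Metric(name='owca_up', value=time.time(), type='counter'),
    Metric(name='owca_tasks', value=3, type='gauge'),
]

# Hypothetical common labels (platform labels merged with extra labels).
common_labels = {'host': 'node-1', 'env': 'test'}

# Same decoration step as in run(): every metric gets the shared labels
# before being passed to the storage backend.
for metric in internal_metrics:
    metric.labels.update(common_labels)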