Example No. 1
def test_sanitize_labels(label_key, expected_label_key):
    assert sanitize_mesos_label(label_key) == expected_label_key
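The snippet above omits the imports and the parametrize decorator that feed label_key / expected_label_key into the test. A minimal, self-contained sketch of how it could be wired up; the import path and the example label pairs are assumptions for illustration, not data taken from the project:

import pytest

from owca.mesos import sanitize_mesos_label  # assumed import path


@pytest.mark.parametrize('label_key,expected_label_key', [
    # Illustrative pairs only: assumes the Aurora metadata prefix is stripped.
    ('org.apache.aurora.metadata.application', 'application'),
    ('org.apache.aurora.metadata.load_generator', 'load_generator'),
])
def test_sanitize_labels(label_key, expected_label_key):
    assert sanitize_mesos_label(label_key) == expected_label_key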
Example No. 2
    def run(self):
        if self.rdt_enabled and not check_resctrl():
            return
        elif not self.rdt_enabled:
            log.warning('RDT disabled. Skipping resctrl synchronization '
                        'and measurement collection.')
        else:
            # Resctrl is enabled and available - cleanup previous runs.
            cleanup_resctrl()

        if not are_privileges_sufficient():
            log.critical(
                "Impossible to use perf_event_open. You need to: adjust "
                "/proc/sys/kernel/perf_event_paranoid; or has CAP_DAC_OVERRIDE capability"
                " set. You can run process as root too. See man 2 perf_event_open for "
                "details.")
            return

        while True:
            # Collect information about tasks running on node.
            tasks = self.node.get_tasks()

            # Keep the discovered tasks in sync with internally managed containers.
            self._sync_containers_state(tasks)

            # Owca internal metrics.
            internal_metrics = [
                Metric(name='owca_up',
                       type=MetricType.COUNTER,
                       value=time.time()),
                Metric(name='owca_tasks',
                       type=MetricType.GAUGE,
                       value=len(tasks)),
            ]

            # Platform information
            platform, platform_metrics, platform_labels = \
                platforms.collect_platform_information()

            # Common labels
            common_labels = dict(platform_labels, **self.extra_labels)

            # Update platform and internal metrics with common labels.
            for metric in platform_metrics + internal_metrics:
                metric.labels.update(common_labels)

            # Build per-task measurements, resources, labels and labeled task metrics.
            tasks_measurements: TasksMeasurements = {}
            tasks_resources: TasksResources = {}
            tasks_labels: TasksLabels = {}
            tasks_metrics: List[Metric] = []
            for task, container in self.containers.items():
                # Single task data
                task_measurements = container.get_measurements()
                task_metrics = create_metrics(task_measurements)
                # Prepare tasks labels based on Mesos tasks metadata labels and task id.
                task_labels = {
                    sanitize_mesos_label(label_key): label_value
                    for label_key, label_value in task.labels.items()
                }
                task_labels['task_id'] = task.task_id

                # Task scoped label decoration.
                for task_metric in task_metrics:
                    task_metric.labels.update(common_labels)
                    task_metric.labels.update(task_labels)

                # Aggregate over all tasks.
                tasks_labels[task.task_id] = task_labels
                tasks_measurements[task.task_id] = task_measurements
                tasks_resources[task.task_id] = task.resources
                tasks_metrics += task_metrics

            self.metrics_storage.store(platform_metrics + tasks_metrics +
                                       internal_metrics)

            anomalies, extra_metrics = self.detector.detect(
                platform, tasks_measurements, tasks_resources, tasks_labels)

            log.debug('Anomalies detected: %d', len(anomalies))

            # Note: anomaly metrics include metrics found in ContentionAnomaly.metrics.
            anomaly_metrics = convert_anomalies_to_metrics(
                anomalies, tasks_labels)

            # Extra anomaly statistics
            if anomalies:
                self.anomaly_last_occurence = time.time()
                self.anomaly_counter += len(anomalies)

            statistics_metrics = [
                Metric(name='anomaly_count',
                       type=MetricType.COUNTER,
                       value=self.anomaly_counter),
            ]
            if self.anomaly_last_occurence:
                statistics_metrics.extend([
                    Metric(name='anomaly_last_occurence',
                           type=MetricType.COUNTER,
                           value=self.anomaly_last_occurence),
                ])

            # Update anomaly & extra metrics with common labels.
            for metric in anomaly_metrics + extra_metrics + statistics_metrics:
                metric.labels.update(common_labels)

            self.anomalies_storage.store(anomaly_metrics + extra_metrics +
                                         statistics_metrics)

            if not self.wait_or_finish():
                break

        # cleanup
        for container in self.containers.values():
            container.cleanup()
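The loop above decorates metrics by mutating metric.labels after construction and stores plain lists of Metric objects. A minimal sketch of Metric and MetricType definitions compatible with that usage, assuming a dataclass with a mutable labels dict; the real owca.metrics module may differ:

from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict


class MetricType(str, Enum):
    GAUGE = 'gauge'
    COUNTER = 'counter'


@dataclass
class Metric:
    name: str
    value: Any = None
    labels: Dict[str, str] = field(default_factory=dict)
    type: MetricType = None


# run() mutates labels in place after the metrics are created, e.g.:
# for m in platform_metrics + internal_metrics:
#     m.labels.update(common_labels)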
Example No. 3
def test_runner_containers_state(*mocks):
    """Tests proper interaction between runner instance and functions for
    creating anomalies and calculating the desired state.

    Also tests the labelling of metrics during the iteration loop.
    """

    # Task labels
    task_labels = {
        'org.apache.aurora.metadata.application': 'redis',
        'org.apache.aurora.metadata.load_generator': 'rpc-perf',
        'org.apache.aurora.metadata.name': 'redis--6792',
    }
    task_labels_sanitized = {
        sanitize_mesos_label(label_key): label_value
        for label_key, label_value in task_labels.items()
    }
    task_labels_sanitized_with_task_id = {'task_id': 'task-id-/t1'}
    task_labels_sanitized_with_task_id.update(task_labels_sanitized)

    # Node mock
    node_mock = Mock(
        spec=MesosNode,
        get_tasks=Mock(return_value=[
            task('/t1', resources=dict(cpus=8.), labels=task_labels)
        ]))

    # Storage mocks
    metrics_storage = Mock(spec=storage.Storage, store=Mock())
    anomalies_storage = Mock(spec=storage.Storage, store=Mock())

    # Detector mock - simulate returning one anomaly and additional metric
    detector_mock = Mock(
        spec=AnomalyDetector,
        detect=Mock(return_value=(
            [
                anomaly('task1', ['task2'],
                        metrics=[metric('contention_related_metric')])
            ],  # one anomaly + related metric
            [metric('bar')]  # one extra metric
        )))

    extra_labels = dict(el='ev')  # extra label with some extra value

    runner = DetectionRunner(
        node=node_mock,
        metrics_storage=metrics_storage,
        anomalies_storage=anomalies_storage,
        detector=detector_mock,
        rdt_enabled=False,
        extra_labels=extra_labels,
    )

    # Mock to finish after one iteration.
    runner.wait_or_finish = Mock(return_value=False)

    platform_mock = Mock(spec=platforms.Platform)
    with patch('owca.platforms.collect_platform_information',
               return_value=(platform_mock, [metric('platform-cpu-usage')],
                             {})):
        runner.run()

    # The store() method was called once on each storage:
    # 1. metrics_storage, before detect(), with the state of the environment.
    metrics_storage.store.assert_called_once_with([
        metric('platform-cpu-usage',
               labels=extra_labels),  # Store metrics from platform ...
        Metric(name='cpu_usage',
               value=23,
               labels=dict(extra_labels,
                           **task_labels_sanitized_with_task_id)),
        Metric('owca_up',
               type=MetricType.COUNTER,
               value=1234567890.123,
               labels=extra_labels),
        Metric('owca_tasks',
               type=MetricType.GAUGE,
               value=1,
               labels=extra_labels),
    ])  # and task

    # 2. anomalies_storage, after detect(), with information about the detected anomalies.
    expected_anomaly_metrics = anomaly_metrics('task1', ['task2'])
    for m in expected_anomaly_metrics:
        m.labels.update(extra_labels)

    expected_anomaly_metrics.extend([
        metric('contention_related_metric',
               labels=dict({
                   'uuid': 'fake-uuid',
                   'type': 'anomaly'
               }, **extra_labels)),
        metric('bar', extra_labels),
        Metric('anomaly_count',
               type=MetricType.COUNTER,
               value=1,
               labels=extra_labels),
        Metric('anomaly_last_occurence',
               type=MetricType.COUNTER,
               value=1234567890.123,
               labels=extra_labels),
    ])
    anomalies_storage.store.assert_called_once_with(expected_anomaly_metrics)

    # Check that detector was called with proper arguments.
    detector_mock.detect.assert_called_once_with(
        platform_mock,
        {'task-id-/t1': {'cpu_usage': 23}},
        {'task-id-/t1': {'cpus': 8}},
        {'task-id-/t1': task_labels_sanitized_with_task_id})

    # assert expected state (new container based on first task /t1)
    assert runner.containers == {
        task('/t1', resources=dict(cpus=8.), labels=task_labels): container('/t1')
    }

    runner.wait_or_finish.assert_called_once()
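The bare *mocks parameter suggests the original test is wrapped in patch decorators that were omitted from this snippet. A hedged sketch of the kind of patching that would make the asserted values deterministic; the patch targets below are assumptions inferred from the assertions (time.time() fixed at 1234567890.123, cpu_usage measured as 23), not the project's actual decorators:

from unittest.mock import patch

# Illustrative decorators only; the real test may patch different targets.
@patch('time.time', return_value=1234567890.123)  # fixes owca_up / anomaly_last_occurence
@patch('owca.runner.are_privileges_sufficient', return_value=True)  # assumed module path
@patch('owca.containers.Container.get_measurements',  # assumed module path
       return_value=dict(cpu_usage=23))
def test_runner_containers_state(*mocks):
    ...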