Example #1
0
 def test_chunk_samples(self):
     """A collector with a tiny gRPC msg cap should split samples into 2 chunks."""
     # Collector configured with a ~0.01 MB max message size so that the
     # generated sample set cannot fit in a single container.
     tiny_msg_collector = MetricsCollector(
         self._services,
         5,
         10,
         self.timeout,
         grpc_max_msg_size_mb=0.01,
         loop=asyncio.new_event_loop(),
     )
     generated = self._generate_samples(2000)
     chunks = list(tiny_msg_collector._chunk_samples(generated))
     self.assertEqual(len(chunks), 2)
Example #2
0
    def setUp(self):
        """Register a stub 'test' service and build the collector under test."""
        ServiceRegistry.add_service('test', '0.0.0.0', 0)
        ServiceRegistry._PROXY_CONFIG = {
            'local_port': 1234,
            'cloud_address': 'test',
            'proxy_cloud_connections': True,
        }

        self._services = ['test']
        self.gateway_id = "2876171d-bf38-4254-b4da-71a713952904"
        self.queue_length = 5
        self.timeout = 1
        # Collector with a 4 MB gRPC cap, driven by a throwaway event loop.
        self._collector = MetricsCollector(
            self._services,
            5,
            10,
            self.timeout,
            grpc_max_msg_size_mb=4,
            queue_length=self.queue_length,
            loop=asyncio.new_event_loop(),
        )
Example #3
0
def main():
    """
    Main magmad entry point.

    Wires together and starts every magmad subsystem — service polling and
    management, metrics collection, the cloud config streamer and sync-RPC
    clients, bootstrap, state reporting, health watchdogs, upgrade loops,
    the magmad gRPC servicer, and (optionally) the systemd watchdog — then
    runs the service event loop until shutdown.
    """
    service = MagmaService('magmad', mconfigs_pb2.MagmaD())

    # Optionally pipe errors to Sentry
    sentry_init(service_name=service.name)

    logging.info('Starting magmad for UUID: %s', snowflake.make_snowflake())

    # Create service manager
    services = service.config.get('magma_services')
    init_system = service.config.get('init_system', 'systemd')
    registered_dynamic_services = service.config.get(
        'registered_dynamic_services',
        [],
    )
    # Dynamic services come from the mconfig; fall back to none when no
    # mconfig is available (e.g. before the first cloud sync).
    enabled_dynamic_services = []
    if service.mconfig is not None:
        enabled_dynamic_services = service.mconfig.dynamic_services

    # Poll the services' Service303 interface
    service_poller = ServicePoller(
        service.loop,
        service.config,
        enabled_dynamic_services,
    )
    service_poller.start()

    service_manager = ServiceManager(
        services,
        init_system,
        service_poller,
        registered_dynamic_services,
        enabled_dynamic_services,
    )

    # Get metrics service config
    metrics_config = service.config.get('metricsd')
    metrics_services = metrics_config['services']
    collect_interval = metrics_config['collect_interval']
    sync_interval = metrics_config['sync_interval']
    grpc_timeout = metrics_config['grpc_timeout']
    # Cap on outbound metrics containers; defaults to 4 MB when unset.
    grpc_msg_size = metrics_config.get('max_grpc_msg_size_mb', 4)
    metrics_post_processor_fn = metrics_config.get('post_processing_fn')

    metric_scrape_targets = [
        ScrapeTarget(t['url'], t['name'], t['interval'])
        for t in metrics_config.get('metric_scrape_targets', [])
    ]

    # Create local metrics collector
    metrics_collector = MetricsCollector(
        services=metrics_services,
        collect_interval=collect_interval,
        sync_interval=sync_interval,
        grpc_timeout=grpc_timeout,
        grpc_max_msg_size_mb=grpc_msg_size,
        loop=service.loop,
        post_processing_fn=get_metrics_postprocessor_fn(
            metrics_post_processor_fn, ),
        scrape_targets=metric_scrape_targets,
    )

    # Poll and sync the metrics collector loops
    metrics_collector.run()

    # Start a background thread to stream updates from the cloud
    stream_client = None
    if service.config.get('enable_config_streamer', False):
        stream_client = StreamerClient(
            {
                CONFIG_STREAM_NAME:
                ConfigManager(
                    services,
                    service_manager,
                    service,
                    MconfigManagerImpl(),
                ),
            },
            service.loop,
        )

    # Create sync rpc client with a heartbeat of 30 seconds (timeout = 60s)
    sync_rpc_client = None
    if service.config.get('enable_sync_rpc', False):
        sync_rpc_client = SyncRPCClient(
            service.loop,
            30,
            service.config.get('print_grpc_payload', False),
        )

    first_time_bootstrap = True

    # This is called when bootstrap succeeds and when _bootstrap_check is
    # invoked but bootstrap is not needed. If it's invoked right after certs
    # are generated, certs_generated is true, control_proxy will restart.
    async def bootstrap_success_cb(certs_generated: bool):
        nonlocal first_time_bootstrap
        # Cloud-facing clients are started exactly once, on the first
        # successful bootstrap.
        if first_time_bootstrap:
            if stream_client:
                stream_client.start()
            if sync_rpc_client:
                sync_rpc_client.start()
            first_time_bootstrap = False
        if certs_generated:
            svcs_to_restart = []
            if 'control_proxy' in services:
                svcs_to_restart.append('control_proxy')

            # fluent-bit caches TLS client certs in memory, so we need to
            # restart it whenever the certs change
            fresh_mconfig = get_mconfig_manager().load_service_mconfig(
                'magmad',
                mconfigs_pb2.MagmaD(),
            )
            dynamic_svcs = fresh_mconfig.dynamic_services or []
            if 'td-agent-bit' in dynamic_svcs:
                svcs_to_restart.append('td-agent-bit')

            await service_manager.restart_services(services=svcs_to_restart)

    # Create bootstrap manager
    bootstrap_manager = BootstrapManager(service, bootstrap_success_cb)

    # Initialize kernel version poller if it is enabled
    kernel_version_poller = None
    if service.config.get('enable_kernel_version_checking', False):
        kernel_version_poller = KernelVersionsPoller(service)
        kernel_version_poller.start()

    # gateway status generator to bundle various information about this
    # gateway into an object.
    gateway_status_factory = GatewayStatusFactory(
        service=service,
        service_poller=service_poller,
        kernel_version_poller=kernel_version_poller,
    )

    # _grpc_client_manager to manage grpc client recycling
    grpc_client_manager = GRPCClientManager(
        service_name="state",
        service_stub=StateServiceStub,
        max_client_reuse=60,
    )

    # Initialize StateReporter
    state_reporter = StateReporter(
        config=service.config,
        mconfig=service.mconfig,
        loop=service.loop,
        bootstrap_manager=bootstrap_manager,
        gw_status_factory=gateway_status_factory,
        grpc_client_manager=grpc_client_manager,
    )

    # Initialize ServiceHealthWatchdog
    service_health_watchdog = ServiceHealthWatchdog(
        config=service.config,
        loop=service.loop,
        service_poller=service_poller,
        service_manager=service_manager,
    )

    # Start _bootstrap_manager
    bootstrap_manager.start_bootstrap_manager()

    # Start all services when magmad comes up
    service.loop.create_task(service_manager.start_services())

    # Start state reporting loop
    state_reporter.start()

    # Start service timeout health check loop
    service_health_watchdog.start()

    # Start upgrade manager loop
    if service.config.get('enable_upgrade_manager', False):
        upgrader = _get_upgrader_impl(service)
        service.loop.create_task(start_upgrade_loop(service, upgrader))

    # Start network health metric collection
    if service.config.get('enable_network_monitor', False):
        service.loop.create_task(metrics_collection_loop(service.config))

    # Create generic command executor
    command_executor = None
    if service.config.get('generic_command_config', None):
        command_executor = get_command_executor_impl(service)

    # Start loop to monitor unattended upgrade status
    service.loop.create_task(monitor_unattended_upgrade_status())

    # Add all servicers to the server
    magmad_servicer = MagmadRpcServicer(
        service,
        services,
        service_manager,
        get_mconfig_manager(),
        command_executor,
        service.loop,
        service.config.get('print_grpc_payload', False),
    )
    magmad_servicer.add_to_server(service.rpc_server)

    if SDWatchdog.has_notify():
        # Create systemd watchdog
        sdwatchdog = SDWatchdog(
            tasks=[bootstrap_manager, state_reporter],
            update_status=True,
        )
        # Start watchdog loop
        service.loop.create_task(sdwatchdog.run())

    # Run the service loop
    service.run()

    # Cleanup the service
    service.close()
Example #4
0
class MetricsCollectorTests(unittest.TestCase):
    """
    Tests for the MetricsCollector per-service collect/sync behavior and the
    prometheus-to-protobuf conversion helpers.
    """
    @classmethod
    def setUpClass(cls):
        # Shared queue capacity referenced by tests in this class.
        cls.queue_size = 5

    def setUp(self):
        """Register a stub 'test' service and build the collector under test."""
        ServiceRegistry.add_service('test', '0.0.0.0', 0)
        ServiceRegistry._PROXY_CONFIG = {
            'local_port': 1234,
            'cloud_address': 'test',
            'proxy_cloud_connections': True
        }

        self._services = ['test']
        self.gateway_id = "2876171d-bf38-4254-b4da-71a713952904"
        self.timeout = 1
        self._collector = MetricsCollector(self._services,
                                           5,
                                           10,
                                           self.timeout,
                                           grpc_max_msg_size_mb=4,
                                           loop=asyncio.new_event_loop())

    @unittest.mock.patch('magma.magmad.metrics_collector.MetricsControllerStub'
                         )
    def test_sync(self, controller_mock):
        """
        sync() should upload queued samples via the Collect RPC, and split
        them into multiple containers when they exceed the gRPC size cap.
        """
        # Mock out Collect.future
        mock = unittest.mock.Mock()
        mock.Collect.future.side_effect = [
            unittest.mock.Mock(),
            unittest.mock.Mock(),
            unittest.mock.Mock()
        ]
        controller_mock.side_effect = [mock, mock, mock]

        # Call with no samples
        service_name = "test"
        self._collector.sync(service_name)
        # NOTE(review): this asserts on the patched class attribute rather
        # than on the `mock` instance the stub returns, so it passes
        # trivially — confirm `mock.Collect.future` was the intended target.
        controller_mock.Collect.future.assert_not_called()
        self._collector._loop.stop()

        # Call with new samples to send
        samples = [MetricFamily(name="1234")]
        self._collector._samples_for_service[service_name].extend(samples)
        with unittest.mock.patch('snowflake.snowflake') as mock_snowflake:
            mock_snowflake.side_effect = lambda: self.gateway_id
            self._collector.sync(service_name)
        mock.Collect.future.assert_called_once_with(
            MetricsContainer(gatewayId=self.gateway_id, family=samples),
            self.timeout)
        self.assertCountEqual(
            self._collector._samples_for_service[service_name], [])

        # Reduce max msg size to trigger msg chunking
        self._collector.grpc_max_msg_size_bytes = 1500
        samples = self._generate_samples(140)
        self._collector._samples_for_service[service_name].extend(samples)
        # With a 1500-byte cap the 140 samples should go out as two halves.
        chunk1 = samples[:70]
        chunk2 = samples[70:140]

        with unittest.mock.patch('snowflake.snowflake') as mock_snowflake:
            mock_snowflake.side_effect = lambda: self.gateway_id
            self._collector.sync(service_name)
        mock.Collect.future.assert_any_call(
            MetricsContainer(gatewayId=self.gateway_id, family=chunk1),
            self.timeout)
        mock.Collect.future.assert_any_call(
            MetricsContainer(gatewayId=self.gateway_id, family=chunk2),
            self.timeout)
        self.assertCountEqual(
            self._collector._samples_for_service[service_name], [])

    def test_collect(self):
        """
        collect_done() should requeue the collected samples and append one
        extra collection success/failure metric.
        """
        mock = unittest.mock.MagicMock()
        service_name = "test"
        samples = [MetricFamily(name="2345")]
        self._collector._samples_for_service[service_name].clear()
        self._collector._samples_for_service[service_name].extend(samples)
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]

        self._collector.collect_done('test', mock)
        # Should dequeue sample from the left, and enqueue on right
        # collector should add one more metric for collection success/failure
        self.assertEqual(
            len(self._collector._samples_for_service[service_name]),
            len(samples * 2) + 1)

    def test_collect_start_time(self):
        """
        collect_done() should derive an uptime metric from a reported
        process_start_time_seconds sample, and tolerate an empty metric.
        """
        mock = unittest.mock.MagicMock()
        start_metric = Metric()
        # Start time one second in the past so uptime is strictly positive.
        start_metric.gauge.value = calendar.timegm(time.gmtime()) - 1
        start_time = MetricFamily(
            name=str(metricsd_pb2.process_start_time_seconds),
            metric=[start_metric],
        )
        samples = [start_time]
        service_name = "test"
        self._collector._samples_for_service[service_name].clear()
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]

        self._collector.collect_done('test', mock)

        # should have uptime, start time, and collection success
        self.assertEqual(
            len(self._collector._samples_for_service[service_name]), 3)
        uptime_list = [
            fam for fam in self._collector._samples_for_service[service_name]
            if fam.name == str(metricsd_pb2.process_uptime_seconds)
        ]
        self.assertEqual(len(uptime_list), 1)
        self.assertEqual(len(uptime_list[0].metric), 1)
        self.assertGreater(uptime_list[0].metric[0].gauge.value, 0)

        # ensure no exceptions with empty metric
        empty = MetricFamily(name=str(metricsd_pb2.process_start_time_seconds))
        samples = [empty]
        self._collector._samples_for_service[service_name].clear()
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]
        try:
            self._collector.collect_done('test', mock)
        except Exception:  # pylint: disable=broad-except
            self.fail("Collection with empty metric should not have failed")

    def test_counter_to_proto(self):
        """_counter_to_proto should map every labelset/value to COUNTER protos."""
        test_counter = prometheus_client.core.CounterMetricFamily(
            "test",
            "",
            labels=["testLabel"],
        )
        test_counter.add_metric(["val"], 1.23)
        test_counter.add_metric(["val2"], 2.34)

        proto = _counter_to_proto(test_counter)
        self.assertEqual(proto.name, test_counter.name)
        self.assertEqual(proto.type, metrics_pb2.COUNTER)

        self.assertEqual(2, len(proto.metric))
        self.assertEqual("val", proto.metric[0].label[0].value)
        self.assertEqual(1.23, proto.metric[0].counter.value)
        self.assertEqual("val2", proto.metric[1].label[0].value)
        self.assertEqual(2.34, proto.metric[1].counter.value)

    def test_gauge_to_proto(self):
        """_gauge_to_proto should map every labelset/value to GAUGE protos."""
        test_gauge = prometheus_client.core.GaugeMetricFamily(
            "test",
            "",
            labels=["testLabel"],
        )
        test_gauge.add_metric(["val"], 1.23)
        test_gauge.add_metric(["val2"], 2.34)

        proto = _gauge_to_proto(test_gauge)
        self.assertEqual(proto.name, test_gauge.name)
        self.assertEqual(proto.type, metrics_pb2.GAUGE)

        self.assertEqual(2, len(proto.metric))
        self.assertEqual("val", proto.metric[0].label[0].value)
        self.assertEqual(1.23, proto.metric[0].gauge.value)
        self.assertEqual("val2", proto.metric[1].label[0].value)
        self.assertEqual(2.34, proto.metric[1].gauge.value)

    def test_untyped_to_proto(self):
        """_untyped_to_proto should map every labelset/value to UNTYPED protos."""
        test_untyped = prometheus_client.core.UntypedMetricFamily(
            "test",
            "",
            labels=["testLabel"],
        )
        test_untyped.add_metric(["val"], 1.23)
        test_untyped.add_metric(["val2"], 2.34)

        proto = _untyped_to_proto(test_untyped)
        self.assertEqual(proto.name, test_untyped.name)
        self.assertEqual(proto.type, metrics_pb2.UNTYPED)

        self.assertEqual(2, len(proto.metric))
        self.assertEqual("val", proto.metric[0].label[0].value)
        self.assertEqual(1.23, proto.metric[0].untyped.value)
        self.assertEqual("val2", proto.metric[1].label[0].value)
        self.assertEqual(2.34, proto.metric[1].untyped.value)

    def test_summary_to_proto(self):
        """_summary_to_proto should emit one SUMMARY proto per unique labelset."""
        test_summary = prometheus_client.core.SummaryMetricFamily(
            "test",
            "",
            labels=["testLabel"],
        )
        # Add first unique labelset metrics
        test_summary.add_metric(["val1"], 10, 0.1)
        test_summary.add_sample("test", {
            "quantile": "0.0",
            "testLabel": "val1"
        }, 0.01)
        test_summary.add_sample("test", {
            "quantile": "0.5",
            "testLabel": "val1"
        }, 0.02)
        test_summary.add_sample("test", {
            "quantile": "1.0",
            "testLabel": "val1"
        }, 0.03)

        # Add second unique labelset metrics
        test_summary.add_metric(["val2"], 20, 0.2)
        test_summary.add_sample("test", {
            "quantile": "0.0",
            "testLabel": "val2"
        }, 0.02)
        test_summary.add_sample("test", {
            "quantile": "0.5",
            "testLabel": "val2"
        }, 0.04)
        test_summary.add_sample("test", {
            "quantile": "1.0",
            "testLabel": "val2"
        }, 0.06)

        protos = _summary_to_proto(test_summary)
        self.assertEqual(2, len(protos))

        for proto in protos:
            self.assertEqual(proto.name, test_summary.name)
            self.assertEqual(proto.type, metrics_pb2.SUMMARY)
            if proto.metric[0].label[0].value == "val1":
                self.assertEqual(1, len(proto.metric))
                self.assertEqual(10, proto.metric[0].summary.sample_count)
                self.assertEqual(0.1, proto.metric[0].summary.sample_sum)
                self.assertEqual(3, len(proto.metric[0].summary.quantile))
                self.assertEqual(0.01,
                                 proto.metric[0].summary.quantile[0].value)
                self.assertEqual(0.02,
                                 proto.metric[0].summary.quantile[1].value)
                self.assertEqual(0.03,
                                 proto.metric[0].summary.quantile[2].value)
            else:
                self.assertEqual(1, len(proto.metric))
                self.assertEqual(20, proto.metric[0].summary.sample_count)
                self.assertEqual(0.2, proto.metric[0].summary.sample_sum)
                self.assertEqual(3, len(proto.metric[0].summary.quantile))
                self.assertEqual(0.02,
                                 proto.metric[0].summary.quantile[0].value)
                self.assertEqual(0.04,
                                 proto.metric[0].summary.quantile[1].value)
                self.assertEqual(0.06,
                                 proto.metric[0].summary.quantile[2].value)

    def test_histogram_to_proto(self):
        """_histogram_to_proto should emit one HISTOGRAM proto per labelset."""
        test_hist = prometheus_client.core.HistogramMetricFamily(
            "test",
            "",
            labels=["testLabel"],
        )
        # Add first unique labelset metrics
        test_hist.add_metric(["val1"], [(1, 1), (10, 2), (100, 3)], 6)

        # Add second unique labelset metrics
        test_hist.add_metric(["val2"], [(1, 2), (10, 3), (100, 4)], 9)

        protos = _histogram_to_proto(test_hist)
        self.assertEqual(2, len(protos))

        for proto in protos:
            self.assertEqual(proto.name, test_hist.name)
            self.assertEqual(proto.type, metrics_pb2.HISTOGRAM)
            if proto.metric[0].label[0].value == "val1":
                self.assertEqual(1, len(proto.metric))
                self.assertEqual(3, proto.metric[0].histogram.sample_count)
                self.assertEqual(6, proto.metric[0].histogram.sample_sum)
                self.assertEqual(3, len(proto.metric[0].histogram.bucket))
                self.assertEqual(
                    1, proto.metric[0].histogram.bucket[0].cumulative_count)
                self.assertEqual(
                    2, proto.metric[0].histogram.bucket[1].cumulative_count)
                self.assertEqual(
                    3, proto.metric[0].histogram.bucket[2].cumulative_count)
            else:
                self.assertEqual(1, len(proto.metric))
                self.assertEqual(4, proto.metric[0].histogram.sample_count)
                self.assertEqual(9, proto.metric[0].histogram.sample_sum)
                self.assertEqual(3, len(proto.metric[0].histogram.bucket))
                self.assertEqual(
                    2, proto.metric[0].histogram.bucket[0].cumulative_count)
                self.assertEqual(
                    3, proto.metric[0].histogram.bucket[1].cumulative_count)
                self.assertEqual(
                    4, proto.metric[0].histogram.bucket[2].cumulative_count)

    def _generate_samples(self, number):
        """Return `number` MetricFamily protos with random numeric names."""
        samples = []
        for _ in range(number):
            sample_name = randrange(10000)
            samples.append(MetricFamily(name=str(sample_name)))
        return samples
Example #5
0
class MetricsCollectorTests(unittest.TestCase):
    """
    Tests for the MetricCollector collect and sync
    """
    @classmethod
    def setUpClass(cls):
        cls.queue_size = 5

    def setUp(self):
        """Register a fake 'test' service and create the collector under test."""
        ServiceRegistry.add_service('test', '0.0.0.0', 0)
        ServiceRegistry._PROXY_CONFIG = {'local_port': 1234,
                                         'cloud_address': 'test',
                                         'proxy_cloud_connections': True}

        self._services = ['test']
        self.gateway_id = "2876171d-bf38-4254-b4da-71a713952904"
        self.queue_length = 5
        self.timeout = 1
        # Collector driven by a dedicated throwaway event loop.
        self._collector = MetricsCollector(
            self._services,
            5,
            10,
            self.timeout,
            queue_length=self.queue_length,
            loop=asyncio.new_event_loop(),
        )

    @unittest.mock.patch('magma.magmad.metrics_collector.MetricsControllerStub'
                         )
    def test_sync(self, controller_mock):
        """
        sync() should upload fresh samples together with queued retries in a
        single Collect RPC, then leave both queues empty.
        """
        # Mock out Collect.future
        mock = unittest.mock.Mock()
        mock.Collect.future.side_effect = [unittest.mock.Mock()]
        controller_mock.side_effect = [mock]

        # Call with no samples
        self._collector.sync()
        # NOTE(review): this asserts on the patched class attribute rather
        # than on the `mock` instance the stub returns, so it passes
        # trivially — confirm `mock.Collect.future` was the intended target.
        controller_mock.Collect.future.assert_not_called()
        self._collector._loop.stop()

        # Call with new samples to send and some to retry
        samples = [MetricFamily(name="1234")]
        self._collector._samples.extend(samples)
        self._collector._retry_queue.extend(samples)
        with unittest.mock.patch('snowflake.snowflake') as mock_snowflake:
            mock_snowflake.side_effect = lambda: self.gateway_id
            self._collector.sync()
        # The same sample sits in both queues, so the container carries two
        # copies (`samples * 2`).
        mock.Collect.future.assert_called_once_with(
            MetricsContainer(gatewayId=self.gateway_id, family=samples * 2),
            self.timeout)
        self.assertCountEqual(self._collector._samples, [])
        self.assertCountEqual(self._collector._retry_queue, [])

    def test_sync_queue(self):
        """Failed syncs should requeue only the newest queue_length samples."""
        # One more sample than the retry queue can hold, so the oldest
        # must be evicted on failure.
        overflow = [
            MetricFamily(name=str(idx))
            for idx in range(self.queue_length + 1)
        ]
        failed_future = MockFuture(is_error=True)
        self._collector.sync_done(overflow, failed_future)
        self.assertCountEqual(self._collector._samples, [])
        self.assertCountEqual(
            self._collector._retry_queue,
            overflow[-self.queue_length:],
        )

        # A successful sync leaves nothing behind to retry.
        self._collector._retry_queue.clear()
        ok_future = MockFuture(is_error=False)
        self._collector.sync_done(overflow, ok_future)
        self.assertCountEqual(self._collector._samples, [])
        self.assertCountEqual(self._collector._retry_queue, [])

    def test_collect(self):
        """collect_done() requeues results plus one success/failure metric."""
        fut = unittest.mock.MagicMock()
        collected = [MetricFamily(name="2345")]
        self._collector._samples.clear()
        self._collector._samples.extend(collected)
        fut.result.side_effect = [MetricsContainer(family=collected)]
        fut.exception.side_effect = [False]

        self._collector.collect_done('test', fut)
        # One copy was queued beforehand, one arrives via the mocked result,
        # and the collector appends a collection success/failure metric.
        expected_len = 2 * len(collected) + 1
        self.assertEqual(len(self._collector._samples), expected_len)

    def test_collect_start_time(self):
        """
        collect_done() should derive an uptime metric from a reported
        process_start_time_seconds sample, and tolerate an empty metric.
        """
        mock = unittest.mock.MagicMock()
        start_metric = Metric()
        # Start time one second in the past so uptime is strictly positive.
        start_metric.gauge.value = calendar.timegm(time.gmtime()) - 1
        start_time = MetricFamily(
            name=str(metricsd_pb2.process_start_time_seconds),
            metric=[start_metric],
        )
        samples = [start_time]
        self._collector._samples.clear()
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]

        self._collector.collect_done('test', mock)

        # should have uptime, start time, and collection success
        self.assertEqual(len(self._collector._samples), 3)
        uptime_list = [
            fam for fam in self._collector._samples
            if fam.name == str(metricsd_pb2.process_uptime_seconds)
        ]
        self.assertEqual(len(uptime_list), 1)
        self.assertEqual(len(uptime_list[0].metric), 1)
        self.assertGreater(uptime_list[0].metric[0].gauge.value, 0)

        # ensure no exceptions with empty metric
        empty = MetricFamily(name=str(metricsd_pb2.process_start_time_seconds))
        samples = [empty]
        self._collector._samples.clear()
        mock.result.side_effect = [MetricsContainer(family=samples)]
        mock.exception.side_effect = [False]
        try:
            self._collector.collect_done('test', mock)
        except Exception:  # pylint: disable=broad-except
            self.fail("Collection with empty metric should not have failed")