def _parse_versions_and_log_warning(self, agw_version, unpacked_mconfig):
    """Parse the gateway and orc8r versions, logging a warning for any
    value that is not in the expected format.

    Args:
        agw_version: version string reported by the gateway.
        unpacked_mconfig: Any-packed mconfig expected to hold a MagmaD
            message carrying ``orc8r_version``.

    Returns:
        VersionInfo with the two regex match objects, or
        VersionInfo(None, None) when the mconfig is not a MagmaD message.
    """
    # Versions must look like X.X.X with non-negative integers for each X;
    # the minor component is captured by name for later comparison.
    pattern = re.compile(
        r"[0-9]+\.(?P<minor_version>[0-9]+)\.[0-9]+")

    # Bail out early if the Any does not wrap a MagmaD message.
    if not unpacked_mconfig.Is(mconfigs_pb2.MagmaD.DESCRIPTOR):
        logging.error(
            "Expecting MagmaD mconfig structure, but received a different structure: %s.",
            unpacked_mconfig.type_url,
        )
        return VersionInfo(None, None)

    # Unpack the MagmaD message to read its orc8r_version field.
    magmad = mconfigs_pb2.MagmaD()
    unpacked_mconfig.Unpack(magmad)
    orc8r_version = magmad.orc8r_version

    agw_match = pattern.match(agw_version)
    orc8r_match = pattern.match(orc8r_version)

    # A failed match yields None; warn so operators can spot bad versions.
    if not agw_match:
        logging.warning("Gateway version: %s not valid", agw_version)
    if not orc8r_match:
        logging.warning("Orchestrator version: %s not valid", orc8r_version)

    return VersionInfo(agw_match, orc8r_match)
def process_update(self, stream_name, updates, resync):
    """Apply the most recent gateway config update.

    Every update carries the complete config structure, so only the final
    entry of ``updates`` is applied; earlier entries are logged and skipped,
    and ``resync`` is ignored for the same reason.

    Inputs:
     - updates - list of GatewayConfigs protobuf structures
     - resync - boolean indicating whether all database information will
                be resent (hence cached data can be discarded). This is
                ignored since config is contained in one DB element,
                hence all data is sent in every update.
    """
    if not updates:
        logging.info('No config update to process')
        return

    # Each update is a full snapshot, so only the newest one matters.
    for stale in updates[:-1]:
        logging.info('Ignoring config update %s', stale.key)

    latest = updates[-1]
    logging.info('Processing config update %s', latest.key)
    mconfig_str = latest.value.decode()
    mconfig = self._mconfig_manager.deserialize_mconfig(
        mconfig_str,
        self._allow_unknown_fields,
    )
    if 'magmad' not in mconfig.configs_by_key:
        logging.error('Invalid config! Magmad service config missing')
        return

    self._mconfig_manager.update_stored_mconfig(mconfig_str)
    self._magmad_service.reload_mconfig()

    def changed(service_name):
        # Compare the incoming config against the previously applied one.
        incoming = mconfig.configs_by_key.get(service_name)
        current = self._mconfig.configs_by_key.get(service_name)
        return incoming != current

    # Magmad's own config governs which dynamic services run.
    if changed('magmad'):
        self._loop.create_task(
            self._service_manager.update_dynamic_services(
                load_service_mconfig(
                    'magmad', mconfigs_pb2.MagmaD()).dynamic_services,
            ),
        )

    to_restart = [srv for srv in self._services if changed(srv)]
    if to_restart:
        self._loop.create_task(
            self._service_manager.restart_services(to_restart),
        )

    # Replace the cached config only now: ``changed`` compared against the
    # old value of self._mconfig above.
    self._mconfig = mconfig

    configs_by_key = {
        srv: mconfig.configs_by_key.get(srv)
        for srv in self._services
        if srv in mconfig.configs_by_key
    }
    magmad_events.processed_updates(configs_by_key)
def get_unexpected_restart_summary(self):
    """Build a map of service name -> RestartFrequency from the
    UNEXPECTED_SERVICE_RESTARTS metric, polled via a fresh ServicePoller."""
    service = MagmaService('magmad', mconfigs_pb2.MagmaD())
    poller = ServicePoller(service.loop, service.config)
    poller.start()

    asyncio.set_event_loop(service.loop)

    # noinspection PyProtectedMember
    # pylint: disable=protected-access
    async def collect():
        # Refresh the poller's view of the services, then read the restart
        # counters straight off the prometheus metric.
        await poller._get_service_info()
        summary = {}
        for name in poller.service_info.keys():
            restart_count = int(
                UNEXPECTED_SERVICE_RESTARTS
                .labels(service_name=name)
                ._value.get(),
            )
            summary[name] = RestartFrequency(
                count=restart_count,
                time_interval='',
            )
        return summary

    return service.loop.run_until_complete(collect())
def test_unpack_mconfig_any(self):
    """unpack_mconfig_any should recover the original MagmaD message
    from an Any wrapper."""
    expected = mconfigs_pb2.MagmaD(
        checkin_interval=10,
        checkin_timeout=5,
        autoupgrade_enabled=True,
        autoupgrade_poll_interval=300,
        package_version='1.0.0-0',
        images=[],
        tier_id='default',
        feature_flags={'flag1': False},
    )
    # Wrap the message exactly as the orc8r would ship it.
    wrapped = Any(
        type_url='type.googleapis.com/magma.mconfig.MagmaD',
        value=expected.SerializeToString(),
    )
    unpacked = mconfigs.unpack_mconfig_any(wrapped, mconfigs_pb2.MagmaD())
    self.assertEqual(expected, unpacked)
def setUp(self):
    """Register a dummy service and build an isolated MagmaService
    on its own event loop."""
    ServiceRegistry.add_service('test', '0.0.0.0', 0)
    self._stub = None
    # A dedicated loop keeps tests isolated from one another.
    self._service = MagmaService(
        'test',
        mconfigs_pb2.MagmaD(),
        loop=asyncio.new_event_loop(),
    )
    # Drop the global loop so tests can only use the one set explicitly.
    asyncio.set_event_loop(None)
def setUp(self):
    """Create a test MagmaService on a fresh event loop and install
    that loop as the current one."""
    ServiceRegistry.add_service('test', '0.0.0.0', 0)
    self._stub = None
    # A fresh loop per test keeps them isolated from one another.
    self._loop = asyncio.new_event_loop()
    self._service = MagmaService(
        name='test',
        empty_mconfig=mconfigs_pb2.MagmaD(),
        loop=self._loop,
    )
    asyncio.set_event_loop(self._service.loop)
def _collect_service_metrics():
    """Collect per-service CPU and memory usage metrics.

    Queries systemd (via ``systemctl show``) for each magma and companion
    service's main PID and memory accounting values, then samples CPU usage
    through psutil. Memory metrics are only emitted when systemd reports
    numeric values (it reports e.g. "[not set]" or "infinity" otherwise).
    """
    config = MagmaService('magmad', mconfigs_pb2.MagmaD()).config
    magma_services = [
        "magma@" + service for service in config['magma_services']
    ]
    non_magma_services = ["sctpd", "openvswitch-switch"]

    for service in magma_services + non_magma_services:
        cmd = [
            "systemctl",
            "show",
            service,
            "--property=MainPID,MemoryCurrent,MemoryAccounting,MemoryLimit",
        ]
        # TODO(@wallyrb): Move away from subprocess and use psystemd
        output = subprocess.check_output(cmd)
        # Strip the "Key=" prefixes so each line is just the value, in the
        # same order as the --property list above.
        output_str = str(output, "utf-8").strip().replace("MainPID=", "").replace(
            "MemoryCurrent=", "").replace("MemoryAccounting=", "").replace("MemoryLimit=", "")
        properties = output_str.split("\n")
        if len(properties) < 4:
            # Unexpected systemctl output; skip this service rather than
            # crash the whole collection pass with an IndexError.
            logging.warning(
                "Unexpected systemctl output for service %s: %s",
                service, output_str,
            )
            continue
        pid = int(properties[0])
        memory = properties[1]
        memory_accounting = properties[2]
        memory_limit = properties[3]

        if pid != 0:
            try:
                p = psutil.Process(pid=pid)
                # NOTE: blocks ~1s per service while sampling CPU usage.
                cpu_percentage = p.cpu_percent(interval=1)
            except psutil.NoSuchProcess:
                logging.warning(
                    "When collecting CPU usage for service %s: Process with PID %d no longer exists.",
                    service,
                    pid,
                )
                continue
            else:
                _counter_set(
                    SERVICE_CPU_PERCENTAGE.labels(service_name=service),
                    cpu_percentage,
                )

        if not memory.isnumeric():
            continue

        if memory_accounting == "yes":
            _counter_set(
                SERVICE_MEMORY_USAGE.labels(service_name=service),
                int(memory),
            )
            # Guard against MemoryLimit=0 to avoid ZeroDivisionError.
            if memory_limit.isnumeric() and int(memory_limit) > 0:
                _counter_set(
                    SERVICE_MEMORY_PERCENTAGE.labels(service_name=service),
                    int(memory) / int(memory_limit),
                )
def test_load_mconfig(self, get_service_config_value_mock):
    """load_mconfig should keep the magmad entry while tolerating a
    missing service and an unregistered type in the fixture."""
    # Fixture mconfig has 1 missing service, 1 unregistered type
    magmad_proto = mconfigs_pb2.MagmaD(
        checkin_interval=10,
        checkin_timeout=5,
        autoupgrade_enabled=True,
        autoupgrade_poll_interval=300,
        package_version='1.0.0-0',
        images=[],
        tier_id='default',
        feature_flags={'flag1': False},
    )
    packed = any_pb2.Any()
    packed.Pack(magmad_proto)
    fixture = '''
    {
        "offset": 42,
        "configs": {
            "configs_by_key": {
                "magmad": %s,
                "mme": {
                    "@type": "type.googleapis.com/magma.mconfig.NotAType",
                    "value": "test1"
                },
                "not_a_service": {
                    "@type": "type.googleapis.com/magma.mconfig.MagmaD",
                    "value": "test2"
                }
            }
        }
    }
    ''' % MessageToJson(packed)

    get_service_config_value_mock.return_value = ['mme']
    with mock.patch('builtins.open', mock.mock_open(read_data=fixture)):
        manager = mconfig_managers.StreamedMconfigManager()
        loaded = manager.load_mconfig()

        expected = mconfig_pb2.OffsetGatewayConfigs(
            offset=42,
            configs=mconfig_pb2.GatewayConfigs(
                configs_by_key={'magmad': packed},
            ),
        )
        self.assertEqual(expected, loaded)
def test_load_mconfig(self, get_service_config_value_mock):
    """load_mconfig should raise LoadConfigError when the stored config
    contains an unrecognized service and an unregistered type."""
    # Fixture mconfig has 1 unrecognized service, 1 unregistered type
    magmad_proto = mconfigs_pb2.MagmaD(
        checkin_interval=10,
        checkin_timeout=5,
        autoupgrade_enabled=True,
        autoupgrade_poll_interval=300,
        package_version='1.0.0-0',
        images=[],
        tier_id='default',
        feature_flags={'flag1': False},
    )
    packed = any_pb2.Any()
    packed.Pack(magmad_proto)
    fixture = '''
    {
        "configs_by_key": {
            "magmad": %s,
            "foo": {
                "@type": "type.googleapis.com/magma.mconfig.NotAType",
                "value": "test1"
            },
            "not_a_service": {
                "@type": "type.googleapis.com/magma.mconfig.MagmaD",
                "value": "test2"
            }
        }
    }
    ''' % MessageToJson(packed)

    get_service_config_value_mock.return_value = {
        'magma_services': ['foo'],
    }
    with mock.patch('builtins.open', mock.mock_open(read_data=fixture)):
        manager = mconfig_managers.MconfigManagerImpl()
        with self.assertRaises(LoadConfigError):
            manager.load_mconfig()
async def bootstrap_success_cb(certs_generated: bool):
    """Run after a successful bootstrap (or when bootstrap was unneeded).

    On the first invocation, starts the streamer and sync-RPC clients
    (captured from the enclosing scope). When new certs were generated,
    restarts the services that hold TLS client certs in memory.

    Args:
        certs_generated: True when fresh certs were just written, meaning
            cert-caching services must be restarted to pick them up.
    """
    nonlocal first_time_bootstrap
    # Start the background clients exactly once, after the first
    # successful bootstrap.
    if first_time_bootstrap:
        if stream_client:
            stream_client.start()
        if sync_rpc_client:
            sync_rpc_client.start()
        first_time_bootstrap = False
    if certs_generated:
        svcs_to_restart = []
        if 'control_proxy' in services:
            svcs_to_restart.append('control_proxy')

        # fluent-bit caches TLS client certs in memory, so we need to
        # restart it whenever the certs change
        fresh_mconfig = get_mconfig_manager().load_service_mconfig(
            'magmad', mconfigs_pb2.MagmaD(),
        )
        dynamic_svcs = fresh_mconfig.dynamic_services or []
        if 'td-agent-bit' in dynamic_svcs:
            svcs_to_restart.append('td-agent-bit')

        await service_manager.restart_services(services=svcs_to_restart)
def main():
    """
    Main magmad function.

    Wires up and starts every magmad subsystem: service manager and poller,
    metrics collection, config streaming, sync RPC, bootstrap, state
    reporting, health watchdog, upgrade/monitoring loops, and the gRPC
    servicer — then runs the service event loop until shutdown.
    """
    service = MagmaService('magmad', mconfigs_pb2.MagmaD())

    # Optionally pipe errors to Sentry
    sentry_init(service_name=service.name)

    logging.info('Starting magmad for UUID: %s', snowflake.make_snowflake())

    # Create service manager
    services = service.config.get('magma_services')
    init_system = service.config.get('init_system', 'systemd')
    registered_dynamic_services = service.config.get(
        'registered_dynamic_services', [],
    )
    enabled_dynamic_services = []
    if service.mconfig is not None:
        enabled_dynamic_services = service.mconfig.dynamic_services

    # Poll the services' Service303 interface
    service_poller = ServicePoller(
        service.loop, service.config, enabled_dynamic_services,
    )
    service_poller.start()

    service_manager = ServiceManager(
        services, init_system, service_poller,
        registered_dynamic_services, enabled_dynamic_services,
    )

    # Get metrics service config
    metrics_config = service.config.get('metricsd')
    metrics_services = metrics_config['services']
    collect_interval = metrics_config['collect_interval']
    sync_interval = metrics_config['sync_interval']
    grpc_timeout = metrics_config['grpc_timeout']
    grpc_msg_size = metrics_config.get('max_grpc_msg_size_mb', 4)
    metrics_post_processor_fn = metrics_config.get('post_processing_fn')

    metric_scrape_targets = [
        ScrapeTarget(t['url'], t['name'], t['interval'])
        for t in metrics_config.get('metric_scrape_targets', [])
    ]

    # Create local metrics collector
    metrics_collector = MetricsCollector(
        services=metrics_services,
        collect_interval=collect_interval,
        sync_interval=sync_interval,
        grpc_timeout=grpc_timeout,
        grpc_max_msg_size_mb=grpc_msg_size,
        loop=service.loop,
        post_processing_fn=get_metrics_postprocessor_fn(
            metrics_post_processor_fn,
        ),
        scrape_targets=metric_scrape_targets,
    )

    # Poll and sync the metrics collector loops
    metrics_collector.run()

    # Start a background thread to stream updates from the cloud
    stream_client = None
    if service.config.get('enable_config_streamer', False):
        stream_client = StreamerClient(
            {
                CONFIG_STREAM_NAME: ConfigManager(
                    services,
                    service_manager,
                    service,
                    MconfigManagerImpl(),
                ),
            },
            service.loop,
        )

    # Create sync rpc client with a heartbeat of 30 seconds (timeout = 60s)
    sync_rpc_client = None
    if service.config.get('enable_sync_rpc', False):
        sync_rpc_client = SyncRPCClient(
            service.loop, 30, service.config.get('print_grpc_payload', False),
        )

    first_time_bootstrap = True

    # This is called when bootstrap succeeds and when _bootstrap_check is
    # invoked but bootstrap is not needed. If it's invoked right after certs
    # are generated, certs_generated is true, control_proxy will restart.
    async def bootstrap_success_cb(certs_generated: bool):
        nonlocal first_time_bootstrap
        # Start the streaming clients only once, after the first bootstrap.
        if first_time_bootstrap:
            if stream_client:
                stream_client.start()
            if sync_rpc_client:
                sync_rpc_client.start()
            first_time_bootstrap = False
        if certs_generated:
            svcs_to_restart = []
            if 'control_proxy' in services:
                svcs_to_restart.append('control_proxy')

            # fluent-bit caches TLS client certs in memory, so we need to
            # restart it whenever the certs change
            fresh_mconfig = get_mconfig_manager().load_service_mconfig(
                'magmad', mconfigs_pb2.MagmaD(),
            )
            dynamic_svcs = fresh_mconfig.dynamic_services or []
            if 'td-agent-bit' in dynamic_svcs:
                svcs_to_restart.append('td-agent-bit')

            await service_manager.restart_services(services=svcs_to_restart)

    # Create bootstrap manager
    bootstrap_manager = BootstrapManager(service, bootstrap_success_cb)

    # Initialize kernel version poller if it is enabled
    kernel_version_poller = None
    if service.config.get('enable_kernel_version_checking', False):
        kernel_version_poller = KernelVersionsPoller(service)
        kernel_version_poller.start()

    # gateway status generator to bundle various information about this
    # gateway into an object.
    gateway_status_factory = GatewayStatusFactory(
        service=service,
        service_poller=service_poller,
        kernel_version_poller=kernel_version_poller,
    )

    # _grpc_client_manager to manage grpc client recycling
    grpc_client_manager = GRPCClientManager(
        service_name="state",
        service_stub=StateServiceStub,
        max_client_reuse=60,
    )

    # Initialize StateReporter
    state_reporter = StateReporter(
        config=service.config,
        mconfig=service.mconfig,
        loop=service.loop,
        bootstrap_manager=bootstrap_manager,
        gw_status_factory=gateway_status_factory,
        grpc_client_manager=grpc_client_manager,
    )

    # Initialize ServiceHealthWatchdog
    service_health_watchdog = ServiceHealthWatchdog(
        config=service.config,
        loop=service.loop,
        service_poller=service_poller,
        service_manager=service_manager,
    )

    # Start _bootstrap_manager
    bootstrap_manager.start_bootstrap_manager()

    # Start all services when magmad comes up
    service.loop.create_task(service_manager.start_services())

    # Start state reporting loop
    state_reporter.start()

    # Start service timeout health check loop
    service_health_watchdog.start()

    # Start upgrade manager loop
    if service.config.get('enable_upgrade_manager', False):
        upgrader = _get_upgrader_impl(service)
        service.loop.create_task(start_upgrade_loop(service, upgrader))

    # Start network health metric collection
    if service.config.get('enable_network_monitor', False):
        service.loop.create_task(metrics_collection_loop(service.config))

    # Create generic command executor
    command_executor = None
    if service.config.get('generic_command_config', None):
        command_executor = get_command_executor_impl(service)

    # Start loop to monitor unattended upgrade status
    service.loop.create_task(monitor_unattended_upgrade_status())

    # Add all servicers to the server
    magmad_servicer = MagmadRpcServicer(
        service,
        services,
        service_manager,
        get_mconfig_manager(),
        command_executor,
        service.loop,
        service.config.get('print_grpc_payload', False),
    )
    magmad_servicer.add_to_server(service.rpc_server)

    if SDWatchdog.has_notify():
        # Create systemd watchdog
        sdwatchdog = SDWatchdog(
            tasks=[bootstrap_manager, state_reporter],
            update_status=True,
        )
        # Start watchdog loop
        service.loop.create_task(sdwatchdog.run())

    # Run the service loop
    service.run()

    # Cleanup the service
    service.close()
def main():
    """
    Main magmad function.

    Starts every magmad subsystem — service manager and poller, metrics
    collection, config streaming, periodic checkins, sync RPC, bootstrap,
    state reporting, upgrade/monitoring loops, and the gRPC servicer —
    then runs the service event loop until shutdown.
    """
    service = MagmaService('magmad', mconfigs_pb2.MagmaD())

    logging.info('Starting magmad for UUID: %s', snowflake.make_snowflake())

    # Create service manager
    services = service.config['magma_services']
    init_system = service.config.get('init_system', 'systemd')
    registered_dynamic_services = service.config.get(
        'registered_dynamic_services', [])
    enabled_dynamic_services = []
    if service.mconfig is not None:
        enabled_dynamic_services = service.mconfig.dynamic_services

    # Poll the services' Service303 interface
    service_poller = ServicePoller(service.loop, service.config)
    service_poller.start()

    service_manager = ServiceManager(services, init_system, service_poller,
                                     registered_dynamic_services,
                                     enabled_dynamic_services)

    # Get metrics service config
    metrics_config = service.config['metricsd']
    metrics_services = metrics_config['services']
    collect_interval = metrics_config['collect_interval']
    sync_interval = metrics_config['sync_interval']
    grpc_timeout = metrics_config['grpc_timeout']
    queue_length = metrics_config['queue_length']

    # Create local metrics collector
    metrics_collector = MetricsCollector(metrics_services, collect_interval,
                                         sync_interval, grpc_timeout,
                                         queue_length, service.loop)

    # Poll and sync the metrics collector loops
    metrics_collector.run()

    # Start a background thread to stream updates from the cloud
    stream_client = None
    if service.config.get('enable_config_streamer', False):
        stream_client = StreamerClient(
            {
                CONFIG_STREAM_NAME: ConfigManager(
                    services, service_manager, service, MconfigManagerImpl(),
                ),
            },
            service.loop,
        )

    # Schedule periodic checkins
    checkin_manager = CheckinManager(service, service_poller)

    # Create sync rpc client with a heartbeat of 30 seconds (timeout = 60s)
    sync_rpc_client = None
    if service.config.get('enable_sync_rpc', False):
        sync_rpc_client = SyncRPCClient(service.loop, 30)

    first_time_bootstrap = True

    # This is called when bootstrap succeeds and when _bootstrap_check is
    # invoked but bootstrap is not needed. If it's invoked right after certs
    # are generated, certs_generated is true, control_proxy will restart.
    async def bootstrap_success_cb(certs_generated):
        nonlocal first_time_bootstrap
        # Start the streaming clients and do an initial checkin only once,
        # after the first successful bootstrap.
        if first_time_bootstrap:
            if stream_client:
                stream_client.start()
            await checkin_manager.try_checkin()
            if sync_rpc_client:
                sync_rpc_client.start()
            first_time_bootstrap = False
        if certs_generated and 'control_proxy' in services:
            service.loop.create_task(
                service_manager.restart_services(services=['control_proxy']))

    # Create bootstrap manager
    bootstrap_manager = BootstrapManager(service, bootstrap_success_cb)

    # Failed checkins feed back into the bootstrap manager so it can
    # re-bootstrap when the cloud rejects us.
    async def checkin_failure_cb(err_code):
        await bootstrap_manager.on_checkin_fail(err_code)
    checkin_manager.set_failure_cb(checkin_failure_cb)

    # Start bootstrap_manager after checkin_manager's callback is set
    bootstrap_manager.start_bootstrap_manager()

    # Schedule periodic state reporting
    state_manager = StateReporter(service, checkin_manager)
    state_manager.start()

    # Start all services when magmad comes up
    service.loop.create_task(service_manager.start_services())

    # Start upgrade manager loop
    if service.config.get('enable_upgrade_manager', False):
        upgrader = _get_upgrader_impl(service)
        service.loop.create_task(start_upgrade_loop(service, upgrader))

    # Start network health metric collection
    if service.config.get('enable_network_monitor', False):
        service.loop.create_task(metrics_collection_loop(service.config))

    if service.config.get('enable_systemd_tailer', False):
        service.loop.create_task(start_systemd_tailer(service.config))

    # Create generic command executor
    command_executor = None
    if service.config.get('generic_command_config', None):
        command_executor = get_command_executor_impl(service)

    # Start loop to monitor unattended upgrade status
    service.loop.create_task(monitor_unattended_upgrade_status(service.loop))

    # Add all servicers to the server
    magmad_servicer = MagmadRpcServicer(
        service,
        services,
        service_manager,
        get_mconfig_manager(),
        command_executor,
        service.loop,
    )
    magmad_servicer.add_to_server(service.rpc_server)

    if SDWatchdog.has_notify():
        # Create systemd watchdog
        sdwatchdog = SDWatchdog(tasks=[bootstrap_manager, checkin_manager],
                                update_status=True)
        # Start watchdog loop
        service.loop.create_task(sdwatchdog.run())

    # Run the service loop
    service.run()

    # Cleanup the service
    service.close()
def process_update(self, stream_name, updates, resync):
    """
    Handle config updates. Resync is ignored since the entire config
    structure is passed in every update.

    Inputs:
     - updates - list of GatewayConfigs protobuf structures
     - resync - boolean indicating whether all database information will
                be resent (hence cached data can be discarded). This is
                ignored since config is contained in one DB element,
                hence all data is sent in every update.
    """
    if len(updates) == 0:
        logging.info('No config update to process')
        return

    # Every update is a full snapshot, so only the newest one matters.
    for update in updates[:-1]:
        logging.info('Ignoring config update %s', update.key)

    # Deserialize and store the last config update
    logging.info('Processing config update %s', updates[-1].key)
    mconfig_str = updates[-1].value.decode()
    mconfig = self._mconfig_manager.deserialize_mconfig(
        mconfig_str,
        self._allow_unknown_fields,
    )
    if MAGMAD not in mconfig.configs_by_key:
        logging.error('Invalid config! Magmad service config missing')
        return
    self._mconfig_manager.update_stored_mconfig(mconfig_str)
    self._magmad_service.reload_mconfig()

    def did_mconfig_change(serv_name):
        # Compares the incoming config against the previously applied one
        # (self._mconfig is only replaced after restarts are scheduled).
        return mconfig.configs_by_key.get(serv_name) != \
            self._mconfig.configs_by_key.get(serv_name)

    # Reload magmad configs locally
    if did_mconfig_change(MAGMAD) or (
            SHARED_MCONFIG in mconfig.configs_by_key
            and did_mconfig_change(SHARED_MCONFIG)):
        logging.info("Restarting dynamic services due to config change")
        self._loop.create_task(
            self._service_manager.update_dynamic_services(
                load_service_mconfig(
                    MAGMAD, mconfigs_pb2.MagmaD()).dynamic_services,
            ),
        )

    # A shared-config change affects every service, so restart them all;
    # otherwise restart only the services whose own config changed.
    services_to_restart = []
    if SHARED_MCONFIG in mconfig.configs_by_key and did_mconfig_change(
            SHARED_MCONFIG):
        logging.info("Shared config changed. Restarting all services.")
        services_to_restart = self._services
    else:
        services_to_restart = [
            srv for srv in self._services if did_mconfig_change(srv)
        ]
    if services_to_restart:
        self._loop.create_task(
            self._service_manager.restart_services(services_to_restart),
        )

    self._mconfig = mconfig

    configs_by_key = {}
    for srv in self._services:
        if srv in mconfig.configs_by_key:
            configs_by_key[srv] = mconfig.configs_by_key.get(srv)

    # Warn when the gateway trails the orchestrator by a minor version or
    # more, using the parsed minor_version groups from both versions.
    agw_version = self._magmad_service.version
    unpacked_mconfig = mconfig.configs_by_key.get(MAGMAD)
    version_info = self._parse_versions_and_log_warning(
        agw_version, unpacked_mconfig)
    agw_version_parsed = version_info.agw_version
    orc8r_version_parsed = version_info.orc8r_version
    if agw_version_parsed and orc8r_version_parsed:
        agw_minor = int(agw_version_parsed.group('minor_version'))
        orc8r_minor = int(orc8r_version_parsed.group('minor_version'))
        if agw_minor - orc8r_minor <= -1:
            logging.warning(
                "Gateway is more than one minor version behind orc8r. Please consider updating it."
            )

    magmad_events.processed_updates(configs_by_key)