class Forchestrator(VarzUpdater, OrchestrationManager):
    """Main class encompassing faucet orchestrator components for dynamically
    controlling faucet ACLs at runtime"""

    _DETAIL_FORMAT = '%s is %s: %s'

    def __init__(self, config):
        """Store the config and reset every component to its un-initialized default.

        The components themselves are built later by initialize().
        """
        self._config = config
        self._structural_config_file = None
        self._behavioral_config_file = None
        self._forch_config_dir = None
        self._faucet_config_dir = None
        self._gauge_config_file = None
        self._segments_vlans_file = None
        self._faucet_events = None
        self._start_time = datetime.fromtimestamp(time.time()).isoformat()
        self._faucet_prom_endpoint = None
        self._gauge_prom_endpoint = None
        self._behavioral_config = None

        self._faucet_collector = None
        self._local_collector = None
        self._cpn_collector = None
        self._varz_collector = None

        self._faucetizer = None
        self._authenticator = None
        self._faucetize_scheduler = None
        self._config_file_watcher = None
        self._faucet_state_scheduler = None
        self._gauge_metrics_scheduler = None
        self._device_report_handler = None
        self._port_state_manager = None

        self._initialized = False
        self._active_state = State.initializing
        self._active_state_lock = threading.Lock()

        self._should_enable_faucetizer = False
        self._should_ignore_static_behavior = False
        self._should_ignore_auth_result = False

        self._forch_config_errors = {}
        self._system_errors = {}
        self._faucet_config_summary = SystemState.FaucetConfigSummary()
        self._metrics = None
        self._varz_proxy = None

        self._last_faucet_config_writing_time = None
        self._last_received_faucet_config_hash = None
        self._config_hash_verification_timeout_sec = (
            self._config.event_client.config_hash_verification_timeout_sec or
            _DEFAULT_CONFIG_HASH_VERIFICATION_TIMEOUT_SEC)

        self._states_lock = threading.Lock()
        self._timer_lock = threading.Lock()
        self._logger = get_logger('forch')

    def initialize(self):
        """Initialize forchestrator instance

        Builds the metrics, collectors, schedulers and orchestration components,
        waits for the Faucet varz config to become readable, then registers the
        event handlers and starts the components.

        Raises:
            MetricsFetchingError: if the varz config cannot be read after all retries.
        """
        self._should_enable_faucetizer = self._calculate_orchestration_config()

        self._metrics = ForchMetrics(self._config.varz_interface)
        self._metrics.start()

        self._varz_collector = VarzStateCollector()
        self._faucet_collector = FaucetStateCollector(
            self._config, is_faucetizer_enabled=self._should_enable_faucetizer)
        self._faucet_collector.set_placement_callback(self._process_device_placement)
        self._faucet_collector.set_get_gauge_metrics(
            lambda: self._varz_collector.retry_get_metrics(
                self._gauge_prom_endpoint, _TARGET_GAUGE_METRICS))
        # The port state manager is created later, so it is looked up lazily.
        self._faucet_collector.set_get_dva_state(
            (lambda switch, port:
             self._port_state_manager.get_dva_state(switch, port)
             if self._port_state_manager else None))
        self._faucet_collector.set_forch_metrics(self._metrics)

        self._faucet_state_scheduler = HeartbeatScheduler(interval_sec=1)
        self._faucet_state_scheduler.add_callback(
            self._faucet_collector.heartbeat_update_stack_state)
        self._faucet_state_scheduler.add_callback(self._verify_config_hash)

        gauge_metrics_interval_sec = self._config.dataplane_monitoring.gauge_metrics_interval_sec
        if gauge_metrics_interval_sec:
            self._initialize_gauge_metrics_scheduler(gauge_metrics_interval_sec)

        self._local_collector = LocalStateCollector(
            self._config.process, self.cleanup, self.handle_active_state,
            metrics=self._metrics)
        self._cpn_collector = CPNStateCollector(self._config.cpn_monitoring)

        faucet_prom_port = os.getenv('FAUCET_PROM_PORT', str(_DEFAULT_FAUCET_PROM_PORT))
        self._faucet_prom_endpoint = f"http://{_FAUCET_PROM_HOST}:{faucet_prom_port}"

        gauge_prom_port = os.getenv('GAUGE_PROM_PORT', str(_DEFAULT_GAUGE_PROM_PORT))
        self._gauge_prom_endpoint = f"http://{_GAUGE_PROM_HOST}:{gauge_prom_port}"

        self._initialize_orchestration()

        self._logger.info('Attaching event channel...')
        self._faucet_events = forch.faucet_event_client.FaucetEventClient(
            self._config.event_client)
        self._local_collector.initialize()
        self._cpn_collector.initialize()
        self._logger.info('Using peer controller %s', self._get_peer_controller_url())

        if str(self._config.proxy_server):
            self._varz_proxy = ForchProxy(self._config.proxy_server, content_type='text/plain')
            self._varz_proxy.start()

        self._validate_config_files()

        # Poll until the varz config can be read; each attempt is preceded by a sleep.
        varz_retry = 10
        while varz_retry > 0:
            time.sleep(10)
            try:
                self._get_varz_config()
                break
            except Exception as e:
                self._logger.error('Waiting for varz config: %s', e)
                varz_retry -= 1

        if varz_retry == 0:
            raise MetricsFetchingError('Could not get Faucet varz metrics')

        self._register_handlers()
        self._start()
        self._initialized = True

    def _initialize_orchestration(self):
        """Set up faucetizer, device report handler, port state manager and authenticator.

        Config problems found here disable auth (recorded in _forch_config_errors)
        instead of aborting initialization.
        """
        sequester_config = self._calculate_sequester_config()

        if self._should_enable_faucetizer:
            self._initialize_faucetizer(sequester_config.sequester_segment)
            self._faucetizer.reload_structural_config()

            if not self._faucetizer.tail_acl_config_valid():
                error_msg = 'All auth was disabled due to missing ACL for tail_acl config'
                self._logger.error(error_msg)
                with self._states_lock:
                    self._should_ignore_static_behavior = True
                    self._should_ignore_auth_result = True
                    self._forch_config_errors[TAIL_ACL_CONFIG] = error_msg

            if self._gauge_config_file:
                self._faucetizer.reload_and_flush_gauge_config(self._gauge_config_file)

            if self._segments_vlans_file:
                try:
                    self._faucetizer.reload_segments_to_vlans(self._segments_vlans_file)
                except Exception as error:
                    error_msg = (
                        'All auth was disabled due to error when reading '
                        'segments-to-vlans file')
                    self._should_ignore_static_behavior = True
                    self._should_ignore_auth_result = True
                    self._forch_config_errors[SEGMENTS_VLANS_FILE] = error_msg
                    self._logger.error(
                        '%s %s: %s', error_msg, self._segments_vlans_file, error)

        if sequester_config.sequester_segment:
            self._device_report_handler = self._create_device_report_handler()
            self._faucet_collector.set_device_state_reporter(self._device_report_handler)

        self._port_state_manager = PortStateManager(
            self._faucetizer, self, self._device_report_handler,
            orch_config=self._config.orchestration)

        self._attempt_authenticator_initialise()
        self._process_static_device_placement()
        self._process_static_device_behavior()
        if self._faucetizer:
            self._faucetizer.flush_behavioral_config(force=True)

    def _create_device_report_handler(self):
        """Build the DeviceReportClient from the sequester config, using defaults
        for an unset service address or port."""
        sequester_config = self._config.orchestration.sequester_config
        address = sequester_config.service_address or _DEFAULT_SERVER_ADDRESS
        port = sequester_config.service_port or _DEFAULT_SERVER_PORT
        service_target = f'{address}:{port}'
        unauth_vlan = self._config.orchestration.unauthenticated_vlan
        tunnel_ip = self._config.orchestration.sequester_config.tunnel_ip
        self._logger.info('Connecting report client to %s, local %s, vlan %s',
                          service_target, tunnel_ip, unauth_vlan)
        endpoint_handler = None
        if tunnel_ip:
            endpoint_handler = EndpointHandler(tunnel_ip, self._structural_config_file)
        return DeviceReportClient(self._handle_device_result, service_target, unauth_vlan,
                                  tunnel_ip, endpoint_handler=endpoint_handler)

    def _attempt_authenticator_initialise(self):
        """Create the authenticator, but only if auth_config is present in the config."""
        orch_config = self._config.orchestration
        if not orch_config.HasField('auth_config'):
            return
        self._logger.info('Initializing authenticator')
        self._authenticator = Authenticator(orch_config.auth_config, self._handle_auth_result,
                                            metrics=self._metrics)

    def _process_static_device_placement(self):
        """Load the configured static device placement file and watch it for changes."""
        placement_file_name = self._config.orchestration.static_device_placement
        if not placement_file_name:
            return
        placement_file_path = os.path.join(self._forch_config_dir, placement_file_name)
        # A missing file leaves content as None; the reload handler reports the failure.
        content = None
        if os.path.isfile(placement_file_path):
            with open(placement_file_path, 'r') as fd:
                content = fd.read()
        self._reload_static_device_placement(placement_file_path, content)
        self._config_file_watcher.register_file_callback(
            placement_file_path, self._reload_static_device_placement)

    def _reload_static_device_placement(self, file_path, new, current=None):
        """Apply the difference between the previous and new static placement content.

        Args:
            file_path: path of the placement file (used for logging only).
            new: new file content to parse.
            current: previous file content, if any, used to skip unchanged entries
                and to clear entries that were removed.
        """
        try:
            self._logger.info('Reading static device placement file: %s', file_path)
            new_mac_placements = yaml_content_proto(new, DevicesState).device_mac_placements
        except Exception as error:
            msg = f'All auth was disabled: could not load static placement file {file_path}'
            self._logger.error('%s: %s', msg, error)
            if self._faucetizer:
                self._faucetizer.clear_static_placements()
            with self._states_lock:
                self._forch_config_errors[STATIC_PLACEMENT_FILE] = msg
                self._should_ignore_auth_result = True
            return

        current_mac_placements = {}
        current_macs = set()
        if current:
            try:
                current_mac_placements = yaml_content_proto(
                    current, DevicesState).device_mac_placements
                current_macs = set(current_mac_placements)
            except Exception as error:
                # Same policy as the static behavior reload: an unparsable previous
                # content must not block loading the new content.
                self._logger.warning(
                    'Ignoring unparsable previous static placement content: %s', error)

        for eth_src, device_placement in new_mac_placements.items():
            if eth_src in current_macs:
                current_macs.remove(eth_src)
                # Look up the previous placement of this MAC (not of the whole set).
                current_placement = current_mac_placements[eth_src]
                if current_placement.SerializeToString() == device_placement.SerializeToString():
                    continue
                if self._faucetizer:
                    self._faucetizer.clear_static_placement(eth_src)
            self._process_device_placement(eth_src, device_placement, static=True)

        # Remove macs that're deleted
        if self._faucetizer:
            for mac in current_macs:
                self._faucetizer.clear_static_placement(mac)

    def _process_static_device_behavior(self):
        """Load the configured static device behavior file and watch it for changes,
        unless static behavior has been disabled."""
        if self._should_ignore_static_behavior:
            return
        behaviors_file_name = self._config.orchestration.static_device_behavior
        if not behaviors_file_name:
            return
        behaviors_file_path = os.path.join(self._forch_config_dir, behaviors_file_name)
        # A missing file leaves content as None; the reload handler reports the failure.
        content = None
        if os.path.isfile(behaviors_file_path):
            with open(behaviors_file_path, 'r') as fd:
                content = fd.read()
        self._reload_static_device_behavior(behaviors_file_path, content)
        self._config_file_watcher.register_file_callback(
            behaviors_file_path, self._reload_static_device_behavior)

    def _reload_static_device_behavior(self, file_path, new, current=None):
        """Apply the difference between the previous and new static behavior content.

        Args:
            file_path: path of the behavior file (used for logging only).
            new: new file content to parse.
            current: previous file content, if any, used to skip unchanged entries
                and to clear entries that were removed.
        """
        try:
            self._logger.info('Reading static device behavior file: %s', file_path)
            mac_behaviors = yaml_content_proto(new, DevicesState).device_mac_behaviors
        except Exception as error:
            msg = f'All auth was disabled: could not load static behavior file {file_path}'
            self._logger.error('%s: %s', msg, error)
            self._port_state_manager.clear_static_device_behaviors()
            with self._states_lock:
                self._forch_config_errors[STATIC_BEHAVIORAL_FILE] = msg
                self._should_ignore_auth_result = True
            return

        current_mac_behaviors = {}
        current_macs = set()
        if current:
            try:
                device_states = yaml_content_proto(current, DevicesState)
                current_mac_behaviors = device_states.device_mac_behaviors
                current_macs = set(current_mac_behaviors)
            except Exception:
                # Ignore any exceptions with the last content
                pass

        with self._states_lock:
            self._forch_config_errors.pop(STATIC_BEHAVIORAL_FILE, None)
            self._should_ignore_auth_result = False
            self._logger.info('Authentication resumed')

        for mac, device_behavior in mac_behaviors.items():
            if mac in current_macs:
                current_macs.remove(mac)
                current_behavior = current_mac_behaviors[mac]
                if current_behavior.SerializeToString() == device_behavior.SerializeToString():
                    continue
            self._port_state_manager.handle_static_device_behavior(mac, device_behavior)

        # Whatever is left was present before but is absent from the new content.
        for mac in current_macs:
            self._port_state_manager.clear_static_device_behavior(mac)

    def _handle_device_result(self, device_result):
        """Forward a device testing result to the port state manager."""
        return self._port_state_manager.handle_testing_result(device_result)

    def update_device_state_varz(self, mac, state):
        """Update the 'device_state' varz for a MAC, if metrics are available."""
        if self._metrics:
            self._metrics.update_var('device_state', state, labels=[mac])

    def update_static_vlan_varz(self, mac, vlan):
        """Update the 'static_mac_vlan' varz for a MAC, if metrics are available."""
        if self._metrics:
            self._metrics.update_var('static_mac_vlan', labels=[mac], value=vlan)

    def update_device_testing_vlans(self, mac, device_vlan, assigned_vlan):
        """Updates device testing vlan in device report handler"""
        if self._device_report_handler:
            self._device_report_handler.process_port_assign(mac, device_vlan, assigned_vlan)

    def _calculate_orchestration_config(self):
        """Resolve config directories and file paths.

        Returns:
            True if a structural config file is configured (faucetizer should be
            enabled), False otherwise.

        Raises:
            Exception: if the configured structural config file does not exist.
        """
        orch_config = self._config.orchestration

        self._forch_config_dir = os.getenv('FORCH_CONFIG_DIR', _DEFAULT_FORCH_CONFIG_DIR)
        self._faucet_config_dir = os.getenv('FAUCET_CONFIG_DIR', _DEFAULT_FAUCET_CONFIG_DIR)

        behavioral_config_file = (orch_config.behavioral_config_file or
                                  os.getenv('FAUCET_CONFIG_FILE') or
                                  _DEFAULT_BEHAVIORAL_CONFIG)
        self._behavioral_config_file = os.path.join(
            self._faucet_config_dir, behavioral_config_file)

        gauge_config_file = orch_config.gauge_config_file
        if gauge_config_file:
            self._gauge_config_file = os.path.join(self._forch_config_dir, gauge_config_file)

        structural_config_file = orch_config.structural_config_file
        if not structural_config_file:
            return False

        self._structural_config_file = os.path.join(
            self._forch_config_dir, structural_config_file)

        if not os.path.exists(self._structural_config_file):
            raise Exception(
                f'Structural config file does not exist: {self._structural_config_file}')

        if orch_config.segments_vlans_file:
            self._segments_vlans_file = os.path.join(
                self._forch_config_dir, orch_config.segments_vlans_file)
        else:
            with self._states_lock:
                self._should_ignore_static_behavior = True
                self._should_ignore_auth_result = True
            self._logger.info('All auth was disabled as segments_vlans_file is not configured')

        return True

    def _calculate_sequester_config(self):
        """Return the sequester config with defaults filled in.

        When no sequester_config is present an empty SequesterConfig is returned;
        otherwise the config's own message is updated in place and returned.
        """
        if not self._config.orchestration.HasField('sequester_config'):
            return OrchestrationConfig.SequesterConfig()
        config = self._config.orchestration.sequester_config
        config.sequester_segment = config.sequester_segment or DEFAULT_SEQUESTER_SEGMENT
        config.sequester_timeout_sec = (
            config.sequester_timeout_sec or DEFAULT_SEQUESTER_TIMEOUT_SEC)
        return config

    def _validate_config_files(self):
        """Check that the behavioral config exists and differs from the structural one.

        Raises:
            Exception: if either check fails.
        """
        if not os.path.exists(self._behavioral_config_file):
            raise Exception(
                f'Behavioral config file does not exist: {self._behavioral_config_file}')

        if self._structural_config_file == self._behavioral_config_file:
            raise Exception(
                'Structural and behavioral config file cannot be the same: '
                f'{self._behavioral_config_file}')

    def _initialize_faucetizer(self, sequester_segment=None):
        """Create the file watcher and faucetizer and hook up config reloading.

        With faucetize_interval_sec set, the structural config is reloaded and
        flushed periodically; otherwise reloads are driven by file change callbacks.
        """
        orch_config = self._config.orchestration

        self._config_file_watcher = FileChangeWatcher(
            os.path.dirname(self._structural_config_file))

        self._faucetizer = faucetizer.Faucetizer(
            orch_config, self._structural_config_file, self._behavioral_config_file, self,
            sequester_segment)

        def callback_adapter(func):
            # Watcher callbacks receive (file_path, new, current); these handlers
            # only take the path.
            return lambda file_path, new, current: func(file_path)

        if orch_config.faucetize_interval_sec:
            self._faucetize_scheduler = HeartbeatScheduler(orch_config.faucetize_interval_sec)

            update_write_faucet_config = (lambda: (
                self._faucetizer.reload_structural_config(),
                self._faucetizer.flush_behavioral_config(force=True)))
            self._faucetize_scheduler.add_callback(update_write_faucet_config)
        else:
            self._config_file_watcher.register_file_callback(
                self._structural_config_file,
                callback_adapter(self._faucetizer.reload_structural_config))
            # NOTE(review): nesting of the gauge watcher under this else-branch was
            # reconstructed from a flattened source -- confirm against history.
            if self._gauge_config_file:
                self._config_file_watcher.register_file_callback(
                    self._gauge_config_file,
                    callback_adapter(self._faucetizer.reload_and_flush_gauge_config))

        if self._segments_vlans_file:
            self._config_file_watcher.register_file_callback(
                self._segments_vlans_file,
                callback_adapter(self._faucetizer.reload_segments_to_vlans))

    def _initialize_gauge_metrics_scheduler(self, interval_sec):
        """Create the scheduler that periodically updates packet counts from gauge metrics."""
        get_gauge_metrics = (
            lambda target_metrics:
            self._varz_collector.retry_get_metrics(self._gauge_prom_endpoint, target_metrics))
        heartbeat_update_packet_count = functools.partial(
            self._faucet_collector.heartbeat_update_packet_count,
            interval=interval_sec, get_metrics=get_gauge_metrics)
        self._gauge_metrics_scheduler = HeartbeatScheduler(interval_sec=interval_sec)
        self._gauge_metrics_scheduler.add_callback(heartbeat_update_packet_count)

    def reregister_include_file_watchers(self, old_include_files, new_include_files):
        """reregister the include file watchers"""
        self._config_file_watcher.unregister_file_callbacks(old_include_files)
        for new_include_file in new_include_files:
            self._config_file_watcher.register_file_callback(
                new_include_file, self._faucetizer.reload_include_file)

    def _process_device_placement(self, eth_src, device_placement, static=False):
        """Call device placement API for faucetizer/authenticator"""
        propagate_placement, mac, stale_mac = self._port_state_manager.handle_device_placement(
            eth_src, device_placement, static)

        src_mac = mac if mac else eth_src

        if self._authenticator and propagate_placement:
            if stale_mac:
                # Disconnect the stale MAC before reporting the new placement.
                self._authenticator.process_device_placement(
                    stale_mac, DevicePlacement(connected=False))
            self._authenticator.process_device_placement(src_mac, device_placement)
        else:
            self._logger.info(
                'Ignored deauthentication for %s on %s:%s',
                src_mac, device_placement.switch, device_placement.port)

    def _handle_auth_result(self, mac, access, segment, role):
        """Record an auth result and, unless auth results are being ignored,
        hand the resulting device behavior to the port state manager."""
        self._faucet_collector.update_radius_result(mac, access, segment, role)
        with self._states_lock:
            if self._should_ignore_auth_result:
                self._logger.warning('Ingoring authentication result for device %s', mac)
            else:
                device_behavior = DeviceBehavior(segment=segment, role=role)
                self._port_state_manager.handle_device_behavior(mac, device_behavior)

    def _register_handlers(self):
        """Register a handler for each Faucet event type with the event client."""
        fcoll = self._faucet_collector
        handlers = [
            (FaucetEvent.ConfigChange, self._process_config_change),
            (FaucetEvent.DpChange, lambda event: fcoll.process_dp_change(
                event.timestamp, event.dp_name, None, event.reason == "cold_start")),
            (FaucetEvent.LagChange, lambda event: fcoll.process_lag_state(
                event.timestamp, event.dp_name, event.port_no, event.role, event.state)),
            (FaucetEvent.StackState, lambda event: fcoll.process_stack_state(
                event.timestamp, event.dp_name, event.port, event.state)),
            (FaucetEvent.StackTopoChange, fcoll.process_stack_topo_change_event),
            (FaucetEvent.PortChange, fcoll.process_port_change),
            (FaucetEvent.L2Learn, lambda event: fcoll.process_port_learn(
                event.timestamp, event.dp_name, event.port_no,
                event.eth_src, event.vid, event.l3_src_ip)),
            (FaucetEvent.L2Expire, lambda event: fcoll.process_port_expire(
                event.timestamp, event.dp_name, event.port_no, event.eth_src, event.vid)),
        ]

        self._faucet_events.register_handlers(handlers)

    def _get_varz_config(self):
        """Fetch Faucet metrics and extract the reported config hashes.

        Returns:
            Tuple of (metrics, config hashes label value).

        Raises:
            AssertionError: if there is not exactly one config hash info sample.
            Exception: if the sample carries a config error.
        """
        metrics = self._varz_collector.retry_get_metrics(
            self._faucet_prom_endpoint, _TARGET_FAUCET_METRICS)
        varz_hash_info = metrics['faucet_config_hash_info']
        assert len(varz_hash_info.samples) == 1, 'exactly one config hash info not found'
        varz_config_hashes = varz_hash_info.samples[0].labels['hashes']
        varz_config_error = varz_hash_info.samples[0].labels['error']

        if varz_config_error:
            raise Exception(f'Varz config error: {varz_config_error}')

        return metrics, varz_config_hashes

    def _restore_states(self):
        """Restore faucet config and collector state from varz metrics."""
        # Make sure the event socket is connected so there's no loss of information. Ordering
        # is important here, need to connect the socket before scraping current state to avoid
        # loss of events inbetween.
        assert self._faucet_events.event_socket_connected, 'restore states without connection'

        # Restore config first before restoring all state from varz.
        metrics, varz_config_hashes = self._get_varz_config()
        self._restore_faucet_config(time.time(), varz_config_hashes)

        event_horizon = self._faucet_collector.restore_states_from_metrics(metrics)
        self._faucet_events.set_event_horizon(event_horizon)

    def _restore_faucet_config(self, timestamp, config_hash):
        """Re-read the faucet config from file and push it to the faucet collector.

        A mismatch between config_hash and the hash of the file content is only
        logged as a warning.
        """
        config_info, faucet_dps, behavioral_config = self._get_faucet_config()
        self._behavioral_config = behavioral_config
        self._update_config_warning_varz()

        if config_hash != config_info['hashes']:
            self._logger.warning('Config hash does not match')
        self._last_received_faucet_config_hash = config_hash

        self._faucet_collector.process_dataplane_config_change(timestamp, faucet_dps)

    def _process_config_change(self, event):
        """Handle a Faucet ConfigChange event."""
        self._faucet_collector.process_dp_config_change(
            event.timestamp, event.dp_name, event.restart_type, event.dp_id)
        if event.config_hash_info.hashes:
            self._restore_faucet_config(event.timestamp, event.config_hash_info.hashes)

    def _verify_config_hash(self):
        """Check that the last received config hash matches the config file once the
        verification timeout since the last config write has elapsed.

        Raises:
            Exception: if the hashes still differ after the timeout.
        """
        with self._timer_lock:
            if not self._last_faucet_config_writing_time:
                return

            elapsed_time = time.time() - self._last_faucet_config_writing_time
            if elapsed_time < self._config_hash_verification_timeout_sec:
                return

            config_info, _, _ = self._get_faucet_config()
            if config_info['hashes'] != self._last_received_faucet_config_hash:
                raise Exception(
                    f'Config hash does not match after '
                    f'{self._config_hash_verification_timeout_sec} seconds')

            # Verified; nothing more to check until the next config write.
            self._last_faucet_config_writing_time = None

    def reset_faucet_config_writing_time(self):
        """reset faucet config writing time"""
        with self._timer_lock:
            self._last_faucet_config_writing_time = time.time()

    def _faucet_events_connect(self):
        """Try to connect the event socket and restore states; failures are logged
        and recorded on the faucet collector instead of being raised."""
        self._logger.info('Attempting faucet event sock connection...')
        time.sleep(1)
        try:
            self._faucet_events.connect()
            self._restore_states()
            self._faucet_collector.set_state_restored(True)
        except Exception as e:
            self._logger.error("Cannot restore states or connect to faucet: %s", str(e))
            self._faucet_collector.set_state_restored(False, e)

    def main_loop(self):
        """Main event processing loop

        Returns:
            False if the instance was not initialized, True once the loop exits.
        """
        if not self._initialized:
            self._logger.warning('Not properly initialized')
            return False

        self._logger.info('Entering main event loop...')
        try:
            while self._faucet_events:
                while not self._faucet_events.event_socket_connected:
                    self._faucet_events_connect()

                try:
                    self._faucet_events.next_event(blocking=True)
                except FaucetEventOrderError as e:
                    # Out-of-order events mean state may be stale: count and resync.
                    self._logger.error("Faucet event order error: %s", e)
                    if self._metrics:
                        self._metrics.inc_var('faucet_event_out_of_sequence_count')
                    self._restore_states()
        except KeyboardInterrupt:
            self._logger.info('Keyboard interrupt. Exiting.')
            self._faucet_events.disconnect()
        except Exception as e:
            self._logger.error("Exception found in main loop: %s", e)
            raise
        return True

    def _start(self):
        """Start forchestrator components"""
        if self._faucetize_scheduler:
            self._faucetize_scheduler.start()
        if self._config_file_watcher:
            self._config_file_watcher.start()
        if self._faucet_state_scheduler:
            self._faucet_state_scheduler.start()
        if self._gauge_metrics_scheduler:
            self._gauge_metrics_scheduler.start()
        if self._metrics:
            self._metrics.update_var('forch_version', {'version': __version__})
        if self._device_report_handler:
            self._device_report_handler.start()

    def stop(self):
        """Stop forchestrator components"""
        if self._faucetize_scheduler:
            self._faucetize_scheduler.stop()
        if self._faucet_state_scheduler:
            self._faucet_state_scheduler.stop()
        # Started in _start(), so it has to be stopped here as well.
        if self._gauge_metrics_scheduler:
            self._gauge_metrics_scheduler.stop()
        if self._authenticator:
            self._authenticator.stop()
        if self._config_file_watcher:
            self._config_file_watcher.stop()
        if self._metrics:
            self._metrics.stop()
        if self._varz_proxy:
            self._varz_proxy.stop()
        if self._device_report_handler:
            self._device_report_handler.stop()

    def _get_controller_info(self, target):
        """Return (host, port) for a controller name, or a placeholder host with the
        default port when the name is not in the site config."""
        controllers = self._config.site.controllers
        if target not in controllers:
            return (f'missing_target_{target}', _DEFAULT_PORT)
        controller = controllers[target]
        host = controller.fqdn or target
        port = controller.port or _DEFAULT_PORT
        return (host, port)

    def get_local_port(self):
        """Get the local port for this instance"""
        info = self._get_controller_info(self._get_controller_name())
        self._logger.info('Local controller is at %s on %s', info[0], info[1])
        return int(info[1])

    def _make_controller_url(self, info):
        """Format a (host, port) pair as an http URL."""
        return f'http://{info[0]}:{info[1]}'

    def _get_local_controller_url(self):
        """Return the URL of this controller."""
        return self._make_controller_url(
            self._get_controller_info(self._get_controller_name()))

    def _get_peer_controller_name(self):
        """Return the name of the other controller, or a placeholder string when this
        controller is unknown or there are not exactly two controllers."""
        name = self._get_controller_name()
        controllers = self._config.site.controllers
        if name not in controllers:
            return f'missing_controller_name_{name}'
        if len(controllers) != 2:
            return 'num_controllers_%s' % len(controllers)
        things = set(controllers.keys())
        things.remove(name)
        return list(things)[0]

    def _get_peer_controller_info(self):
        """Return (host, port) of the peer controller."""
        return self._get_controller_info(self._get_peer_controller_name())

    def _get_peer_controller_url(self):
        """Return the URL of the peer controller."""
        return self._make_controller_url(self._get_peer_controller_info())

    def _get_controller_name(self):
        """Return this controller's name from the CONTROLLER_NAME environment variable."""
        return os.getenv('CONTROLLER_NAME')

    def get_system_state(self, path, params):
        """Get an overview of the system state"""
        system_state = SystemState()
        self._populate_versions(system_state.versions)
        system_state.peer_controller_url = self._get_peer_controller_url()
        system_state.summary_sources.CopyFrom(self._get_system_summary(path))
        system_state.site_name = self._config.site.name or 'unknown'
        system_state.controller_name = self._get_controller_name()
        system_state.authentication_mode = self._get_sys_auth_mode()
        system_state.config_summary.CopyFrom(self._get_config_summary())
        self._distill_summary(system_state.summary_sources, system_state)
        return system_state

    def _distill_summary(self, summaries, system_state):
        """Fold the per-subsystem summaries into the overall system state fields.

        Any exception marks the system state as broken with the error as detail.
        """
        try:
            start_time = self._start_time
            summary_values = [value[1] for value in summaries.ListFields()]
            change_counts = [subsystem.change_count or 0 for subsystem in summary_values]
            last_changes = [subsystem.last_change or start_time for subsystem in summary_values]
            last_updates = [subsystem.last_update or start_time for subsystem in summary_values]
            summary, detail = self._get_combined_summary(summaries)
            system_state.system_state = summary
            system_state.system_state_detail = detail
            self._logger.info('system_state_change_count sources: %s', change_counts)
            system_state.system_state_change_count = sum(change_counts)
            system_state.system_state_last_change = max(last_changes)
            system_state.system_state_last_update = max(last_updates)
        except Exception as e:
            system_state.system_state = State.broken
            system_state.system_state_detail = str(e)
            self._logger.exception(e)

    def _get_config_details(self):
        """Return a detail suffix naming the configs that have errors, or ''."""
        config_details = []
        detail = ''
        if self._forch_config_errors:
            config_details.append('forch')

        if config_details:
            detail += '. config errors: ' + ', '.join(config_details)

        return detail

    def _get_combined_summary(self, summary):
        """Combine controller state, subsystem states and recorded errors.

        Returns:
            Tuple of (state, detail string).
        """
        controller_state, controller_state_detail = self._get_controller_state()
        if controller_state != State.active:
            return controller_state, controller_state_detail

        has_error = False
        has_warning = False
        details = []
        detail = ''
        for field, subsystem in summary.ListFields():
            state = subsystem.state
            if state in (State.down, State.broken):
                has_error = True
                details.append(field.name)
            elif state != State.healthy:
                has_warning = True
                details.append(field.name)
        if details:
            detail += 'broken subsystems: ' + ', '.join(details)

        detail += self._get_config_details()

        if not self._faucet_events.event_socket_connected:
            has_error = True
            detail += '. Faucet disconnected'

        with self._states_lock:
            # _system_errors is a dict; report its values (iterating the dict itself
            # would yield the keys, which have no .values()).
            system_errors = [str(error) for error in self._system_errors.values() if error]
        if system_errors:
            has_error = True
            detail += '. ' + '. '.join(system_errors)

        if not detail:
            detail = 'n/a'

        if has_error:
            return State.broken, detail
        if has_warning:
            return State.damaged, detail
        return State.healthy, detail

    def _get_system_summary(self, path):
        """Collect the summaries of all subsystems and attach a detail URL to each."""
        states = SystemState.SummarySources()
        states.cpn_state.CopyFrom(self._cpn_collector.get_cpn_summary())
        states.process_state.CopyFrom(self._local_collector.get_process_summary())
        states.dataplane_state.CopyFrom(self._faucet_collector.get_dataplane_summary())
        states.switch_state.CopyFrom(self._faucet_collector.get_switch_summary())
        states.list_hosts.CopyFrom(self._faucet_collector.get_host_summary())
        states.vrrp_state.CopyFrom(self._local_collector.get_vrrp_summary())
        url_base = self._extract_url_base(path)
        for field, value in states.ListFields():
            value.detail_url = f'{url_base}/?{field.name}'
        return states

    def _get_config_summary(self):
        """Return a ConfigSummary with the faucet config summary and forch config errors."""
        config_summary = SystemState.ConfigSummary()
        config_summary.faucet_config.CopyFrom(self._faucet_config_summary)
        config_summary.forch_config.errors.update(self._forch_config_errors)
        return config_summary

    def _extract_url_base(self, path):
        """Return 'http://<host>' where host is the part of path before the first '/'.

        A path without any '/' is used as the host in full.
        """
        host = path.partition('/')[0]
        return f'http://{host}'

    def _augment_state_reply(self, reply, path):
        """Add the system state URL to a reply (protobuf Message or dict) and return it."""
        url = self._extract_url_base(path)
        if isinstance(reply, Message):
            reply.system_state_url = url
        else:
            reply['system_state_url'] = url
        return reply

    def _get_controller_state(self):
        """Determine the controller state from the active state and CPN state.

        Returns:
            Tuple of (state, detail); detail is None when the controller is active.
        """
        with self._active_state_lock:
            if self._active_state == State.initializing:
                return State.initializing, 'Initializing'
            if self._active_state == State.inactive:
                detail = 'This controller is inactive. Please view peer controller.'
                return State.inactive, detail

        cpn_state = self._cpn_collector.get_cpn_state()
        peer_controller = self._get_peer_controller_name()

        if peer_controller in cpn_state.cpn_nodes:
            peer_controller_state = cpn_state.cpn_nodes[peer_controller].state
        else:
            self._logger.error('Cannot get peer controller state for %s', peer_controller)
            peer_controller_state = State.broken

        if cpn_state.cpn_state == State.initializing:
            return State.initializing, 'Initializing'

        if peer_controller_state != State.healthy:
            return State.split, 'Lost reachability to peer controller.'

        return State.active, None

    def _get_faucet_config_hash_info(self, new_conf_hashes):
        """Return comma-joined config file names and hashes, sorted by file name and
        skipping files whose hash is None."""
        # Code taken from faucet/valves_manager.py parse_configs.
        new_present_conf_hashes = [
            (conf_file, conf_hash) for conf_file, conf_hash in sorted(new_conf_hashes.items())
            if conf_hash is not None]
        conf_files = [conf_file for conf_file, _ in new_present_conf_hashes]
        conf_hashes = [conf_hash for _, conf_hash in new_present_conf_hashes]
        return dict(config_files=','.join(conf_files), hashes=','.join(conf_hashes))

    def _get_faucet_config(self):
        """Parse the behavioral config file and refresh the faucet config summary.

        Returns:
            Tuple of (config hash info dict, parsed dps, top-level config).

        Raises:
            Exception: re-raised after logging when the config cannot be read.
        """
        try:
            new_conf_hashes, _, new_dps, top_conf = config_parser.dp_parser(
                self._behavioral_config_file, 'fconfig')
            config_hash_info = self._get_faucet_config_hash_info(new_conf_hashes)
            self._faucet_config_summary = SystemState.FaucetConfigSummary()
            for file_name, file_hash in new_conf_hashes.items():
                self._logger.info('Loaded conf %s as %s', file_name, file_hash)
                self._faucet_config_summary.hashes[file_name] = file_hash
            for warning, message in self._validate_config(top_conf):
                self._logger.warning('Config warning %s: %s', warning, message)
                self._faucet_config_summary.warnings[warning] = message
            return config_hash_info, new_dps, top_conf
        except Exception as e:
            self._logger.error('Cannot read faucet config: %s', e)
            raise

    def _validate_config(self, config):
        """Check a parsed faucet config and return a list of (key, message) warnings.

        Raises:
            Exception: if a dp uses the unsupported 'interface_ranges' parameter.
        """
        warnings = []
        faucet_dp_macs = set()
        for dp_name, dp_obj in config['dps'].items():
            if 'interface_ranges' in dp_obj:
                raise Exception(
                    'Forch does not support parameter \'interface_ranges\' in faucet config')
            if 'faucet_dp_mac' not in dp_obj:
                warnings.append((dp_name, 'faucet_dp_mac not defined'))
            else:
                faucet_dp_macs.add(dp_obj['faucet_dp_mac'])
            for if_name, if_obj in dp_obj['interfaces'].items():
                if_key = '%s:%02d' % (dp_name, int(if_name))
                # An interface without a description is simply neither TAP nor MIRROR.
                description = if_obj.get('description')
                is_egress = 1 if 'lacp' in if_obj else 0
                is_stack = 1 if 'stack' in if_obj else 0
                is_access = 1 if 'native_vlan' in if_obj else 0
                is_tap = 1 if description == 'TAP' else 0
                is_mirror = 1 if description == 'MIRROR' else 0
                # Each interface is expected to have exactly one role.
                if (is_egress + is_stack + is_access + is_tap + is_mirror) != 1:
                    warnings.append((if_key, 'misconfigured interface config: %d %d %d %d %d' %
                                     (is_egress, is_stack, is_access, is_tap, is_mirror)))
                if 'loop_protect_external' in if_obj:
                    warnings.append((if_key, 'deprecated loop_protect_external'))
                if is_access and 'max_hosts' not in if_obj:
                    warnings.append((if_key, 'missing recommended max_hosts'))

        if len(faucet_dp_macs) > 1:
            warnings.append(('faucet_dp_mac', 'faucet_dp_mac for DPs are not identical'))

        return warnings

    def _update_config_warning_varz(self):
        """Publish the faucet config warning count and each warning as varz."""
        self._metrics.update_var(
            'faucet_config_warning_count', len(self._faucet_config_summary.warnings))
        for warning_key, warning_msg in self._faucet_config_summary.warnings.items():
            self._metrics.update_var('faucet_config_warning', 1, [warning_key, warning_msg])

    def _populate_versions(self, versions):
        """Fill in the forch and faucet versions; a failure to obtain the faucet
        version is reported in the faucet version field itself."""
        versions.forch = __version__
        try:
            # Context manager closes the pipe instead of leaking it.
            with os.popen('faucet --version') as pipe:
                versions.faucet = pipe.read().strip().split()[1]
        except Exception as e:
            versions.faucet = f'Cannot get faucet version: {e}'

    def _get_ryu_config(self):
        """Return the ryu config parameters reported in the Faucet metrics, or a
        warnings dict when they are missing."""
        metrics = self._varz_collector.retry_get_metrics(
            self._faucet_prom_endpoint, _TARGET_FAUCET_METRICS)
        if 'ryu_config' not in metrics or not metrics['ryu_config'].samples:
            return {
                'warnings': 'Ryu config is missing'
            }

        ryu_config = {}
        for sample in metrics['ryu_config'].samples:
            param = sample.labels['param']
            value = sample.value
            ryu_config[param] = value

        return ryu_config

    def _get_sys_auth_mode(self):
        """Return the AuthMode given which of static and dynamic auth are in effect."""
        static_auth_enabled = (
            self._config.orchestration.static_device_behavior and
            not self._should_ignore_static_behavior and
            not self._forch_config_errors.get(STATIC_BEHAVIORAL_FILE))
        dynamic_auth_enabled = (self._authenticator and not self._should_ignore_auth_result)

        if static_auth_enabled and dynamic_auth_enabled:
            sys_auth_mode = AuthMode.all
        elif static_auth_enabled:
            sys_auth_mode = AuthMode.static_only
        elif dynamic_auth_enabled:
            sys_auth_mode = AuthMode.dynamic_only
        else:
            sys_auth_mode = AuthMode.disabled

        return sys_auth_mode

    def update_initialization_varz(self):
        """Update Forch initialization Varz"""
        if not self._metrics:
            return
        sys_auth_mode = AuthMode.Mode.Name(self._get_sys_auth_mode())
        self._metrics.update_var('system_initialization', {'auth_mode': sys_auth_mode})

    def cleanup(self):
        """Clean up relevant internal data in all collectors"""
        self._faucet_collector.cleanup()

    def handle_active_state(self, active_state):
        """Handler for local state collector to handle controller active state"""
        with self._active_state_lock:
            self._active_state = active_state

    def get_switch_state(self, path, params):
        """Get the state of the switches"""
        switch = params.get('switch')
        port = params.get('port')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_switch_state(switch, port, host)
        return self._augment_state_reply(reply, path)

    def get_dataplane_state(self, path, params):
        """Get the dataplane state overview"""
        reply = self._faucet_collector.get_dataplane_state()
        return self._augment_state_reply(reply, path)

    def get_host_path(self, path, params):
        """Get active host path"""
        eth_src = params.get('eth_src')
        eth_dst = params.get('eth_dst')
        to_egress = params.get('to_egress') == 'true'
        reply = self._faucet_collector.get_host_path(eth_src, eth_dst, to_egress)
        return self._augment_state_reply(reply, path)

    def get_list_hosts(self, path, params):
        """List learned access devices"""
        eth_src = params.get('eth_src')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_list_hosts(host, eth_src)
        return self._augment_state_reply(reply, path)

    def get_cpn_state(self, path, params):
        """Get CPN state"""
        reply = self._cpn_collector.get_cpn_state()
        return self._augment_state_reply(reply, path)

    def get_process_state(self, path, params):
        """Get certain processes state on the controller machine"""
        reply = self._local_collector.get_process_state()
        return self._augment_state_reply(reply, path)

    def get_vrrp_state(self, path, params):
        """Get VRRP state"""
        reply = self._local_collector.get_vrrp_state()
        return self._augment_state_reply(reply, path)

    def get_sys_config(self, path, params):
        """Get overall config from faucet config file

        On any failure an error string is returned instead of the reply dict.
        """
        try:
            assert self._behavioral_config, 'behavioral config not initialized'
            faucet_config_map = {
                'behavioral': self._behavioral_config,
                'warnings': dict(self._faucet_config_summary.warnings)
            }
            reply = {
                'faucet': faucet_config_map,
                'forch': proto_dict(self._config),
                'ryu': self._get_ryu_config()
            }
            if self._faucetizer:
                reply['faucet']['structural'] = self._faucetizer.get_structural_config()
            return self._augment_state_reply(reply, path)
        except Exception as e:
            return f"Cannot read faucet config: {e}"
class Forchestrator:
    """Main class encompassing faucet orchestrator components for dynamically
    controlling faucet ACLs at runtime"""

    _DETAIL_FORMAT = '%s is %s: %s'

    def __init__(self, config):
        """Keep the config mapping and create the state collectors"""
        self._config = config
        self._faucet_events = None
        self._server = None
        self._start_time = datetime.fromtimestamp(time.time()).isoformat()
        self._faucet_collector = FaucetStateCollector()
        self._local_collector = LocalStateCollector(config.get('process'))
        self._cpn_collector = CPNStateCollector()

    def initialize(self):
        """Initialize forchestrator instance"""
        LOGGER.info('Attaching event channel...')
        self._faucet_events = forch.faucet_event_client.FaucetEventClient(
            self._config.get('event_client', {}))
        self._faucet_events.connect()
        self._local_collector.initialize()
        self._cpn_collector.initialize()
        LOGGER.info('Using peer controller %s', self._get_peer_controller_url())

    def main_loop(self):
        """Main event processing loop"""
        LOGGER.info('Entering main event loop...')
        try:
            # Each call drains events until an empty one is seen; keep going
            # for as long as it reports that the event client still exists.
            while self._handle_faucet_events():
                pass
        except KeyboardInterrupt:
            LOGGER.info('Keyboard interrupt. Exiting.')
            self._faucet_events.disconnect()
        except Exception as e:
            LOGGER.error("Exception: %s", e)
            raise

    # TODO: This should likely be moved into the faucet_state_collector.
    # pylint: disable=too-many-locals
    def _handle_faucet_events(self):
        """Dispatch faucet events to the faucet collector.

        Returns True when an empty event is received and False when there is
        no event client to read from.
        """
        while self._faucet_events:
            event = self._faucet_events.next_event(blocking=True)
            if not event:
                return True

            timestamp = event.get("time")
            LOGGER.debug("Event: %r", event)

            # Every as_*() decoder is tried on every event; the guards below
            # decide whether the decoded tuple is acted upon.
            (name, dpid, port, active) = self._faucet_events.as_port_state(event)
            if dpid and port:
                LOGGER.debug('Port state %s %s %s', name, port, active)
                self._faucet_collector.process_port_state(
                    timestamp, name, port, active)

            (name, dpid, port, target_mac, src_ip) = \
                self._faucet_events.as_port_learn(event)
            if dpid and port:
                LOGGER.debug('Port learn %s %s %s', name, port, target_mac)
                self._faucet_collector.process_port_learn(
                    timestamp, name, port, target_mac, src_ip)

            (name, dpid, restart_type, dps_config) = \
                self._faucet_events.as_config_change(event)
            if dpid is not None:
                LOGGER.debug('DP restart %s %s', name, restart_type)
                self._faucet_collector.process_dp_config_change(
                    timestamp, name, restart_type, dpid)
            if dps_config:
                LOGGER.debug('Config change. New config: %s', dps_config)
                self._faucet_collector.process_dataplane_config_change(
                    timestamp, dps_config)

            (stack_root, graph, dps) = \
                self._faucet_events.as_stack_topo_change(event)
            if stack_root is not None:
                LOGGER.debug('stack dataplane_state change root:%s', stack_root)
                self._faucet_collector.process_stack_topo_change(
                    timestamp, stack_root, graph, dps)

            (name, port, active) = self._faucet_events.as_lag_status(event)
            if name and port:
                LOGGER.debug('LAG state %s %s %s', name, port, active)
                self._faucet_collector.process_lag_state(
                    timestamp, name, port, active)

            (name, connected) = self._faucet_events.as_dp_change(event)
            if name:
                LOGGER.debug('DP %s connected %r', name, connected)
                self._faucet_collector.process_dp_change(
                    timestamp, name, connected)
        return False

    def _get_controller_info(self, target):
        """Return a (host, port) tuple for the named controller.

        An unknown target yields a 'missing_target_<target>' placeholder host
        with the default port.
        """
        controllers = self._config.get('site', {}).get('controllers', {})
        if target not in controllers:
            return (f'missing_target_{target}', _DEFAULT_PORT)
        # A controller may be listed with an empty/None value in the config.
        controller = controllers[target] or {}
        port = controller.get('port', _DEFAULT_PORT)
        host = controller.get('fqdn', target)
        return (host, port)

    def get_local_port(self):
        """Get the local port for this instance"""
        info = self._get_controller_info(self._get_controller_name())
        LOGGER.info('Local controller is at %s on %s', info[0], info[1])
        return int(info[1])

    def _make_controller_url(self, info):
        """Build an http URL from a (host, port) tuple"""
        return f'http://{info[0]}:{info[1]}'

    def _get_local_controller_url(self):
        """Return the URL of the controller named by CONTROLLER_NAME"""
        return self._make_controller_url(
            self._get_controller_info(self._get_controller_name()))

    def _get_peer_controller_info(self):
        """Return (host, port) of the other controller in a two-controller site.

        A placeholder host describing the problem is returned when this
        controller is not configured or the site does not list exactly two
        controllers.
        """
        name = self._get_controller_name()
        controllers = self._config.get('site', {}).get('controllers', {})
        if name not in controllers:
            return (f'missing_controller_name_{name}', _DEFAULT_PORT)
        if len(controllers) != 2:
            return (f'num_controllers_{len(controllers)}', _DEFAULT_PORT)
        things = set(controllers.keys())
        things.remove(name)
        peer = list(things)[0]
        return self._get_controller_info(peer)

    def _get_peer_controller_url(self):
        """Return the URL of the peer controller"""
        return self._make_controller_url(self._get_peer_controller_info())

    def _get_controller_name(self):
        """Return the CONTROLLER_NAME environment variable (None if unset)"""
        return os.getenv('CONTROLLER_NAME')

    def get_system_state(self, path, params):
        """Get an overview of the system state"""
        system_summary = self._get_system_summary(path)
        overview = {
            'peer_controller_url': self._get_peer_controller_url(),
            'summary_sources': system_summary,
            'site_name': self._config.get('site', {}).get('name', 'unknown'),
            'controller_name': self._get_controller_name(),
        }
        overview.update(self._distill_summary(system_summary))
        return overview

    def _distill_summary(self, summaries):
        """Fold the per-subsystem summaries into overall system_state fields.

        Any exception raised while combining is logged and reported as an
        'error' system state carrying the exception text.
        """
        try:
            start_time = self._start_time
            subsystems = list(summaries.values())
            change_counts = [
                subsystem.get('change_count', 0) for subsystem in subsystems]
            last_changes = [
                subsystem.get('last_change', start_time)
                for subsystem in subsystems]
            last_updates = [
                subsystem.get('last_update', start_time)
                for subsystem in subsystems]
            summary, detail = self._get_combined_summary(summaries)
            system_summary = {
                'system_state': summary,
                'system_state_detail': detail,
                'system_state_change_count': sum(change_counts),
                'system_state_last_change': max(last_changes),
                'system_state_last_update': max(last_updates)
            }
        except Exception as e:
            system_summary = {
                'system_state': 'error',
                'system_state_detail': str(e)
            }
            LOGGER.exception('Calculating state summary')
        return system_summary

    def _get_combined_summary(self, summary):
        """Return a (state, detail) pair combining all subsystem states"""
        has_error = False
        has_warning = False
        details = []
        for subsystem_name, subsystem in summary.items():
            # A subsystem that reports no state at all is treated as broken.
            state = subsystem.get('state', constants.STATE_BROKEN)
            if state in (constants.STATE_DOWN, constants.STATE_BROKEN):
                has_error = True
                details.append(subsystem_name)
            elif state != constants.STATE_HEALTHY:
                has_warning = True
                details.append(subsystem_name)

        if details:
            detail = 'broken subsystems: ' + ', '.join(details)
        else:
            detail = 'n/a'

        if not self._faucet_events.event_socket_connected:
            has_error = True
            detail += '. Faucet disconnected.'

        # A non-master controller reports inactive regardless of the
        # subsystem findings above.
        vrrp_state = self._local_collector.get_vrrp_state()
        if not vrrp_state.get('is_master'):
            detail = 'This controller is inactive. Please view peer controller.'
            return constants.STATE_INACTIVE, detail

        if has_error:
            return constants.STATE_BROKEN, detail
        if has_warning:
            return constants.STATE_DAMAGED, detail
        return constants.STATE_HEALTHY, detail

    def _get_system_summary(self, path):
        """Collect each subsystem summary and tag it with its detail URL"""
        states = {
            'cpn_state': self._cpn_collector.get_cpn_summary(),
            'process_state': self._local_collector.get_process_summary(),
            'dataplane_state': self._faucet_collector.get_dataplane_summary(),
            'switch_state': self._faucet_collector.get_switch_summary(),
            'list_hosts': self._faucet_collector.get_host_summary()
        }
        url_base = self._extract_url_base(path)
        for state, summary in states.items():
            summary['url'] = f'{url_base}/?{state}'
        return states

    def _extract_url_base(self, path):
        """Return 'http://<host>' where host is the part of path before the first '/'"""
        # partition() keeps the whole path when it has no '/'; slicing with
        # find() dropped the last character in that case (find() returns -1).
        host = path.partition('/')[0]
        return f'http://{host}'

    def _augment_state_reply(self, reply, path):
        """Add the 'system_state_url' entry to reply in place"""
        url = self._extract_url_base(path)
        reply['system_state_url'] = url

    def get_switch_state(self, path, params):
        """Get the state of the switches"""
        switch = params.get('switch')
        port = params.get('port')
        reply = self._faucet_collector.get_switch_state(switch, port)
        self._augment_state_reply(reply, path)
        return reply

    def get_dataplane_state(self, path, params):
        """Get the dataplane state overview"""
        reply = self._faucet_collector.get_dataplane_state()
        self._augment_state_reply(reply, path)
        return reply

    def get_host_path(self, path, params):
        """Get active host path"""
        eth_src = params.get('eth_src')
        eth_dst = params.get('eth_dst')
        to_egress = params.get('to_egress') == 'true'
        reply = self._faucet_collector.get_host_path(eth_src, eth_dst, to_egress)
        self._augment_state_reply(reply, path)
        return reply

    def get_list_hosts(self, path, params):
        """List learned access devices"""
        eth_src = params.get('eth_src')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_list_hosts(host, eth_src)
        self._augment_state_reply(reply, path)
        return reply

    def get_cpn_state(self, path, params):
        """Get CPN state"""
        reply = self._cpn_collector.get_cpn_state()
        self._augment_state_reply(reply, path)
        return reply

    def get_process_state(self, path, params):
        """Get certain processes state on the controller machine"""
        reply = self._local_collector.get_process_state()
        self._augment_state_reply(reply, path)
        return reply
class Forchestrator:
    """Main class encompassing faucet orchestrator components for dynamically
    controlling faucet ACLs at runtime"""

    _DETAIL_FORMAT = '%s is %s: %s'

    def __init__(self, config):
        """Keep the config mapping; collectors are created in initialize()"""
        self._config = config
        self._faucet_config_file = None
        self._faucet_events = None
        self._start_time = datetime.fromtimestamp(time.time()).isoformat()
        self._faucet_collector = None
        self._varz_collector = None
        self._local_collector = None
        self._cpn_collector = None
        self._initialized = False
        self._active_state = State.initializing
        self._active_state_lock = threading.Lock()

    def initialize(self):
        """Initialize forchestrator instance

        Raises an Exception when FAUCET_CONFIG_DIR or PROMETHEUS_PORT is not
        set, or when the faucet config file does not exist.
        """
        self._faucet_collector = FaucetStateCollector()
        self._local_collector = LocalStateCollector(
            self._config.get('process'), self.cleanup, self.handle_active_state)
        self._cpn_collector = CPNStateCollector()

        # Check the variable before joining: os.path.join(None, ...) raises an
        # opaque TypeError and the joined path itself can never be empty.
        faucet_config_dir = os.getenv('FAUCET_CONFIG_DIR')
        if not faucet_config_dir:
            raise Exception("FAUCET_CONFIG_DIR is not set")
        self._faucet_config_file = os.path.join(
            faucet_config_dir, _FAUCET_CONFIG_DEFAULT)
        if not os.path.exists(self._faucet_config_file):
            raise Exception(
                f"Faucet config file does not exist: {self._faucet_config_file}"
            )

        prom_port = os.getenv('PROMETHEUS_PORT')
        if not prom_port:
            raise Exception("PROMETHEUS_PORT is not set")
        prom_url = f"http://{_PROMETHEUS_HOST}:{prom_port}"
        self._varz_collector = VarzStateCollector(prom_url)

        LOGGER.info('Attaching event channel...')
        self._faucet_events = forch.faucet_event_client.FaucetEventClient(
            self._config.get('event_client', {}))
        self._local_collector.initialize()
        self._cpn_collector.initialize()
        LOGGER.info('Using peer controller %s', self._get_peer_controller_url())
        self._register_handlers()
        self._initialized = True

    def initialized(self):
        """If forch is initialized or not"""
        return self._initialized

    def _register_handlers(self):
        """Register per-event-type handlers with the faucet event client"""
        fcoll = self._faucet_collector
        self._faucet_events.register_handlers([
            (FaucetEvent.ConfigChange, self._process_config_change),
            (FaucetEvent.DpChange, lambda event: fcoll.process_dp_change(
                event.timestamp, event.dp_name, None,
                event.reason == "cold_start")),
            (FaucetEvent.LagChange, lambda event: fcoll.process_lag_state(
                event.timestamp, event.dp_name, event.port_no, event.state)),
            (FaucetEvent.StackState, lambda event: fcoll.process_stack_state(
                event.timestamp, event.dp_name, event.port, event.state)),
            (FaucetEvent.StackTopoChange,
             fcoll.process_stack_topo_change_event),
        ])

    def _restore_states(self):
        """Restore the faucet config and collector state from varz metrics"""
        # Make sure the event socket is connected so there's no loss of information. Ordering
        # is important here, need to connect the socket before scraping current state to avoid
        # loss of events inbetween.
        assert self._faucet_events.event_socket_connected, \
            'restore states without connection'
        metrics = self._varz_collector.get_metrics()

        # Restore config first before restoring all state from varz.
        varz_hash_info = metrics['faucet_config_hash_info']
        assert len(varz_hash_info.samples) == 1, \
            'exactly one config hash info not found'
        varz_config_hashes = varz_hash_info.samples[0].labels['hashes']
        self._restore_faucet_config(time.time(), varz_config_hashes)

        event_horizon = self._faucet_collector.restore_states_from_metrics(
            metrics)
        self._faucet_events.set_event_horizon(event_horizon)

    def _restore_faucet_config(self, timestamp, config_hash):
        """Re-read the faucet config file and push its dps to the collector.

        Asserts that the hashes of the file on disk equal config_hash.
        """
        config_info, faucet_dps, _ = self._get_faucet_config()
        assert config_hash == config_info['hashes'], \
            'config hash info does not match'
        self._faucet_collector.process_dataplane_config_change(
            timestamp, faucet_dps)

    def _process_config_change(self, event):
        """Handle a ConfigChange event"""
        self._faucet_collector.process_dp_config_change(
            event.timestamp, event.dp_name, event.restart_type, event.dp_id)
        # Only events that carry config hashes trigger a config re-read.
        if event.config_hash_info.hashes:
            self._restore_faucet_config(event.timestamp,
                                        event.config_hash_info.hashes)

    def _faucet_events_connect(self):
        """Make one attempt to connect the event socket and restore state.

        Failures are logged and recorded on the faucet collector rather than
        raised.
        """
        LOGGER.info('Attempting faucet event sock connection...')
        time.sleep(1)
        try:
            self._faucet_events.connect()
            self._restore_states()
            self._faucet_collector.set_state_restored(True)
        except Exception as e:
            LOGGER.error("Cannot restore states or connect to faucet",
                         exc_info=True)
            self._faucet_collector.set_state_restored(False, e)

    def main_loop(self):
        """Main event processing loop"""
        LOGGER.info('Entering main event loop...')
        try:
            while self._faucet_events:
                # Keep retrying the connection before reading the next event.
                while not self._faucet_events.event_socket_connected:
                    self._faucet_events_connect()
                self._process_faucet_event()
        except KeyboardInterrupt:
            LOGGER.info('Keyboard interrupt. Exiting.')
            self._faucet_events.disconnect()
        except Exception as e:
            LOGGER.error("Exception: %s", e)
            raise

    def _process_faucet_event(self):
        """Read one event (blocking) and handle it, logging the event on failure"""
        event = self._faucet_events.next_event(blocking=True)
        try:
            self._handle_faucet_event(event)
        except Exception:
            LOGGER.warning('While processing event %s', event)
            # Bare raise re-raises the active exception unchanged.
            raise

    def _handle_faucet_event(self, event):
        """Forward port-state and port-learn events to the faucet collector"""
        if not event:
            return

        timestamp = event.get("time")
        LOGGER.debug("Event: %r", event)

        (name, dpid, port, active) = self._faucet_events.as_port_state(event)
        if dpid and port:
            LOGGER.debug('Port state %s %s %s', name, port, active)
            self._faucet_collector.process_port_state(
                timestamp, name, port, active)

        (name, dpid, port, target_mac, src_ip) = \
            self._faucet_events.as_port_learn(event)
        if dpid and port:
            LOGGER.debug('Port learn %s %s %s', name, port, target_mac)
            self._faucet_collector.process_port_learn(
                timestamp, name, port, target_mac, src_ip)

    def _get_controller_info(self, target):
        """Return a (host, port) tuple for the named controller.

        An unknown target yields a 'missing_target_<target>' placeholder host
        with the default port.
        """
        controllers = self._config.get('site', {}).get('controllers', {})
        if target not in controllers:
            return (f'missing_target_{target}', _DEFAULT_PORT)
        # A controller may be listed with an empty/None value in the config.
        controller = controllers[target] or {}
        port = controller.get('port', _DEFAULT_PORT)
        host = controller.get('fqdn', target)
        return (host, port)

    def get_local_port(self):
        """Get the local port for this instance"""
        info = self._get_controller_info(self._get_controller_name())
        LOGGER.info('Local controller is at %s on %s', info[0], info[1])
        return int(info[1])

    def _make_controller_url(self, info):
        """Build an http URL from a (host, port) tuple"""
        return f'http://{info[0]}:{info[1]}'

    def _get_local_controller_url(self):
        """Return the URL of the controller named by CONTROLLER_NAME"""
        return self._make_controller_url(
            self._get_controller_info(self._get_controller_name()))

    def _get_peer_controller_name(self):
        """Return the name of the other controller in a two-controller site.

        A placeholder name describing the problem is returned when this
        controller is not configured or the site does not list exactly two
        controllers.
        """
        name = self._get_controller_name()
        controllers = self._config.get('site', {}).get('controllers', {})
        if name not in controllers:
            return f'missing_controller_name_{name}'
        if len(controllers) != 2:
            return f'num_controllers_{len(controllers)}'
        things = set(controllers.keys())
        things.remove(name)
        return list(things)[0]

    def _get_peer_controller_info(self):
        """Return (host, port) of the peer controller"""
        return self._get_controller_info(self._get_peer_controller_name())

    def _get_peer_controller_url(self):
        """Return the URL of the peer controller"""
        return self._make_controller_url(self._get_peer_controller_info())

    def _get_controller_name(self):
        """Return the CONTROLLER_NAME environment variable (None if unset)"""
        return os.getenv('CONTROLLER_NAME')

    def get_system_state(self, path, params):
        """Get an overview of the system state"""
        system_state = SystemState()
        self._populate_versions(system_state.versions)
        system_state.peer_controller_url = self._get_peer_controller_url()
        system_state.summary_sources.CopyFrom(self._get_system_summary(path))
        system_state.site_name = self._config.get('site', {}).get(
            'name', 'unknown')
        system_state.controller_name = self._get_controller_name()
        self._distill_summary(system_state.summary_sources, system_state)
        return system_state

    def _distill_summary(self, summaries, system_state):
        """Fill the overall system_state fields from the subsystem summaries.

        Any exception raised while combining is logged and reported as a
        broken state carrying the exception text.
        """
        try:
            start_time = self._start_time
            subsystems = [value for _, value in summaries.ListFields()]
            change_counts = [
                subsystem.change_count or 0 for subsystem in subsystems]
            last_changes = [
                subsystem.last_change or start_time for subsystem in subsystems]
            last_updates = [
                subsystem.last_update or start_time for subsystem in subsystems]
            summary, detail = self._get_combined_summary(summaries)
            system_state.system_state = summary
            system_state.system_state_detail = detail
            system_state.system_state_change_count = sum(change_counts)
            system_state.system_state_last_change = max(last_changes)
            system_state.system_state_last_update = max(last_updates)
        except Exception as e:
            system_state.system_state = State.broken
            system_state.system_state_detail = str(e)
            LOGGER.exception(e)

    def _get_combined_summary(self, summary):
        """Return a (state, detail) pair combining controller and subsystem states"""
        controller_state, controller_state_detail = \
            self._get_controller_state()
        # Anything other than an active controller is reported as-is.
        if controller_state != State.active:
            return controller_state, controller_state_detail

        has_error = False
        has_warning = False
        details = []
        for field, subsystem in summary.ListFields():
            state = subsystem.state
            if state in (State.down, State.broken):
                has_error = True
                details.append(field.name)
            elif state != State.healthy:
                has_warning = True
                details.append(field.name)

        if details:
            detail = 'broken subsystems: ' + ', '.join(details)
        else:
            detail = 'n/a'

        if not self._faucet_events.event_socket_connected:
            has_error = True
            detail += '. Faucet disconnected.'

        if has_error:
            return State.broken, detail
        if has_warning:
            return State.damaged, detail
        return State.healthy, detail

    def _get_system_summary(self, path):
        """Collect each subsystem summary and tag it with its detail URL"""
        states = SystemState.SummarySources()
        states.cpn_state.CopyFrom(self._cpn_collector.get_cpn_summary())
        states.process_state.CopyFrom(
            self._local_collector.get_process_summary())
        states.dataplane_state.CopyFrom(
            self._faucet_collector.get_dataplane_summary())
        states.switch_state.CopyFrom(
            self._faucet_collector.get_switch_summary())
        states.list_hosts.CopyFrom(self._faucet_collector.get_host_summary())
        url_base = self._extract_url_base(path)
        for field, value in states.ListFields():
            value.detail_url = f'{url_base}/?{field.name}'
        return states

    def _extract_url_base(self, path):
        """Return 'http://<host>' where host is the part of path before the first '/'"""
        # partition() keeps the whole path when it has no '/'; slicing with
        # find() dropped the last character in that case (find() returns -1).
        host = path.partition('/')[0]
        return f'http://{host}'

    def _augment_state_reply(self, reply, path):
        """Set system_state_url on a Message or dict reply and return it"""
        url = self._extract_url_base(path)
        if isinstance(reply, Message):
            reply.system_state_url = url
        else:
            reply['system_state_url'] = url
        return reply

    def _get_controller_state(self):
        """Return a (state, detail) pair describing this controller"""
        with self._active_state_lock:
            active_state = self._active_state
            if active_state == State.initializing:
                return State.initializing, 'Initializing'
            if active_state == State.inactive:
                detail = 'This controller is inactive. Please view peer controller.'
                return State.inactive, detail
            if active_state != State.active:
                return State.broken, 'Internal error'

        cpn_state = self._cpn_collector.get_cpn_state()
        peer_controller = self._get_peer_controller_name()

        if peer_controller in cpn_state.cpn_nodes:
            peer_controller_state = cpn_state.cpn_nodes[peer_controller].state
        else:
            LOGGER.error('Cannot get peer controller state for %s',
                         peer_controller)
            peer_controller_state = State.broken

        if cpn_state.cpn_state == State.initializing:
            return State.initializing, 'Initializing'

        if peer_controller_state != State.healthy:
            return State.split, 'Lost reachability to peer controller.'

        return State.active, None

    def _get_faucet_config_hash_info(self, new_conf_hashes):
        """Return comma-joined config file names and hashes, sorted by file name.

        Entries whose hash is None are left out.
        """
        # Code taken from faucet/valves_manager.py parse_configs.
        new_present_conf_hashes = [
            (conf_file, conf_hash)
            for conf_file, conf_hash in sorted(new_conf_hashes.items())
            if conf_hash is not None
        ]
        conf_files = [conf_file for conf_file, _ in new_present_conf_hashes]
        conf_hashes = [conf_hash for _, conf_hash in new_present_conf_hashes]
        return dict(config_files=','.join(conf_files),
                    hashes=','.join(conf_hashes))

    def _get_faucet_config(self):
        """Parse the faucet config file.

        Returns a (config_hash_info, dps, top_conf) tuple; parse errors are
        logged and re-raised.
        """
        try:
            (new_conf_hashes, _, new_dps, top_conf) = config_parser.dp_parser(
                self._faucet_config_file, 'fconfig')
            config_hash_info = self._get_faucet_config_hash_info(
                new_conf_hashes)
            return config_hash_info, new_dps, top_conf
        except Exception as e:
            LOGGER.error('Cannot read faucet config: %s', e)
            raise

    def _populate_versions(self, versions):
        """Fill in the forch and faucet versions on the given message"""
        versions.forch = __version__
        try:
            # The context manager closes the pipe that os.popen() opens.
            with os.popen('faucet --version') as pipe:
                versions.faucet = pipe.read().strip().split()[1]
        except Exception as e:
            versions.faucet = f'Cannot get faucet version: {e}'

    def cleanup(self):
        """Clean up relevant internal data in all collectors"""
        self._faucet_collector.cleanup()

    def handle_active_state(self, active_state):
        """Handler for local state collector to handle controller active state"""
        with self._active_state_lock:
            self._active_state = active_state
        self._faucet_collector.set_active(active_state)

    def get_switch_state(self, path, params):
        """Get the state of the switches"""
        switch = params.get('switch')
        port = params.get('port')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_switch_state(switch, port, host)
        return self._augment_state_reply(reply, path)

    def get_dataplane_state(self, path, params):
        """Get the dataplane state overview"""
        reply = self._faucet_collector.get_dataplane_state()
        return self._augment_state_reply(reply, path)

    def get_host_path(self, path, params):
        """Get active host path"""
        eth_src = params.get('eth_src')
        eth_dst = params.get('eth_dst')
        to_egress = params.get('to_egress') == 'true'
        reply = self._faucet_collector.get_host_path(eth_src, eth_dst, to_egress)
        return self._augment_state_reply(reply, path)

    def get_list_hosts(self, path, params):
        """List learned access devices"""
        eth_src = params.get('eth_src')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_list_hosts(host, eth_src)
        return self._augment_state_reply(reply, path)

    def get_cpn_state(self, path, params):
        """Get CPN state"""
        reply = self._cpn_collector.get_cpn_state()
        return self._augment_state_reply(reply, path)

    def get_process_state(self, path, params):
        """Get certain processes state on the controller machine"""
        reply = self._local_collector.get_process_state()
        return self._augment_state_reply(reply, path)

    def get_sys_config(self, path, params):
        """Get overall config from faucet config file"""
        try:
            _, _, faucet_config = self._get_faucet_config()
            reply = {'faucet': faucet_config}
            return self._augment_state_reply(reply, path)
        except Exception as e:
            return f"Cannot read faucet config: {e}"
class Forchestrator:
    """Main class encompassing faucet orchestrator components for dynamically
    controlling faucet ACLs at runtime"""

    _DETAIL_FORMAT = '%s is %s: %s'

    def __init__(self, config):
        """Keep the config mapping; collectors are created in initialize()"""
        self._config = config
        self._faucet_config_file = None
        self._faucet_events = None
        self._start_time = datetime.fromtimestamp(time.time()).isoformat()
        self._faucet_collector = None
        self._varz_collector = None
        self._local_collector = None
        self._cpn_collector = None
        self._initialized = False
        self._is_active = False
        self._active_state_lock = threading.Lock()
        self._event_horizon = 0

    def initialize(self):
        """Initialize forchestrator instance

        Raises an Exception when FAUCET_CONFIG_DIR or PROMETHEUS_PORT is not
        set, or when the faucet config file does not exist.
        """
        self._faucet_collector = FaucetStateCollector()
        self._local_collector = LocalStateCollector(
            self._config.get('process'), self.cleanup, self.handle_active_state)
        self._cpn_collector = CPNStateCollector()

        # Check the variable before joining: os.path.join(None, ...) raises an
        # opaque TypeError and the joined path itself can never be empty.
        faucet_config_dir = os.getenv('FAUCET_CONFIG_DIR')
        if not faucet_config_dir:
            raise Exception("FAUCET_CONFIG_DIR is not set")
        self._faucet_config_file = os.path.join(
            faucet_config_dir, _FAUCET_CONFIG_DEFAULT)
        if not os.path.exists(self._faucet_config_file):
            raise Exception(
                f"Faucet config file does not exist: {self._faucet_config_file}"
            )

        prom_port = os.getenv('PROMETHEUS_PORT')
        if not prom_port:
            raise Exception("PROMETHEUS_PORT is not set")
        prom_url = f"http://{_PROMETHEUS_HOST}:{prom_port}"
        self._varz_collector = VarzStateCollector(prom_url)

        LOGGER.info('Attaching event channel...')
        self._faucet_events = forch.faucet_event_client.FaucetEventClient(
            self._config.get('event_client', {}))
        self._local_collector.initialize()
        self._cpn_collector.initialize()
        LOGGER.info('Using peer controller %s', self._get_peer_controller_url())
        self._initialized = True

    def initialized(self):
        """If forch is initialized or not"""
        return self._initialized

    def _restore_states(self):
        """Restore collector state from varz metrics and the faucet config file"""
        # Make sure the event socket is connected so there's no loss of information.
        assert self._faucet_events.event_socket_connected, \
            'restore states without connection'
        metrics = self._varz_collector.get_metrics()
        self._event_horizon = \
            self._faucet_collector.restore_states_from_metrics(metrics)
        LOGGER.info('Setting event horizon to event #%d', self._event_horizon)
        current_time = time.time()
        faucet_config = self._get_faucet_config()
        self._faucet_collector.process_dataplane_config_change(
            current_time, faucet_config.get('dps', {}))

    def main_loop(self):
        """Main event processing loop"""
        LOGGER.info('Entering main event loop...')
        try:
            while self._handle_faucet_events():
                # Reconnect and restore state, retrying once per second,
                # before going back to reading events.
                while not self._faucet_events.event_socket_connected:
                    LOGGER.info('Attempting faucet event sock connection...')
                    time.sleep(1)
                    try:
                        self._faucet_events.connect()
                        self._restore_states()
                        self._faucet_collector.set_state_restored(True)
                    except Exception as e:
                        LOGGER.error(
                            "Cannot restore states or connect to faucet: %s", e)
                        self._faucet_collector.set_state_restored(False, e)
        except KeyboardInterrupt:
            LOGGER.info('Keyboard interrupt. Exiting.')
            self._faucet_events.disconnect()
        except Exception as e:
            LOGGER.error("Exception: %s", e)
            raise

    # TODO: This should likely be moved into the faucet_state_collector.
    # pylint: disable=too-many-locals
    def _handle_faucet_events(self):
        """Read and handle events until an empty one is received.

        Returns True when an empty event is received and False when there is
        no event client to read from.
        """
        while self._faucet_events:
            event = self._faucet_events.next_event(blocking=True)
            if not event:
                return True
            try:
                self._handle_faucet_event(event)
            except Exception:
                LOGGER.warning('While processing event %s', event)
                # Bare raise re-raises the active exception unchanged.
                raise
        return False

    def _handle_faucet_event(self, event):
        """Decode one faucet event and forward it to the faucet collector"""
        # TODO: Move this down into some other class so 'event_id' isn't exposed in forchestrator.
        event_id = int(event.get('event_id'))
        if event_id < self._event_horizon:
            # Log the converted id: '%d' cannot format a string event_id.
            LOGGER.debug('Outdated faucet event #%d', event_id)
            # TODO: Actually flush event (no-op) when varz sufficient.

        timestamp = event.get("time")
        LOGGER.debug("Event: %r", event)

        # Every as_*() decoder is tried on every event; the guards below
        # decide whether the decoded tuple is acted upon.
        (name, dpid, port, active) = self._faucet_events.as_port_state(event)
        if dpid and port:
            LOGGER.debug('Port state %s %s %s', name, port, active)
            self._faucet_collector.process_port_state(
                timestamp, name, port, active)

        (name, dpid, port, target_mac, src_ip) = \
            self._faucet_events.as_port_learn(event)
        if dpid and port:
            LOGGER.debug('Port learn %s %s %s', name, port, target_mac)
            self._faucet_collector.process_port_learn(
                timestamp, name, port, target_mac, src_ip)

        (name, dpid, restart_type, dps_config) = \
            self._faucet_events.as_config_change(event)
        if dpid is not None:
            LOGGER.debug('DP restart %s %s', name, restart_type)
            self._faucet_collector.process_dp_config_change(
                timestamp, name, restart_type, dpid)
        if dps_config:
            LOGGER.debug('Config change. New config: %s', dps_config)
            self._faucet_collector.process_dataplane_config_change(
                timestamp, dps_config)

        (stack_root, graph, dps) = \
            self._faucet_events.as_stack_topo_change(event)
        if stack_root is not None:
            LOGGER.debug('stack dataplane_state change root:%s', stack_root)
            self._faucet_collector.process_stack_topo_change(
                timestamp, stack_root, graph, dps)

        (name, port, state) = self._faucet_events.as_stack_state(event)
        if name is not None:
            LOGGER.debug('stack stack_state change: %s:%d, %d',
                         name, port, state)
            self._faucet_collector.process_stack_state(
                timestamp, name, port, state)

        (name, port, state) = self._faucet_events.as_lag_state(event)
        if name and port:
            LOGGER.debug('LAG state %s %s %s', name, port, state)
            self._faucet_collector.process_lag_state(
                timestamp, name, port, state)

        (name, connected) = self._faucet_events.as_dp_change(event)
        if name:
            LOGGER.debug('DP %s connected %r', name, connected)
            self._faucet_collector.process_dp_change(
                timestamp, name, None, connected)

    def _get_controller_info(self, target):
        """Return a (host, port) tuple for the named controller.

        An unknown target yields a 'missing_target_<target>' placeholder host
        with the default port.
        """
        controllers = self._config.get('site', {}).get('controllers', {})
        if target not in controllers:
            return (f'missing_target_{target}', _DEFAULT_PORT)
        # A controller may be listed with an empty/None value in the config.
        controller = controllers[target] or {}
        port = controller.get('port', _DEFAULT_PORT)
        host = controller.get('fqdn', target)
        return (host, port)

    def get_local_port(self):
        """Get the local port for this instance"""
        info = self._get_controller_info(self._get_controller_name())
        LOGGER.info('Local controller is at %s on %s', info[0], info[1])
        return int(info[1])

    def _make_controller_url(self, info):
        """Build an http URL from a (host, port) tuple"""
        return f'http://{info[0]}:{info[1]}'

    def _get_local_controller_url(self):
        """Return the URL of the controller named by CONTROLLER_NAME"""
        return self._make_controller_url(
            self._get_controller_info(self._get_controller_name()))

    def _get_peer_controller_name(self):
        """Return the name of the other controller in a two-controller site.

        A placeholder name describing the problem is returned when this
        controller is not configured or the site does not list exactly two
        controllers.
        """
        name = self._get_controller_name()
        controllers = self._config.get('site', {}).get('controllers', {})
        # Callers use the result as a controller *name* (a lookup key and a
        # log argument), so the error paths return plain strings, not
        # (host, port) tuples.
        if name not in controllers:
            return f'missing_controller_name_{name}'
        if len(controllers) != 2:
            return f'num_controllers_{len(controllers)}'
        things = set(controllers.keys())
        things.remove(name)
        return list(things)[0]

    def _get_peer_controller_info(self):
        """Return (host, port) of the peer controller"""
        return self._get_controller_info(self._get_peer_controller_name())

    def _get_peer_controller_url(self):
        """Return the URL of the peer controller"""
        return self._make_controller_url(self._get_peer_controller_info())

    def _get_controller_name(self):
        """Return the CONTROLLER_NAME environment variable (None if unset)"""
        return os.getenv('CONTROLLER_NAME')

    def get_system_state(self, path, params):
        """Get an overview of the system state"""
        system_summary = self._get_system_summary(path)
        overview = {
            'peer_controller_url': self._get_peer_controller_url(),
            'summary_sources': system_summary,
            'site_name': self._config.get('site', {}).get('name', 'unknown'),
            'controller_name': self._get_controller_name(),
        }
        overview.update(self._distill_summary(system_summary))
        return overview

    def _distill_summary(self, summaries):
        """Fold the per-subsystem summaries into overall system_state fields.

        Any exception raised while combining is logged and reported as an
        'error' system state carrying the exception text.
        """
        try:
            start_time = self._start_time
            subsystems = list(summaries.values())
            change_counts = [
                subsystem.get('change_count') or 0 for subsystem in subsystems]
            last_changes = [
                subsystem.get('last_change') or start_time
                for subsystem in subsystems]
            last_updates = [
                subsystem.get('last_update') or start_time
                for subsystem in subsystems]
            summary, detail = self._get_combined_summary(summaries)
            system_summary = {
                'system_state': summary,
                'system_state_detail': detail,
                'system_state_change_count': sum(change_counts),
                'system_state_last_change': max(last_changes),
                'system_state_last_update': max(last_updates)
            }
        except Exception as e:
            system_summary = {
                'system_state': 'error',
                'system_state_detail': str(e)
            }
            LOGGER.exception('Calculating state summary')
        return system_summary

    def _get_combined_summary(self, summary):
        """Return a (state, detail) pair combining controller and subsystem states"""
        controller_state, controller_state_detail = \
            self._get_controller_state()
        # Anything other than an active controller is reported as-is.
        if controller_state != constants.STATE_ACTIVE:
            return controller_state, controller_state_detail

        has_error = False
        has_warning = False
        details = []
        for subsystem_name, subsystem in summary.items():
            # A subsystem that reports no state at all is treated as broken.
            state = subsystem.get('state', constants.STATE_BROKEN)
            if state in (constants.STATE_DOWN, constants.STATE_BROKEN):
                has_error = True
                details.append(subsystem_name)
            elif state != constants.STATE_HEALTHY:
                has_warning = True
                details.append(subsystem_name)

        if details:
            detail = 'broken subsystems: ' + ', '.join(details)
        else:
            detail = 'n/a'

        if not self._faucet_events.event_socket_connected:
            has_error = True
            detail += '. Faucet disconnected.'

        if has_error:
            return constants.STATE_BROKEN, detail
        if has_warning:
            return constants.STATE_DAMAGED, detail
        return constants.STATE_HEALTHY, detail

    def _get_system_summary(self, path):
        """Collect each subsystem summary and tag it with its detail URL"""
        states = {
            'cpn_state': proto_dict(self._cpn_collector.get_cpn_summary()),
            'process_state': self._local_collector.get_process_summary(),
            'dataplane_state': self._faucet_collector.get_dataplane_summary(),
            'switch_state': self._faucet_collector.get_switch_summary(),
            'list_hosts': self._faucet_collector.get_host_summary()
        }
        url_base = self._extract_url_base(path)
        for state, summary in states.items():
            summary['url'] = f'{url_base}/?{state}'
        return states

    def _extract_url_base(self, path):
        """Return 'http://<host>' where host is the part of path before the first '/'"""
        # partition() keeps the whole path when it has no '/'; slicing with
        # find() dropped the last character in that case (find() returns -1).
        host = path.partition('/')[0]
        return f'http://{host}'

    def _augment_state_reply(self, reply, path):
        """Set system_state_url on a Message or dict reply in place"""
        url = self._extract_url_base(path)
        if isinstance(reply, Message):
            reply.system_state_url = url
        else:
            reply['system_state_url'] = url

    def _get_controller_state(self):
        """Return a (state, detail) pair describing this controller"""
        with self._active_state_lock:
            if not self._is_active:
                detail = 'This controller is inactive. Please view peer controller.'
                return constants.STATE_INACTIVE, detail

        cpn_state = self._cpn_collector.get_cpn_state()
        peer_controller = self._get_peer_controller_name()
        # NOTE(review): proto_dict() presumably omits empty fields, leaving
        # 'cpn_nodes' absent -- fall back to an empty mapping; confirm.
        cpn_nodes = proto_dict(cpn_state).get('cpn_nodes') or {}
        peer_controller_state = cpn_nodes.get(peer_controller, {}).get('state')

        if not peer_controller_state:
            LOGGER.error('Cannot get peer controller state: %s',
                         peer_controller)

        if cpn_state.cpn_state == constants.STATE_INITIALIZING:
            detail = 'Initializing'
            return constants.STATE_INITIALIZING, detail

        if peer_controller_state != constants.STATE_HEALTHY:
            detail = 'Lost reachability to peer controller.'
            return constants.STATE_SPLIT, detail

        return constants.STATE_ACTIVE, ''

    def _get_faucet_config(self):
        """Load the faucet config file as YAML; errors are logged and re-raised"""
        try:
            with open(self._faucet_config_file) as config_file:
                return yaml.safe_load(config_file)
        except Exception as e:
            LOGGER.error("Cannot read faucet config: %s", e)
            raise

    def cleanup(self):
        """Clean up relevant internal data in all collectors"""
        self._faucet_collector.cleanup()

    def handle_active_state(self, is_master):
        """Handler for local state collector to handle vrrp state"""
        with self._active_state_lock:
            self._is_active = is_master
        self._faucet_collector.set_active(is_master)

    def get_switch_state(self, path, params):
        """Get the state of the switches"""
        switch = params.get('switch')
        port = params.get('port')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_switch_state(switch, port, host)
        self._augment_state_reply(reply, path)
        return reply

    def get_dataplane_state(self, path, params):
        """Get the dataplane state overview"""
        reply = self._faucet_collector.get_dataplane_state()
        self._augment_state_reply(reply, path)
        return reply

    def get_host_path(self, path, params):
        """Get active host path"""
        eth_src = params.get('eth_src')
        eth_dst = params.get('eth_dst')
        to_egress = params.get('to_egress') == 'true'
        reply = self._faucet_collector.get_host_path(eth_src, eth_dst, to_egress)
        self._augment_state_reply(reply, path)
        return reply

    def get_list_hosts(self, path, params):
        """List learned access devices"""
        eth_src = params.get('eth_src')
        host = self._extract_url_base(path)
        reply = self._faucet_collector.get_list_hosts(host, eth_src)
        self._augment_state_reply(reply, path)
        return reply

    def get_cpn_state(self, path, params):
        """Get CPN state"""
        reply = self._cpn_collector.get_cpn_state()
        self._augment_state_reply(reply, path)
        return reply

    def get_process_state(self, path, params):
        """Get certain processes state on the controller machine"""
        reply = self._local_collector.get_process_state()
        self._augment_state_reply(reply, path)
        return reply

    def get_faucet_config(self, path, params):
        """Get faucet config from faucet config file"""
        try:
            reply = self._get_faucet_config()
            self._augment_state_reply(reply, path)
            return reply
        except Exception as e:
            return f"Cannot read faucet config: {e}"