def consul_util(mocker):
    consul = ConsulUtil()
    exc = RuntimeError('Not allowed')
    mock = mocker.patch.object
    mock(consul.kv, 'kv_get', side_effect=exc)
    mock(consul.kv, 'kv_put', side_effect=exc)
    mock(consul.kv, 'kv_put_in_transaction', side_effect=exc)
    mock(consul.kv, 'kv_delete_in_transaction', side_effect=exc)
    mock(consul.catalog, 'get_services', side_effect=exc)
    mock(consul.catalog, 'get_service_names', side_effect=exc)
    mock(consul, 'get_local_nodename', return_value='localhost')
    mock(consul, 'get_hax_hostname', return_value='localhost')
    mock(consul, 'get_hax_ip_address', return_value='192.168.0.28')
    return consul
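# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original source): assumes the
# function above is registered as a @pytest.fixture and that pytest-mock
# supplies the `mocker` fixture.
import pytest

def test_kv_access_is_blocked(consul_util):
    # Any direct KV call through the stubbed ConsulUtil fails loudly, so a
    # test exercising higher-level logic cannot silently reach Consul.
    with pytest.raises(RuntimeError):
        consul_util.kv.kv_get('leader')
    # The read-only helpers keep working and return the canned values.
    assert consul_util.get_hax_ip_address() == '192.168.0.28'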
def main():
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    _setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python thread created by the
    # _run_thread(ConsumerThread(..)) call.
    #
    # [KN] Note: The server is launched in the main thread.
    q = Queue(maxsize=8)
    util: ConsulUtil = ConsulUtil()
    cfg = _get_motr_fids(util)

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}, RM fid = {cfg.rm_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(queue=q, rm_fid=cfg.rm_fid, ffi=ffi, herald=herald)

    # Note that the consumer thread must be started before we invoke
    # motr.start(..): the hax process will send an entrypoint request and
    # somebody needs to reply to it.
    consumer = _run_thread(ConsumerThread(q, motr))
    try:
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   rm_service=cfg.rm_fid)
        LOG.info('Motr API has been started')
        service_monitor = _run_thread(ServiceMonitor(q))
        stats_updater = _run_thread(FsStatsUpdater(motr, interval_sec=30))

        # [KN] This is a blocking call. It will work until the program is
        # terminated by a signal.
        run_server(q,
                   herald,
                   threads_to_wait=[consumer, stats_updater, service_monitor])
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.close()
def _generate_sub_services(self,
                           note: HaNoteStruct,
                           cns: ConsulUtil,
                           notify_devices=True) -> List[HaNoteStruct]:
    new_state = note.no_state
    fid = Fid.from_struct(note.no_id)
    service_list = cns.get_services_by_parent_process(fid)
    LOG.debug('Process fid=%s encloses %s services as follows: %s', fid,
              len(service_list), service_list)
    service_notes = [
        HaNoteStruct(no_id=x.fid.to_c(), no_state=new_state)
        for x in service_list
    ]
    if notify_devices:
        service_notes += self._generate_sub_disks(note, service_list, cns)
    return service_notes
def run_server(
    queue: Queue,
    herald: DeliveryHerald,
    consul_util: ConsulUtil,
    threads_to_wait: List[StoppableThread] = [],
    port=8008,
):
    node_address = consul_util.get_hax_ip_address()

    # We can't bind to the broad 0.0.0.0 address: to make it possible to
    # run multiple hax instances on the same machine (e.g. in a failover
    # situation), every hax must use its private IP only.
    web_address = node_address

    # Note that the bq-delivered mechanism must use a unique node name
    # rather than the broad '0.0.0.0' that doesn't identify the node from
    # outside.
    inbox_filter = InboxFilter(
        OffsetStorage(node_address, key_prefix='bq-delivered'))

    conf_obj = ConfObjUtil(consul_util)

    app = web.Application(middlewares=[encode_exception])
    app.add_routes([
        web.get('/', hello_reply),
        web.post('/', process_ha_states(queue, consul_util)),
        web.post(
            '/watcher/bq',
            process_bq_update(inbox_filter,
                              BQProcessor(queue, herald, conf_obj))),
        web.post('/api/v1/sns/{operation}', process_sns_operation(queue)),
        web.get('/api/v1/sns/repair-status',
                get_sns_status(queue, SnsRepairStatus)),
        web.get('/api/v1/sns/rebalance-status',
                get_sns_status(queue, SnsRebalanceStatus)),
    ])
    LOG.info(f'Starting HTTP server at {web_address}:{port} ...')
    try:
        web.run_app(app, host=web_address, port=port)
        LOG.debug('Server stopped normally')
    finally:
        LOG.debug('Stopping the threads')
        for thread in threads_to_wait:
            thread.stop()
        for thread in threads_to_wait:
            thread.join()
        LOG.info('The HTTP server has stopped')
def prepare(args):
    url = args.config[0]
    utils = Utils(ConfStoreProvider(url))
    stop_event = Event()
    conf_dir = get_config_dir(url)
    log_dir = get_log_dir(url)
    _create_consul_namespace(conf_dir)
    consul_starter = _start_consul(utils, stop_event, conf_dir, log_dir, url)
    utils.save_config_path(url)
    utils.save_log_path()
    utils.save_node_facts()
    utils.save_drives_info()
    try:
        util: ConsulUtil = ConsulUtil()
        sess = util.get_leader_session_no_wait()
        util.destroy_session(sess)
    except Exception:
        logging.debug('No leader is elected yet')
    stop_consul_blocking(consul_starter)
def broadcast_ha_states(self, ha_states: List[HAState]) -> List[MessageId]:
    LOG.debug('Broadcasting HA states %s over ha_link', ha_states)
    cns = ConsulUtil()

    def ha_obj_state(st):
        return HaNoteStruct.M0_NC_ONLINE if st.status == ServiceHealth.OK \
            else HaNoteStruct.M0_NC_FAILED

    notes = []
    for st in ha_states:
        note = HaNoteStruct(st.fid.to_c(), ha_obj_state(st))
        notes.append(note)
        notes += self._generate_sub_services(note, cns)

    message_ids: List[MessageId] = self._ffi.ha_broadcast(
        self._ha_ctx, make_array(HaNoteStruct, notes), len(notes))
    LOG.debug(
        'Broadcast HA state complete with the following message_ids = %s',
        message_ids)
    return message_ids
def kv_cleanup():
    util: ConsulUtil = ConsulUtil()

    if is_cluster_running():
        logging.info('Cluster is running, shutting down')
        shutdown_cluster()

    keys: List[KeyDelete] = [
        KeyDelete(name='epoch', recurse=False),
        KeyDelete(name='eq-epoch', recurse=False),
        KeyDelete(name='last_fidk', recurse=False),
        KeyDelete(name='leader', recurse=False),
        KeyDelete(name='m0conf/', recurse=True),
        KeyDelete(name='processes/', recurse=True),
        KeyDelete(name='stats/', recurse=True)
    ]

    logging.info('Deleting Hare KV entries (%s)', keys)
    if not util.kv.kv_delete_in_transaction(keys):
        raise RuntimeError('Error during key delete in transaction')
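# ---------------------------------------------------------------------------
# Assumed shape of KeyDelete, for illustration only (the real definition
# lives in the Consul utility module): a key name plus a flag saying whether
# the delete applies recursively to the whole prefix.
from typing import NamedTuple

class KeyDelete(NamedTuple):
    name: str      # KV key, or a key prefix ending with '/'
    recurse: bool  # True deletes the whole subtree under `name`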
def init(args):
    # Initialize the starters up front so the except block below can't hit
    # a NameError if _start_consul() or _start_hax() itself fails.
    consul_starter = None
    hax_starter = None
    try:
        url = args.config[0]
        if not is_mkfs_required(url):
            return
        conf = ConfStoreProvider(url)
        utils = Utils(conf)
        cns_utils = ConsulUtil()
        stop_event = Event()
        config_dir = get_config_dir(url)
        log_dir = get_log_dir(url)
        # Start consul and hax.
        consul_starter = _start_consul(utils, stop_event, config_dir,
                                       log_dir, url)
        hax_starter = _start_hax(utils, stop_event, config_dir, log_dir)
        hostname = utils.get_local_hostname()
        # Clean up the old mkfs state.
        cleanup_mkfs_state(utils, cns_utils)
        start_mkfs_parallel(hostname, config_dir)
        # Update the mkfs state.
        set_mkfs_done_for(hostname, cns_utils)
        data_nodes = conf.get_hostnames_for_service(
            Const.SERVICE_MOTR_IO.value)
        # Wait for the other nodes to complete. This blocks.
        while not is_mkfs_done_on_all_nodes(utils, cns_utils, data_nodes):
            sleep(5)
        # Stop hax and consul.
        stop_hax_blocking(hax_starter)
        stop_consul_blocking(consul_starter)
    except Exception as error:
        if hax_starter:
            stop_hax_blocking(hax_starter)
        if consul_starter:
            stop_consul_blocking(consul_starter)
        raise RuntimeError(f'Error while initializing cluster: {error}')
def _start_consul(utils: Utils,
                  stop_event: Event,
                  hare_local_dir: str,
                  hare_log_dir: str,
                  url: str):
    log_dir = hare_log_dir
    data_dir = f'{hare_local_dir}/consul/data'
    config_dir = f'{hare_local_dir}/consul/config'

    provider = ConfStoreProvider(url)
    node_id = uuid.uuid4()
    consul_endpoints = provider.get('cortx>external>consul>endpoints')
    cns_utils: ConsulUtil = ConsulUtil()
    hostname = utils.get_local_hostname()

    # Strip the 'tcp://' scheme from the endpoint URLs to get 'host:port'
    # peers. Only tcp endpoints are considered; all others are ignored.
    peers = []
    for endpoint in consul_endpoints:
        key = endpoint.split('/')
        if key[0] != 'tcp:':
            continue
        peer = '/'.join(key[2:])
        peers.append(peer)

    bind_addr = socket.gethostbyname(hostname)
    consul_nodename = hostname + ':' + str(node_id)[:8]
    consul_starter = ConsulStarter(utils=utils,
                                   cns_utils=cns_utils,
                                   stop_event=stop_event,
                                   log_dir=log_dir,
                                   data_dir=data_dir,
                                   config_dir=config_dir,
                                   node_id=str(node_id),
                                   node_name=consul_nodename,
                                   peers=peers,
                                   bind_addr=bind_addr)
    consul_starter.start()
    save_consul_node_name(cns_utils, consul_nodename, hostname)
    return consul_starter
def test_process_failure(self):
    consul_util = ConsulUtil()
    consul_cache = InvocationCache()
    ffi = Mock(spec=['init_motr_api'])
    motr = Motr(ffi, None, None, consul_util)

    # Setup for the test: notification of a process failure.
    # - The failure here is an ios service and a disk.
    # - A dummy Consul reports all processes on the node as failed.
    # - Expect the node, enclosure, controller, drive, process, and
    #   service to all be marked as failed.
    #
    # Static names and fids for the setup are given here.
    node_name = 'testnode'
    hax_fid = Fid(0x7200000000000001, 0x6)
    site_fid = Fid(0x5300000000000001, 0x1)
    rack_fid = Fid(0x6100000000000001, 0x2)
    node_fid = Fid(0x6e00000000000001, 0x3)
    encl_fid = Fid(0x6500000000000001, 0x4)
    ctrl_fid = Fid(0x6300000000000001, 0x5)
    process_fid = Fid(0x7200000000000001, 0x15)
    service_fid = Fid(0x7300000000000001, 0xe)
    service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
    drive_fid = Fid(0x6b00000000000001, 0x11)
    ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
        site_fid, rack_fid, encl_fid, ctrl_fid)
    ctrl_state = '{"state": "M0_NC_FAILED"}'

    # Set mock return values for the necessary Consul calls.
    motr._is_mkfs = Mock(return_value=False)
    consul_util.get_hax_fid = Mock(return_value=hax_fid)
    consul_util.is_proc_client = Mock(return_value=False)
    consul_util.get_services_by_parent_process = Mock(
        return_value=[service_fid_typed])
    consul_util.get_disks_by_parent_process = Mock(
        return_value=[drive_fid])
    consul_util.get_process_node = Mock(return_value=node_name)
    consul_util.get_node_name_by_fid = Mock(return_value=node_name)
    consul_util.get_node_fid = Mock(return_value=node_fid)
    consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
    consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

    # These failure indications are here to trigger specific code paths for
    # node failure. Additional tests can cover different scenarios (e.g.
    # drive failure but node still up), which will set different results
    # for these calls.
    consul_util.all_io_services_failed = Mock(return_value=True)
    consul_util.get_sdev_state = Mock(
        return_value=HaNoteStruct.M0_NC_FAILED)
    consul_util.get_ctrl_state = Mock(
        return_value=m0HaObjState.M0_NC_FAILED)
    consul_util.get_ctrl_state_updates = Mock(
        return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

    # We'll use these mocks to check that the expected updates happen.
    consul_util.update_drive_state = Mock()
    consul_util.set_process_state = Mock()
    consul_util.set_node_state = Mock()
    consul_util.set_encl_state = Mock()
    motr._ha_broadcast = Mock()
    motr._write_updates = Mock()

    # Send the mock event.
    motr.broadcast_ha_states(
        [HAState(fid=process_fid, status=ObjHealth.FAILED)],
        notify_devices=True,
        broadcast_hax_only=False,
        kv_cache=consul_cache)

    # ConsulUtil is responsible for the actual KV updates; just check here
    # that the appropriate util function is called for each component.
    consul_util.update_drive_state.assert_called_with(
        [drive_fid], ObjHealth.OFFLINE, device_event=False)
    consul_util.set_process_state.assert_called_with(
        process_fid, ObjHealth.FAILED)
    consul_util.set_node_state.assert_called_with(
        node_fid, ObjHealth.FAILED)
    consul_util.set_encl_state.assert_called_with(
        encl_fid, ObjHealth.FAILED, kv_cache=consul_cache)
    # This KV update is batched, so the check looks different.
    motr._write_updates.assert_any_call(
        [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

    # Check the hax broadcast. We should see states updated to FAILED.
    broadcast_list = motr._ha_broadcast.call_args[0][0]
    self.assertTrue(_has_failed_note(broadcast_list, node_fid))
    self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, process_fid))
    self.assertTrue(_has_failed_note(broadcast_list, service_fid))
    self.assertTrue(_has_failed_note(broadcast_list, drive_fid))
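# ---------------------------------------------------------------------------
# Plausible sketch of the _has_failed_note helper referenced above (its
# definition is not shown in this excerpt). Field names follow the uses of
# HaNoteStruct elsewhere in this code.
def _has_failed_note(notes: List[HaNoteStruct], fid: Fid) -> bool:
    # True if some broadcast note carries `fid` in the M0_NC_FAILED state.
    return any(note.no_state == HaNoteStruct.M0_NC_FAILED
               and Fid.from_struct(note.no_id) == fid
               for note in notes)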
class ConsumerThread(StoppableThread):
    """
    The only Motr-aware thread in the whole of HaX.

    This thread pulls messages from the multithreaded Queue and treats them
    as commands. Each command describes what should be sent to Motr land.
    The thread exits gracefully when it receives a message of type Die
    (i.e. a 'poison pill').
    """
    def __init__(self, q: Queue, motr: Motr, herald: DeliveryHerald):
        super().__init__(target=self._do_work,
                         name='qconsumer',
                         args=(q, motr))
        self.is_stopped = False
        self.consul = ConsulUtil()
        self.eq_publisher = EQPublisher()
        self.herald = herald

    def stop(self) -> None:
        self.is_stopped = True

    @repeat_if_fails(wait_seconds=1)
    def _update_process_status(self, event: ConfHaProcess) -> None:
        # If a Consul-related exception appears, it will be processed by
        # repeat_if_fails.
        #
        # This thread will stay blocked until that intermittent error gets
        # resolved.
        self.consul.update_process_status(event)

    def update_process_failure(self, ha_states: List[HAState]) -> None:
        for state in ha_states:
            if state.status == ServiceHealth.FAILED:
                m0status = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                pevent = ConfHaProcess(chp_event=m0status,
                                       chp_type=3,
                                       chp_pid=0,
                                       fid=state.fid)
                self._update_process_status(pevent)

    def _do_work(self, q: Queue, motr: Motr):
        ffi = motr._ffi
        LOG.info('Handler thread has started')
        ffi.adopt_motr_thread()

        def pull_msg():
            try:
                return q.get(block=False)
            except Empty:
                return None

        try:
            while True:
                try:
                    LOG.debug('Waiting for the next message')

                    item = pull_msg()
                    while item is None:
                        time.sleep(0.2)
                        if self.is_stopped:
                            raise StopIteration()
                        item = pull_msg()

                    LOG.debug('Got %s message from queue', item)
                    if isinstance(item, FirstEntrypointRequest):
                        LOG.debug('first entrypoint request, broadcast FAILED')
                        ids: List[MessageId] = motr.broadcast_ha_states([
                            HAState(fid=item.process_fid,
                                    status=ServiceHealth.FAILED)
                        ])
                        LOG.debug('waiting for broadcast of %s for ep: %s',
                                  ids, item.remote_rpc_endpoint)
                        self.herald.wait_for_all(HaLinkMessagePromise(ids))
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying, any Exception is caught. In such a
                        # case the motr process will receive EAGAIN and
                        # hence will need to retry by itself.
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a Consul-related exception appears, it will be
                        # processed by repeat_if_fails.
                        #
                        # This thread will stay blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            item.states)
                        self.update_process_failure(item.states)
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)
                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    raise
                except Exception:
                    # No-op, swallow the exception.
                    LOG.exception('**ERROR**')
        except StopIteration:
            ffi.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
def __init__(self, consul_util: Optional[ConsulUtil]):
    self.consul = consul_util or ConsulUtil()
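# ---------------------------------------------------------------------------
# Usage sketch (illustrative): the `or`-fallback makes the dependency
# injectable. `SomeHandler` is a hypothetical stand-in for the class that
# owns the __init__ above, which is not shown in this excerpt.
from unittest.mock import Mock

handler = SomeHandler(Mock(spec=ConsulUtil))  # unit test: inject a stub
live_handler = SomeHandler(None)              # production: a real ConsulUtil is created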
def _get_motr_fids(util: ConsulUtil) -> HL_Fids:
    hax_ep: str = util.get_hax_endpoint()
    hax_fid: Fid = util.get_hax_fid()
    ha_fid: Fid = util.get_ha_fid()
    rm_fid: Fid = util.get_rm_fid()
    return HL_Fids(hax_ep, hax_fid, ha_fid, rm_fid)
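# ---------------------------------------------------------------------------
# Assumed shape of HL_Fids, for illustration only (the real definition lives
# elsewhere in the codebase; later revisions carry extra fields such as
# `profiles`, as used by the newer main() below).
from typing import NamedTuple

class HL_Fids(NamedTuple):
    hax_ep: str   # hax RPC endpoint address
    hax_fid: Fid  # fid of the hax process itself
    ha_fid: Fid   # fid of the HA service
    rm_fid: Fid   # fid of the RM (resource manager) service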
def get_hare_motr_s3_processes(utils: ConsulUtil) -> Dict[str, List[Fid]]:
    nodes = utils.catalog.get_node_names()
    processes: Dict[str, List[Fid]] = {}
    for node in nodes:
        processes[node] = utils.get_node_hare_motr_s3_fids(node)
    return processes
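# ---------------------------------------------------------------------------
# Illustrative usage: dump the Hare/Motr/S3 process fids per node.
for node, fids in get_hare_motr_s3_processes(ConsulUtil()).items():
    print(f'{node}: {[str(f) for f in fids]}')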
class FsStatsUpdater(StoppableThread):
    def __init__(self, motr: Motr, interval_sec=5):
        super().__init__(target=self._execute,
                         name='fs-stats-updater',
                         args=(motr, ))
        self.stopped = False
        self.consul = ConsulUtil()
        self.interval_sec = interval_sec
        self.event = Event()

    def stop(self) -> None:
        LOG.debug('Stop signal received')
        self.stopped = True
        self.event.set()

    def _sleep(self, interval_sec) -> None:
        interrupted = self.event.wait(timeout=interval_sec)
        if interrupted:
            raise InterruptedException()

    @log_exception
    def _execute(self, motr: Motr):
        try:
            ffi = motr._ffi
            LOG.info('Filesystem stats updater thread has started')
            ffi.adopt_motr_thread()
            self._ensure_motr_all_started()
            while not self.stopped:
                started = self._ioservices_running()
                if not all(started):
                    self._sleep(self.interval_sec)
                    continue
                result: int = motr.start_rconfc()
                if result == 0:
                    stats = motr.get_filesystem_stats()
                    motr.stop_rconfc()
                    if not stats:
                        continue
                    LOG.debug('FS stats are as follows: %s', stats)
                    now_time = datetime.datetime.now()
                    data = FsStatsWithTime(stats=stats,
                                           timestamp=now_time.timestamp(),
                                           date=now_time.isoformat())
                    try:
                        self.consul.update_fs_stats(data)
                    except HAConsistencyException:
                        LOG.debug('Failed to update Consul KV '
                                  'due to an intermittent error. The '
                                  'error is swallowed since new attempts '
                                  'will be made timely')
                self._sleep(self.interval_sec)
        except InterruptedException:
            # No-op. _sleep() was interrupted before the timeout expired:
            # the application is shutting down.
            # There are no resources that need special disposal.
            pass
        except Exception:
            LOG.exception('Aborting due to an error')
        finally:
            LOG.debug('Releasing motr-related resources for this thread')
            ffi.shun_motr_thread()
            LOG.debug('Filesystem stats updater thread exited')

    def _ioservices_running(self) -> List[bool]:
        statuses = self.consul.get_m0d_statuses()
        LOG.debug('The following statuses received: %s', statuses)
        started = ['M0_CONF_HA_PROCESS_STARTED' == v[1] for v in statuses]
        return started

    def _ensure_motr_all_started(self):
        while True:
            started = self._ioservices_running()
            if all(started):
                LOG.debug('According to Consul all confds have been started')
                return
            self._sleep(5)
def _update_process_status(self, p: WorkPlanner, motr: Motr,
                           event: ConfHaProcess) -> None:
    LOG.info('Updating process status: %s', event.fid)
    # If a Consul-related exception appears, it will be processed by
    # repeat_if_fails.
    #
    # This thread will stay blocked until that intermittent error gets
    # resolved.
    motr_to_svc_status = {
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): ObjHealth.OK,
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): ObjHealth.FAILED,
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): ObjHealth.OK,
        (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): ObjHealth.FAILED,
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): ObjHealth.OK,
        (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
         m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): ObjHealth.FAILED
    }
    if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                           m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
        svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
        broadcast_hax_only = False
        if ((event.chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS)
                or (event.fid == self.consul.get_hax_fid())):
            # Motr-mkfs processes do not require updates on their peer mkfs
            # processes. Motr-mkfs is an independent and typically a
            # one-time operation. So avoid broadcasting a motr-mkfs state
            # to the peer motr-mkfs processes, but hax still needs to be
            # notified in order to disconnect the hax-motr halink when the
            # motr-mkfs process stops.
            broadcast_hax_only = True

        LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                  broadcast_hax_only)
        motr.broadcast_ha_states(
            [HAState(fid=event.fid, status=svc_status)],
            broadcast_hax_only=broadcast_hax_only)
    self.consul.update_process_status(event)

    # If we receive M0_CONF_HA_PROCESS_STARTED for an M0D process, check
    # whether all the M0D processes on the local node have started. If yes,
    # send a node-online event to the MessageBus.
    if event.chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED:
        try:
            util: ConsulUtil = ConsulUtil()
            producer = get_producer(util)
            if producer:
                producer.check_and_send(parent_resource_type=ObjT.NODE,
                                        fid=event.fid,
                                        resource_status='online')
            else:
                LOG.warning('Could not send an event as the producer '
                            'is not available')
        except Exception as e:
            LOG.warning("Send event failed due to '%s'", e)
def main():
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)

    state = inject.instance(HaxGlobalState)

    # [KN] The elements in the work planner will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python threads created by the
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    def handle_signal(sig, frame):
        state.set_stopping()
        planner.shutdown()

    # This is necessary to allow hax to exit early if Consul is not
    # available (otherwise _get_motr_fids() may keep retrying forever even
    # if the hax process needs to shut down).
    signal.signal(signal.SIGINT, handle_signal)

    util: ConsulUtil = ConsulUtil()
    # Avoid removing the session on hax start, as this would happen on
    # every node; leader election would keep re-triggering until the final
    # hax node starts, delaying further bootstrapping operations.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    hax_http_port = util.get_hax_http_port()
    util.init_motr_processes_status()

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that the consumer threads must be started before we invoke
    # motr.start(..): the hax process will send an entrypoint request and
    # somebody needs to reply to it.
    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i)
        for i in range(32)
    ]
    try:
        # [KN] We use just the first profile for the Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)

        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] This is a blocking call. It will work until the program is
        # terminated by a signal.
        server = ServerRunner(planner,
                              herald,
                              consul_util=util,
                              hax_state=state)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, bc_updater, rconfc_starter,
            event_poller
        ],
                   port=hax_http_port)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
class ServiceMonitor(StoppableThread):
    """
    The service monitoring thread.

    This thread polls the service health status from Consul via the Health
    API and broadcasts the states to Motr land.
    """
    def __init__(self, queue: Queue, interval_sec=1):
        """
        Constructor.

        queue - the multithreaded blocking queue to which BroadcastHAStates
            messages are sent (assuming that the queue is being read out by
            ConsumerThread).
        interval_sec - float value, the delay between polling iterations.
        """
        super().__init__(target=self._execute, name='service-monitor')
        self.stopped = False
        self.consul = ConsulUtil()
        self.interval_sec = interval_sec
        self.event = Event()
        self.q = queue

    def stop(self) -> None:
        """Stop the thread."""
        LOG.debug('Stop signal received')
        self.stopped = True
        self.event.set()

    def _sleep(self, interval_sec) -> bool:
        interrupted = self.event.wait(timeout=interval_sec)
        return interrupted

    def _get_services(self) -> List[str]:
        services = self.consul.catalog_service_names()
        excluded = {'consul'}
        return [s for s in services if s not in excluded]

    def _broadcast(self, state_list: List[HAState]) -> None:
        if not state_list:
            return
        LOG.debug('Changes in statuses: %s', state_list)
        self.q.put(BroadcastHAStates(states=state_list, reply_to=None))

    def _execute(self):
        service_names: List[str] = self._get_services()
        LOG.debug('The following services will be monitored: %s',
                  service_names)
        known_statuses: Dict[str, ServiceHealth] = {
            service: ServiceHealth.UNKNOWN
            for service in service_names
        }
        try:
            while not self.stopped:
                try:
                    delta: List[HAState] = []
                    for name in service_names:
                        health: HAState = \
                            self.consul.get_local_service_health(name)
                        if health.status != known_statuses[name]:
                            delta.append(health)
                            known_statuses[name] = health.status
                            LOG.debug('%s is now %s', name, health.status)
                    self._broadcast(delta)
                except HAConsistencyException:
                    # No action - we'll just try again at the next iteration.
                    pass
                self._sleep(self.interval_sec)
        except Exception:
            LOG.exception('Aborting due to an error')
        finally:
            LOG.debug('Thread exited')