def main():
    """Bootstrap hax and run its server in the calling thread.

    The function configures logging, creates the work planner, the Consul
    helper, the FFI bridge and the Motr wrapper, spawns the consumer
    threads, starts Motr and the auxiliary threads, and finally runs the
    server. Any ``Exception`` raised while starting Motr, spawning the
    auxiliary threads or running the server is logged instead of being
    propagated; ``motr.fini()`` is invoked in either case.
    """
    # Logging has to be configured before the very first log record is
    # produced, otherwise the log configuration will not apply.
    setup_logging()

    # [KN] Elements get queued when
    # 1. A callback is invoked from ha_link (this happens in a motr
    #    thread which must be freed ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python threads created by
    # the _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()
    util: ConsulUtil = ConsulUtil()
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)

    LOG.info('Welcome to HaX')
    banner = (f'Setting up ha_link interface with the options as follows: '
              f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
              f'HA fid = {cfg.ha_fid}')
    LOG.info(banner)

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # The consumer threads must be running before motr.start(..) is
    # invoked: the hax process will send an entrypoint request and somebody
    # needs to reply to it.
    # TODO make the number of threads configurable
    consumer_threads = []
    for idx in range(4):
        consumer_threads.append(
            _run_qconsumer_thread(planner, motr, herald, util, idx))

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')

        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))

        # Every thread the server is asked to wait for.
        workers = [
            *consumer_threads, stats_updater, rconfc_starter, event_poller
        ]

        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        server = ServerRunner(planner, herald, consul_util=util)
        server.run(threads_to_wait=workers)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
def main():
    """Bootstrap the queue-based hax and run its server in this thread.

    The function configures logging, creates the bounded message queue,
    the Consul helper, the FFI bridge and the Motr wrapper, spawns the
    consumer thread, starts Motr and the stats updater, and finally runs
    the server. Any ``Exception`` raised while starting Motr, spawning the
    stats updater or running the server is logged instead of being
    propagated; ``motr.close()`` is invoked in either case.
    """
    # Logging has to be configured before the very first log record is
    # produced, otherwise the log configuration will not apply.
    _setup_logging()

    # [KN] Elements appear in the queue when
    # 1. A callback is invoked from ha_link (this happens in a motr
    #    thread which must be freed ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python thread created by
    # the _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    work_queue: Queue = Queue(maxsize=8)
    util: ConsulUtil = ConsulUtil()
    cfg = _get_motr_fids(util)

    LOG.info('Welcome to HaX')
    banner = (f'Setting up ha_link interface with the options as follows: '
              f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
              f'HA fid = {cfg.ha_fid}, RM fid = {cfg.rm_fid}')
    LOG.info(banner)

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(queue=work_queue,
                rm_fid=cfg.rm_fid,
                ffi=ffi,
                herald=herald,
                consul_util=util)

    # The consumer thread must be running before motr.start(..) is invoked:
    # the hax process will send an entrypoint request and somebody needs to
    # reply to it.
    consumer = _run_qconsumer_thread(work_queue, motr, herald)

    try:
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   rm_service=cfg.rm_fid)
        LOG.info('Motr API has been started')

        stats_updater = _run_stats_updater_thread(motr, consul_util=util)

        # Every thread the server is asked to wait for.
        workers = [consumer, stats_updater]

        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        run_server(work_queue,
                   herald,
                   consul_util=util,
                   threads_to_wait=workers)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.close()
def main():
    """Bootstrap hax with dependency injection and run its server.

    The function configures logging, locale and the injector, installs a
    SIGINT handler, creates the work planner, the Consul helper, the FFI
    bridge and the Motr wrapper, spawns the consumer threads, starts Motr
    and the auxiliary threads, and finally runs the server on the HTTP port
    obtained from Consul. Any ``Exception`` raised while starting Motr,
    spawning the auxiliary threads or running the server is logged instead
    of being propagated; ``motr.fini()`` is invoked in either case.
    """
    # Logging has to be configured before the very first log record is
    # produced, otherwise the log configuration will not apply.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)

    state = inject.instance(HaxGlobalState)

    # [KN] Elements appear in the work planner when
    # 1. A callback is invoked from ha_link (this happens in a motr
    #    thread which must be freed ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python threads created by
    # the _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    def _on_sigint(signum, frame):
        # Flag the global state as stopping and shut the planner down.
        state.set_stopping()
        planner.shutdown()

    # The handler is installed this early to allow hax to exit if Consul is
    # not available (otherwise _get_motr_fids() may be retrying forever even
    # if the hax process needs to shut down).
    signal.signal(signal.SIGINT, _on_sigint)

    util: ConsulUtil = ConsulUtil()
    # Avoid removing session on hax start as this will happen
    # on every node, thus leader election will keep re-triggering
    # until the final hax node starts, this will delay further
    # bootstrapping operations.
    # NOTE(review): the comment above argues against removing the session
    # on start, yet the call below still does it -- confirm which of the
    # two reflects the intended behaviour.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    hax_http_port = util.get_hax_http_port()
    util.init_motr_processes_status()

    LOG.info('Welcome to HaX')
    banner = (f'Setting up ha_link interface with the options as follows: '
              f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
              f'HA fid = {cfg.ha_fid}')
    LOG.info(banner)

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # The consumer threads must be running before motr.start(..) is
    # invoked: the hax process will send an entrypoint request and somebody
    # needs to reply to it.
    # TODO make the number of threads configurable
    consumer_threads = []
    for idx in range(32):
        consumer_threads.append(
            _run_qconsumer_thread(planner, motr, herald, util, idx))

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')

        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))

        # Every thread the server is asked to wait for.
        workers = [
            *consumer_threads, stats_updater, bc_updater, rconfc_starter,
            event_poller
        ]

        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        server = ServerRunner(planner,
                              herald,
                              consul_util=util,
                              hax_state=state)
        server.run(threads_to_wait=workers, port=hax_http_port)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
def motr(mocker, ffi, planner, herald, consul_util) -> Motr:
    """Return a Motr instance built from the supplied collaborators.

    ``ffi``, ``planner``, ``herald`` and ``consul_util`` are forwarded
    positionally, in that order, to the ``Motr`` constructor. ``mocker`` is
    accepted but not used by the body.
    """
    # NOTE(review): presumably a pytest fixture whose decorator is outside
    # this view -- confirm.
    return Motr(ffi, planner, herald, consul_util)
def test_process_failure(self):
    """Verify how a process failure notification is propagated.

    Scenario:
    - the failed process hosts an ios service and a disk;
    - the stubbed Consul reports all IO services as failed;
    - the node, enclosure, controller, drive, process and service are all
      expected to show up as failed in the hax broadcast, and the matching
      ConsulUtil update functions are expected to be called.
    """
    consul_util = ConsulUtil()
    consul_cache = InvocationCache()
    ffi = Mock(spec=['init_motr_api'])
    motr = Motr(ffi, None, None, consul_util)

    # Static names and fids used by the scenario.
    node_name = 'testnode'
    hax_fid = Fid(0x7200000000000001, 0x6)
    site_fid = Fid(0x5300000000000001, 0x1)
    rack_fid = Fid(0x6100000000000001, 0x2)
    node_fid = Fid(0x6e00000000000001, 0x3)
    encl_fid = Fid(0x6500000000000001, 0x4)
    ctrl_fid = Fid(0x6300000000000001, 0x5)
    process_fid = Fid(0x7200000000000001, 0x15)
    service_fid = Fid(0x7300000000000001, 0xe)
    service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
    drive_fid = Fid(0x6b00000000000001, 0x11)
    ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
        site_fid, rack_fid, encl_fid, ctrl_fid)
    ctrl_state = '{"state": "M0_NC_FAILED"}'

    motr._is_mkfs = Mock(return_value=False)

    # Canned answers for the Consul calls needed by the code under test.
    # The last four entries are failure indications that trigger the code
    # paths specific to node failure. Additional tests can cover different
    # scenarios (e.g. drive failure but node still up) by returning
    # different results for these calls.
    canned_replies = {
        'get_hax_fid': hax_fid,
        'is_proc_client': False,
        'get_services_by_parent_process': [service_fid_typed],
        'get_disks_by_parent_process': [drive_fid],
        'get_process_node': node_name,
        'get_node_name_by_fid': node_name,
        'get_node_fid': node_fid,
        'get_node_encl_fid': encl_fid,
        'get_node_ctrl_fids': [ctrl_fid],
        'all_io_services_failed': True,
        'get_sdev_state': HaNoteStruct.M0_NC_FAILED,
        'get_ctrl_state': m0HaObjState.M0_NC_FAILED,
        'get_ctrl_state_updates': [PutKV(key=ctrl_path, value=ctrl_state)],
    }
    for method_name, reply in canned_replies.items():
        setattr(consul_util, method_name, Mock(return_value=reply))

    # Recording mocks used below to check that the expected updates happen.
    for method_name in ('update_drive_state', 'set_process_state',
                        'set_node_state', 'set_encl_state'):
        setattr(consul_util, method_name, Mock())
    motr._ha_broadcast = Mock()
    motr._write_updates = Mock()

    # Send the mock event.
    failure_event = HAState(fid=process_fid, status=ObjHealth.FAILED)
    motr.broadcast_ha_states([failure_event],
                             notify_devices=True,
                             broadcast_hax_only=False,
                             kv_cache=consul_cache)

    # ConsulUtil is responsible for the actual KV updates, so it is enough
    # to check that the appropriate util function is called for each
    # component.
    consul_util.update_drive_state.assert_called_with([drive_fid],
                                                      ObjHealth.OFFLINE,
                                                      device_event=False)
    consul_util.set_process_state.assert_called_with(process_fid,
                                                     ObjHealth.FAILED)
    consul_util.set_node_state.assert_called_with(node_fid,
                                                  ObjHealth.FAILED)
    consul_util.set_encl_state.assert_called_with(encl_fid,
                                                  ObjHealth.FAILED,
                                                  kv_cache=consul_cache)

    # This KV update is batched, so the check looks different.
    motr._write_updates.assert_any_call(
        [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

    # Check hax broadcast. We should see states updated to FAILED.
    broadcast_list = motr._ha_broadcast.call_args[0][0]
    for failed_fid in (node_fid, encl_fid, ctrl_fid, process_fid,
                       service_fid, drive_fid):
        self.assertTrue(_has_failed_note(broadcast_list, failed_fid))