def test_nonmkfs_process_stop_causes_drive_offline(mocker, motr, consul_util):
    mocker.patch.object(consul_util.kv, 'kv_get',
                        side_effect=create_stub_get('M0_CONF_HA_PROCESS_M0D'))
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util, 'update_drive_state')
    mocker.patch.object(consul_util, 'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util, 'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))
    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15),
                status=ServiceHealth.FAILED)
    ])
    assert consul_util.update_drive_state.called, \
        'The drive state should be updated in Consul KV'
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'), contains_drive())).exists(traces), \
        'DRIVE must be broadcast when non-MKFS process is stopped'
def prepare_svc(self, svc_id: str, name: str):
    ep = self.provider.get_service_ep(svc_id)
    if not ep:
        raise RuntimeError('Cannot get service endpoint.')
    addr = self.get_service_addr(ep)
    port = self.get_service_port(ep)
    checks: Dict[str, Any] = {}
    checks['args'] = ['/opt/seagate/cortx/hare/libexec/check-service']
    checks['interval'] = '1s'
    checks['status'] = 'warning'
    # Select the service-check arguments according to the service name.
    if name == 'hax':
        checks['args'].append('--hax')
    elif name in ('confd', 'ios'):
        fid = Fid(ObjT.PROCESS.value, int(svc_id))
        checks['args'].extend(['--fid', str(fid)])
    elif name == 's3service':
        fid = Fid(ObjT.PROCESS.value, int(svc_id))
        s3svc = 's3server@' + str(fid)
        checks['args'].extend(['--svc', s3svc])
    return Service(id=svc_id,
                   name=name,
                   address=addr,
                   port=port,
                   checks=[checks])
async def test_bq_stob_message_deserialized(hax_client, planner, herald,
                                            consul_util, mocker):
    def fake_get(key, allow_null):
        # ret = {'bq-delivered/192.168.0.28': ''}
        ret = {'bq-delivered/localhost': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    #
    # InboxFilter will try to read epoch - let's mock KV operations
    stob = StobId(Fid(12, 13), Fid(14, 15))
    msg = StobIoqError(fid=Fid(5, 6),
                       conf_sdev=Fid(0x103, 0x204),
                       stob_id=stob,
                       fd=42,
                       opcode=4,
                       rc=2,
                       offset=0xBF,
                       size=100,
                       bshift=4)

    # Here we make sure that a real StobIoqError can be used as the payload
    # of a STOB_IOQ_ERROR bq message.
    stob_payload = dump_json(msg)
    parsed_stob = simplejson.loads(stob_payload)
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {'message_type': 'STOB_IOQ_ERROR', 'payload': parsed_stob}
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()
    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]

    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)

    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x103, 0x204), status=ObjHealth.FAILED)]))
async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    service_health = [{
        'Node': {
            'Node': 'localhost',
            'Address': '10.1.10.12',
        },
        'Service': {
            'ID': '12',
            'Service': 'ios',
            'Tags': [],
            'Port': 8000,
        },
        'Checks': [
            {
                'Node': '12',
                'CheckID': 'service:ios',
                'Name': "Service 'ios' check",
                'Status': status,
                'Notes': '',
                'Output': '',
                'ServiceID': '12',
                'ServiceName': 'ios',
            },
        ],
    }]
    resp = await hax_client.post('/', json=service_health)
    assert resp.status == 200
    assert planner.add_command.called
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(
            states=[HAState(fid=Fid(0x7200000000000001, 12), status=health)],
            reply_to=None))
def node_to_drive_fid(self, node_name: str, drive: str):
    sdev_fid: Fid = Fid(0, 0)
    # We extract the sdev fid as follows,
    # e.g. node_name=ssc-vm-c-0553.colo.seagate.com
    #      drive=/dev/vdf
    # 1. m0conf/nodes/ssc-vm-c-0553.colo.seagate.com/processes/41/
    #    services/ios:43
    # 2. Create ioservice motr fid
    # 3. fetch consul kv for ios fid,
    #    m0conf/nodes/0x6e00000000000001:0x20/processes/
    #    0x7200000000000001:0x29/services/0x7300000000000001:0x2b/
    #    sdevs/0x6400000000000001:0x2c:
    #    {"path": "/dev/vdf", "state": "M0_NC_UNKNOWN"}
    # 4. find drive name in the json value and extract sdev fid from the
    #    key 0x6400000000000001:0x2c
    # 5. Create sdev fid from sdev fid key.
    process_items = self.kv.kv_get(f'm0conf/nodes/{node_name}/processes',
                                   recurse=True)
    for x in process_items:
        if '/ios' in x['Key']:
            fidk_ios = x['Value']
    ios_fid = create_service_fid(int(fidk_ios))
    sdev_items = self.kv.kv_get('m0conf/nodes', recurse=True)
    for x in sdev_items:
        if f'/{ios_fid}/' in x['Key']:
            if json.loads(x['Value'])['path'] == drive:
                # Using constant index 8 for the sdev fid.
                # Fix this by changing the Consul schema to provide a
                # direct mapping from drive path to sdev fid.
                sdev_fid_item = x['Key'].split('/')[8]
                sdev_fidk = Fid.parse(sdev_fid_item).key
                sdev_fid = create_sdev_fid(sdev_fidk)
                break
    return self.sdev_to_drive_fid(sdev_fid)
def entrypoint():
    return EntrypointRequest(reply_context='test',
                             req_id=Uint128(1, 2),
                             remote_rpc_endpoint='endpoint',
                             process_fid=Fid(1, 2),
                             git_rev='HEAD',
                             pid=123,
                             is_first_request=False)
def get_svc_fids(self, svc_name: str) -> List[str]:
    IDs = self.get_all_svc_ids()
    id_map = {
        'hax': IDs['HAX_ID'],
        'confd': IDs['CONFD_IDs'],
        'ios': IDs['IOS_IDs'],
        's3': IDs['S3_IDs']
    }
    return [str(Fid(ObjT.PROCESS.value, int(x))) for x in id_map[svc_name]]
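# Illustrative usage sketch (the IDs below are hypothetical, not taken from
# the original source): if get_all_svc_ids() returned {'HAX_ID': ['45'],
# 'CONFD_IDs': ['7'], 'IOS_IDs': ['12'], 'S3_IDs': []}, then
# get_svc_fids('confd') would yield a single process fid string rendered in
# the '0x7200000000000001:0x7' form used throughout these tests (assuming
# ObjT.PROCESS.value is that 0x7200000000000001 container).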
def generate_confd(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
    fid = Fid(ObjT.PROCESS.value, int(svc_id))
    ep = self.provider.get_service_ep(svc_id)
    filename = f'm0d-{fid}'
    contents = (f"MOTR_M0D_EP='{ep}'\n"
                f"MOTR_HA_EP='{hax_ep}'\n"
                f"MOTR_PROCESS_FID='{fid}'\n"
                f"MOTR_CONF_XC='{motr_conf_dir}/confd.xc'\n")
    self._write_file(motr_conf_dir + self.sysconf_dir + filename, contents)
def generate_ios(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
    fid = Fid(ObjT.PROCESS.value, int(svc_id))
    ep = self.provider.get_service_ep(svc_id)
    meta_data = self.provider.get_ios_meta_data(svc_id)
    filename = f'm0d-{fid}'
    contents = (f"MOTR_M0D_EP='{ep}'\n"
                f"MOTR_HA_EP='{hax_ep}'\n"
                f"MOTR_PROCESS_FID='{fid}'\n")
    if meta_data:
        contents += f'MOTR_BE_SEG_PATH={meta_data}\n'
    self._write_file(motr_conf_dir + self.sysconf_dir + filename, contents)
def generate_s3(self, svc_id: str, hax_ep: str, s3_port: int,
                s3_conf_dir: str):
    profile_fid = self.provider.get_profile_fid()
    fid = Fid(ObjT.PROCESS.value, int(svc_id))
    ep = self.provider.get_service_ep(svc_id)
    filename = f's3server-{fid}'
    contents = (f"MOTR_PROFILE_FID={profile_fid}\n"
                f"MOTR_S3SERVER_EP='{ep}'\n"
                f"MOTR_HA_EP='{hax_ep}'\n"
                f"MOTR_PROCESS_FID='{fid}'\n"
                f"MOTR_S3SERVER_PORT={s3_port}\n")
    self._write_file(s3_conf_dir + self.sysconf_dir + filename, contents)
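# A sketch of the kind of sysconfig file these generators write (the concrete
# endpoint, port and directory values are illustrative assumptions, not from a
# real deployment): for svc_id='12', generate_s3 would create a file named
# 's3server-<process fid>' whose contents look roughly like
#
#     MOTR_PROFILE_FID=<profile fid from the provider>
#     MOTR_S3SERVER_EP='<service endpoint from the provider>'
#     MOTR_HA_EP='<hax_ep argument>'
#     MOTR_PROCESS_FID='<process fid>'
#     MOTR_S3SERVER_PORT=<s3_port argument>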
def send_entrypoint_request_reply(self, message: EntrypointRequest):
    reply_context = message.reply_context
    req_id = message.req_id
    remote_rpc_endpoint = message.remote_rpc_endpoint
    process_fid = message.process_fid
    LOG.debug('Processing entrypoint request from remote endpoint'
              " '{}', process fid {}".format(remote_rpc_endpoint,
                                             str(process_fid)))
    sess = principal_rm = confds = None
    try:
        util = self.consul_util
        sess = util.get_leader_session_no_wait()
        principal_rm = util.get_session_node(sess)
        confds = util.get_confd_list()
        rm_fid = util.get_rm_fid()
    except Exception:
        LOG.exception('Failed to get the data from Consul.'
                      ' Replying with EAGAIN error code.')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), EAGAIN, 0,
                                   make_array(FidStruct, []),
                                   make_array(c.c_char_p, []), 0,
                                   Fid(0, 0).to_c(), None)
        LOG.debug('Reply sent')
        return

    rc_quorum = int(len(confds) / 2 + 1)
    rm_eps = None
    for svc in confds:
        if svc.node == principal_rm:
            rm_eps = svc.address
            break
    if not rm_eps:
        raise RuntimeError('No RM node found in Consul')

    confd_fids = [x.fid.to_c() for x in confds]
    confd_eps = [make_c_str(x.address) for x in confds]

    LOG.debug('Passing the entrypoint reply to hax.c layer')
    self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0, len(confds),
                               make_array(FidStruct, confd_fids),
                               make_array(c.c_char_p, confd_eps), rc_quorum,
                               rm_fid.to_c(), make_c_str(rm_eps))
    LOG.debug('Entrypoint request has been replied to')
async def test_bq_stob_message_type_recognized(hax_client, planner, herald,
                                               consul_util, mocker):
    def fake_get(key):
        ret = {'bq-delivered/192.168.0.28': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    #
    # InboxFilter will try to read epoch - let's mock KV operations
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {
        'message_type': 'STOB_IOQ_ERROR',
        'payload': {
            'fid': '0x1:0x2',
            'conf_sdev': '0x1:0x4'
        }
    }
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()
    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]

    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)

    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x1, 0x4), status=ServiceHealth.FAILED)]))
def drive_to_sdev_fid(self, drive_fid: Fid) -> Fid:
    # We extract the sdev fid as follows,
    # e.g. drive_fid=0x6b00000000000001:0x2d
    # 1. m0conf/sites/0x5300000000000001:0x1/racks/0x6100000000000001:0x2/
    #    encls/0x6500000000000001:0x21/ctrls/0x6300000000000001:0x22/
    #    drives/0x6b00000000000001:0x2d:{"sdev": "0x6400000000000001:0x2c",
    #    "state": "M0_NC_UNKNOWN"}
    # 2. Fetch Consul kv for the drive fid.
    # 3. Extract the sdev fid key from the JSON value.
    # 4. Create sdev fid from fid key.
    sdev_fid: Fid = Fid(0, 0)
    sdev_items = self.kv.kv_get('m0conf/sites', recurse=True)
    regex = re.compile(f'^m0conf\\/.*\\/drives/{drive_fid}$')
    for x in sdev_items:
        match_result = re.match(regex, x['Key'])
        if not match_result:
            continue
        sdev_fid_item = json.loads(x['Value'])['sdev']
        sdev_fidk = Fid.parse(sdev_fid_item).key
        sdev_fid = create_sdev_fid(sdev_fidk)
        break
    return sdev_fid
def sdev_to_drive_fid(self, sdev_fid: Fid):
    # We extract the drive fid as follows,
    # e.g. sdev_fid=0x6400000000000001:0x2c
    # 1. m0conf/sites/0x5300000000000001:0x1/racks/0x6100000000000001:0x2/
    #    encls/0x6500000000000001:0x21/ctrls/0x6300000000000001:0x22/
    #    drives/0x6b00000000000001:0x2d:{"sdev": "0x6400000000000001:0x2c",
    #    "state": "M0_NC_UNKNOWN"}
    # 2. Fetch Consul kv for the sdev fid.
    # 3. Extract the drive fid key from the matching key.
    # 4. Create drive fid from fid key.
    drive_fid: Fid = Fid(0, 0)
    drive_items = self.kv.kv_get('m0conf/sites', recurse=True)
    for x in drive_items:
        if '/drives/' in x['Key']:
            if json.loads(x['Value'])['sdev'] == f'{sdev_fid}':
                # Using constant index 10 for the drive fid.
                # Fix this by changing the Consul schema to provide a
                # direct mapping from sdev fid to drive fid.
                drive_fid_item = x['Key'].split('/')[10]
                drive_fidk = Fid.parse(drive_fid_item).key
                drive_fid = create_drive_fid(drive_fidk)
                break
    return drive_fid
def mk_fid(obj_t: ObjT, key: int) -> Fid:
    return Fid(obj_t.value, key)
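# A minimal usage sketch (assuming ObjT.PROCESS carries the
# 0x7200000000000001 container value seen elsewhere in these tests):
#
#     proc_fid = mk_fid(ObjT.PROCESS, 0x15)
#     # proc_fid == Fid(0x7200000000000001, 0x15)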
def send_entrypoint_request_reply(self, message: EntrypointRequest):
    reply_context = message.reply_context
    req_id = message.req_id
    remote_rpc_endpoint = message.remote_rpc_endpoint
    process_fid = message.process_fid
    e_rc = EAGAIN
    LOG.debug('Processing entrypoint request from remote endpoint'
              " '{}', process fid {}".format(remote_rpc_endpoint,
                                             str(process_fid)))
    sess = principal_rm = confds = None
    try:
        util = self.consul_util
        # When stopping, there's a possibility that hax may receive
        # an entrypoint request from motr land. In order to unblock
        # motr land, reply to the entrypoint request with no confd
        # and RM endpoints, as the processes might have already
        # stopped.
        rc_quorum = 0
        rm_fid = Fid(0, 0)
        if self.is_stopping:
            confds = []
        else:
            sess = util.get_leader_session()
            principal_rm = util.get_session_node(sess)
            confds = util.get_confd_list()

        # Hax may receive entrypoint requests multiple times during its
        # lifetime. Hax starts motr rconfc to invoke spiel commands. Motr
        # rconfc establishes a connection with the principal RM; in case of
        # principal RM failure, rconfc invalidates its confc and requests
        # the entrypoint again in the hope that another confd and principal
        # RM have been elected, so that rconfc can resume its functionality.
        # During shutdown, when each motr process stops, including confds,
        # hax broadcasts an M0_NC_FAILED event for every STOPPED or FAILED
        # motr process. Motr rconfc, on receiving the failed events for
        # confds, re-requests the entrypoint information, and this goes on
        # in a loop. In order to break this loop, the entrypoint reply must
        # only report alive confd and RM endpoints. While doing this we need
        # to handle the bootstrapping case, so we wait until bootstrapping
        # is done, i.e. all the motr services are up; then we check the
        # confd status and exclude the corresponding confd from the
        # entrypoint reply.
        # EOS-25726: It seems that the confds were reported as started
        # and they failed later. This could be due to a Motr issue
        # EOS-25695.
        # In such a case, when processes start out of order, a wrong
        # quorum value is reported, which leads to further issues in Motr
        # process startup. Thus commenting this out for now. Need to verify
        # whether this affects hax shutdown.
        # active_confds = []
        # if self.spiel_ready:
        #     for confd in confds:
        #         if not util.is_confd_failed(confd.fid):
        #             active_confds.append(confd)
        #     confds = active_confds

        if confds:
            rm_fid = util.get_rm_fid()
            rc_quorum = int(len(confds) / 2 + 1)
        rm_eps = None
        for svc in confds:
            if svc.node == principal_rm:
                rm_eps = svc.address
                break
        if confds and (not self.is_stopping) and (not rm_eps):
            if util.m0ds_stopping():
                e_rc = 0
            raise RuntimeError('No RM node found in Consul')
    except Exception:
        LOG.exception('Failed to get the data from Consul.'
                      ' Replying with EAGAIN error code, with a 1'
                      ' second delay.')
        # If we reply EAGAIN, motr immediately sends a subsequent entrypoint
        # request, and it has been observed that several entrypoint requests
        # are received by Hare within a second. This floods Hare. As an
        # intermediate solution, Hare dropped the requests whenever an error
        # occurred while preparing the reply; but then motr does not send any
        # subsequent entrypoint request after a timeout, as expected. As per
        # the discussion, it was agreed to have a temporary fix in Hare
        # instead.
        # https://jts.seagate.com/browse/EOS-27068 motr ticket is created
        # to track the same.
        sleep(1)
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), e_rc, 0,
                                   make_array(FidStruct, []),
                                   make_array(c.c_char_p, []), 0,
                                   Fid(0, 0).to_c(), None)
        LOG.debug('Reply sent')
        return

    confd_fids = [x.fid.to_c() for x in confds]
    confd_eps = [make_c_str(x.address) for x in confds]

    LOG.debug('Passing the entrypoint reply to hax.c layer')
    self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0, len(confds),
                               make_array(FidStruct, confd_fids),
                               make_array(c.c_char_p, confd_eps), rc_quorum,
                               rm_fid.to_c(), make_c_str(rm_eps))
    LOG.debug('Entrypoint request has been replied to')
def process_event():
    return ProcessEvent(
        ConfHaProcess(chp_event=0, chp_type=0, chp_pid=0, fid=Fid(0, 0)))
def test_process_failure(self):
    consul_util = ConsulUtil()
    consul_cache = InvocationCache()
    ffi = Mock(spec=['init_motr_api'])
    motr = Motr(ffi, None, None, consul_util)

    # Setup for the test: notification of a process failure
    # - failure here is an ios service and a disk
    # - dummy Consul reports all processes on the node are failed
    # - expect the node, enclosure, controller, drive,
    #   process, and service to all be marked as failed
    #
    # Static names and fids for the setup are given here.
    node_name = 'testnode'
    hax_fid = Fid(0x7200000000000001, 0x6)
    site_fid = Fid(0x5300000000000001, 0x1)
    rack_fid = Fid(0x6100000000000001, 0x2)
    node_fid = Fid(0x6e00000000000001, 0x3)
    encl_fid = Fid(0x6500000000000001, 0x4)
    ctrl_fid = Fid(0x6300000000000001, 0x5)
    process_fid = Fid(0x7200000000000001, 0x15)
    service_fid = Fid(0x7300000000000001, 0xe)
    service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
    drive_fid = Fid(0x6b00000000000001, 0x11)
    ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
        site_fid, rack_fid, encl_fid, ctrl_fid)
    ctrl_state = '{"state": "M0_NC_FAILED"}'

    # Set mock return values for the necessary Consul calls
    motr._is_mkfs = Mock(return_value=False)
    consul_util.get_hax_fid = Mock(return_value=hax_fid)
    consul_util.is_proc_client = Mock(return_value=False)
    consul_util.get_services_by_parent_process = Mock(
        return_value=[service_fid_typed])
    consul_util.get_disks_by_parent_process = Mock(
        return_value=[drive_fid])
    consul_util.get_process_node = Mock(return_value=node_name)
    consul_util.get_node_name_by_fid = Mock(return_value=node_name)
    consul_util.get_node_fid = Mock(return_value=node_fid)
    consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
    consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

    # These failure indications are here to trigger specific code paths for
    # node failure. Additional tests can cover different scenarios (e.g.
    # drive failure but node still up), which will set different results
    # for these calls.
    consul_util.all_io_services_failed = Mock(return_value=True)
    consul_util.get_sdev_state = Mock(
        return_value=HaNoteStruct.M0_NC_FAILED)
    consul_util.get_ctrl_state = Mock(
        return_value=m0HaObjState.M0_NC_FAILED)
    consul_util.get_ctrl_state_updates = Mock(
        return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

    # We'll use these mocks to check that expected updates are happening.
    consul_util.update_drive_state = Mock()
    consul_util.set_process_state = Mock()
    consul_util.set_node_state = Mock()
    consul_util.set_encl_state = Mock()
    motr._ha_broadcast = Mock()
    motr._write_updates = Mock()

    # Send the mock event.
    motr.broadcast_ha_states(
        [HAState(fid=process_fid, status=ObjHealth.FAILED)],
        notify_devices=True,
        broadcast_hax_only=False,
        kv_cache=consul_cache)

    # ConsulUtil is responsible for the actual KV updates, just check
    # here that the appropriate util function is called for each
    # component.
    consul_util.update_drive_state.assert_called_with([drive_fid],
                                                      ObjHealth.OFFLINE,
                                                      device_event=False)
    consul_util.set_process_state.assert_called_with(
        process_fid, ObjHealth.FAILED)
    consul_util.set_node_state.assert_called_with(node_fid,
                                                  ObjHealth.FAILED)
    consul_util.set_encl_state.assert_called_with(encl_fid,
                                                  ObjHealth.FAILED,
                                                  kv_cache=consul_cache)
    # This KV update is batched, so the check looks different.
    motr._write_updates.assert_any_call(
        [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

    # Check hax broadcast. We should see states updated to FAILED.
    broadcast_list = motr._ha_broadcast.call_args[0][0]
    self.assertTrue(_has_failed_note(broadcast_list, node_fid))
    self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
    self.assertTrue(_has_failed_note(broadcast_list, process_fid))
    self.assertTrue(_has_failed_note(broadcast_list, service_fid))
    self.assertTrue(_has_failed_note(broadcast_list, drive_fid))
def test_first_entrypoint_request_broadcasts_fail_first(
        mocker, planner, motr, consumer, consul_util):
    def new_kv(key: str, val: str):
        return {
            'Key': key,
            'CreateIndex': 1793,
            'ModifyIndex': 1793,
            'LockIndex': 0,
            'Flags': 0,
            'Value': val,
            'Session': ''
        }

    def my_get(key: str, recurse: bool = False):
        if key == 'm0conf/nodes' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/cmu/processes/6/services/ha', '15'),
                 ('m0conf/nodes/cmu/processes/6/services/rm', '16'),
                 ('m0conf/nodes/localhost/processes/7/services/rms', '17')]
            ]
        elif key == 'm0conf/nodes/localhost/processes/7/services/rms':
            return new_kv('m0conf/nodes/localhost/processes/7/services/rms',
                          '17')
        raise RuntimeError(f'Unexpected call: key={key}, recurse={recurse}')

    def my_services(name):
        if name == 'confd':
            return [{
                'Node': 'localhost',
                'Service': 'confd',
                'ServiceID': '7',
                'Address': '192.168.0.28',
                'ServiceAddress': '192.168.0.28',
                'ServicePort': '12345'
            }]
        if name == 'hax':
            return [{
                'Node': 'localhost',
                'Service': 'hax',
                'ServiceID': '45',
                'Address': '192.168.0.28',
                'ServiceAddress': '192.168.0.28',
                'ServicePort': '667'
            }]
        raise RuntimeError(f'Unexpected call: name={name}')

    mocker.patch.object(consul_util.kv, 'kv_get', side_effect=my_get)
    mocker.patch.object(consul_util, 'get_leader_session_no_wait',
                        return_value='localhost')
    mocker.patch.object(consul_util, 'get_session_node',
                        return_value='localhost')
    mocker.patch.object(consul_util.catalog, 'get_services',
                        side_effect=my_services)
    msg = FirstEntrypointRequest(reply_context='stub',
                                 req_id=Uint128(0, 1),
                                 remote_rpc_endpoint='ep',
                                 process_fid=Fid(1, 6),
                                 git_rev='deadbeef',
                                 pid=123,
                                 is_first_request=True)
    run_in_consumer(mocker, msg, planner, consumer, motr)
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               ha_note_failed())).run(traces), 'M0_NC_FAILED not broadcast'
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'), ha_note_failed())).and_then(
            tr_method('entrypoint_reply')).run(traces), \
        'entrypoint_reply should go after M0_NC_FAILED ' \
        'is broadcast'
def send_entrypoint_request_reply(self, message: EntrypointRequest):
    reply_context = message.reply_context
    req_id = message.req_id
    remote_rpc_endpoint = message.remote_rpc_endpoint
    process_fid = message.process_fid
    e_rc = EAGAIN
    LOG.debug('Processing entrypoint request from remote endpoint'
              " '{}', process fid {}".format(remote_rpc_endpoint,
                                             str(process_fid)))
    sess = principal_rm = confds = None
    try:
        util = self.consul_util
        # When stopping, there's a possibility that hax may receive
        # an entrypoint request from motr land. In order to unblock
        # motr land, reply to the entrypoint request with no confd
        # and RM endpoints, as the processes might have already
        # stopped.
        rc_quorum = 0
        rm_fid = Fid(0, 0)
        if self.is_stopping:
            confds = []
        else:
            sess = util.get_leader_session_no_wait()
            principal_rm = util.get_session_node(sess)
            confds = util.get_confd_list()

        # Hax may receive entrypoint requests multiple times during its
        # lifetime. Hax starts motr rconfc to invoke spiel commands. Motr
        # rconfc establishes a connection with the principal RM; in case of
        # principal RM failure, rconfc invalidates its confc and requests
        # the entrypoint again in the hope that another confd and principal
        # RM have been elected, so that rconfc can resume its functionality.
        # During shutdown, when each motr process stops, including confds,
        # hax broadcasts an M0_NC_FAILED event for every STOPPED or FAILED
        # motr process. Motr rconfc, on receiving the failed events for
        # confds, re-requests the entrypoint information, and this goes on
        # in a loop. In order to break this loop, the entrypoint reply must
        # only report alive confd and RM endpoints. While doing this we need
        # to handle the bootstrapping case, so we wait until bootstrapping
        # is done, i.e. all the motr services are up; then we check the
        # confd status and exclude the corresponding confd from the
        # entrypoint reply.
        active_confds = []
        if self.spiel_ready:
            for confd in confds:
                if not util.is_confd_failed(confd.fid):
                    active_confds.append(confd)
            confds = active_confds

        if confds:
            rm_fid = util.get_rm_fid()
            rc_quorum = int(len(confds) / 2 + 1)
        rm_eps = None
        for svc in confds:
            if svc.node == principal_rm:
                rm_eps = svc.address
                break
        if confds and (not self.is_stopping) and (not rm_eps):
            if util.m0ds_stopping():
                e_rc = 0
            raise RuntimeError('No RM node found in Consul')
    except Exception:
        LOG.exception('Failed to get the data from Consul.'
                      ' Replying with EAGAIN error code.')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), e_rc, 0,
                                   make_array(FidStruct, []),
                                   make_array(c.c_char_p, []), 0,
                                   Fid(0, 0).to_c(), None)
        LOG.debug('Reply sent')
        return

    confd_fids = [x.fid.to_c() for x in confds]
    confd_eps = [make_c_str(x.address) for x in confds]

    LOG.debug('Passing the entrypoint reply to hax.c layer')
    self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0, len(confds),
                               make_array(FidStruct, confd_fids),
                               make_array(c.c_char_p, confd_eps), rc_quorum,
                               rm_fid.to_c(), make_c_str(rm_eps))
    LOG.debug('Entrypoint request has been replied to')
def test_broadcast_io_service_failure(mocker, planner, motr, consumer,
                                      consul_util):
    def new_kv(key: str, val: str):
        return {
            'Key': key,
            'CreateIndex': 1793,
            'ModifyIndex': 1793,
            'LockIndex': 0,
            'Flags': 0,
            'Value': val,
            'Session': ''
        }

    def my_get(key: str, recurse: bool = False, **kwds):
        if key == 'm0conf/nodes' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({"name": "m0_server", "state": "offline"})),
                 ('m0conf/nodes/cmu/processes/6/services/rm', '16'),
                 ('m0conf/nodes/localhost/processes/7/services/rms', '17'),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services'
                  '/0x7300000000000001:0x17',
                  json.dumps({"name": "ios", "state": "failed"})),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001'
                  ':0xc',
                  json.dumps({"name": "ios", "state": "failed"}))]
            ]
        elif key == 'm0conf/sites' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x5',
                  json.dumps({"state": "M0_NC_UNKNOWN"})),
                 ('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x6',
                  json.dumps({"state": "M0_NC_UNKNOWN"}))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3'
              '/processes' and recurse):
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({"name": "m0_server", "state": "failed"})),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17',
                  json.dumps({"name": "ios", "state": "failed"})),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001:0xc',
                  json.dumps({"name": "ios", "state": "failed"}))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15' and recurse):
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({"name": "m0_server", "state": "failed"})),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17',
                  json.dumps({"name": "ios", "state": "failed"}))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15'):
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0x15',
                json.dumps({"name": "m0_server", "state": "failed"}))
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0xa'):
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0xa',
                json.dumps({"name": "m0_server", "state": "online"}))
        elif key == 'm0conf/nodes/localhost/processes/7/services/rms':
            return new_kv('m0conf/nodes/localhost/processes/7/services/rms',
                          '17')
        elif key == 'localhost/processes/0x7200000000000001:0x15':
            return new_kv(
                'localhost/processes/0x7200000000000001',
                json.dumps({
                    'type': 'M0_CONF_HA_PROCESS_OTHER',
                    'state': 'Unknown'
                }))
        elif key == 'm0conf/nodes/0x6e00000000000001:0x3':
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3',
                json.dumps({"name": "localhost", "state": "M0_NC_UNKNOWN"}))
        raise RuntimeError(f'Unexpected call: key={key}, recurse={recurse}')

    mocker.patch.object(consul_util.kv, 'kv_get', side_effect=my_get)
    # TODO: Handle 'kv_put' by updating kv returned by 'kv_get'
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util, 'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util, 'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))
    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15),
                status=ServiceHealth.FAILED)
    ])
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               io_service_failed())).exists(traces), \
        'IOservice failure not broadcast'
    assert AssertionPlan(tr_and(tr_method('ha_broadcast'),
                                node_fid_failed())).not_exists(traces), \
        'Node failure should not be broadcast'