예제 #1
0
def test_nonmkfs_process_stop_causes_drive_offline(mocker, motr, consul_util):
    """When a non-MKFS m0d process stops, its drives must be updated in
    the Consul KV and a DRIVE note must be broadcast via hax."""
    patch = mocker.patch.object
    patch(consul_util.kv, 'kv_get',
          side_effect=create_stub_get('M0_CONF_HA_PROCESS_M0D'))
    patch(consul_util.kv, 'kv_put', return_value=0)
    patch(consul_util, 'update_drive_state')
    patch(consul_util, 'get_node_fid',
          return_value=Fid(0x6e00000000000001, 0x3))
    patch(consul_util, 'get_node_encl_fid',
          return_value=Fid(0x6500000000000001, 0x4))

    failed_state = HAState(fid=Fid(0x7200000000000001, 0x15),
                           status=ServiceHealth.FAILED)
    motr.broadcast_ha_states([failed_state])

    assert consul_util.update_drive_state.called, \
        'The drive state should be updated in Consul KV'

    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               contains_drive())).exists(traces), \
        'DRIVE must be broadcast when non-MKFS process is stopped'
예제 #2
0
    def prepare_svc(self, svc_id: str, name: str):
        """Build the Consul Service definition (including its health
        check) for the given service id and name.

        Raises RuntimeError if the service endpoint cannot be fetched.
        """
        ep = self.provider.get_service_ep(svc_id)
        if not ep:
            raise RuntimeError('Cannot get service endpoint.')
        addr = self.get_service_addr(ep)
        port = self.get_service_port(ep)

        checks: Dict[str, Any] = {
            'args': ['/opt/seagate/cortx/hare/libexec/check-service'],
            'interval': '1s',
            'status': 'warning',
        }
        # Service-specific arguments for the check script.
        if name == 'hax':
            checks['args'].append('--hax')
        elif name in ('confd', 'ios'):
            checks['args'].extend(
                ['--fid', str(Fid(ObjT.PROCESS.value, int(svc_id)))])
        elif name == 's3service':
            s3svc = 's3server@' + str(Fid(ObjT.PROCESS.value, int(svc_id)))
            checks['args'].extend(['--svc', s3svc])
        return Service(id=svc_id,
                       name=name,
                       address=addr,
                       port=port,
                       checks=[checks])
예제 #3
0
async def test_bq_stob_message_deserialized(hax_client, planner, herald,
                                            consul_util, mocker):
    """A StobIoqError serialized by dump_json must round-trip through the
    bq watcher endpoint and produce a FAILED HAState for its conf_sdev."""
    def fake_get(key, allow_null):
        kv = {'bq-delivered/localhost': ''}
        return kv[key]

    mocker.patch.object(herald, 'wait_for_any')
    # InboxFilter will try to read epoch - let's mock KV operations
    stob = StobId(Fid(12, 13), Fid(14, 15))
    msg = StobIoqError(fid=Fid(5, 6),
                       conf_sdev=Fid(0x103, 0x204),
                       stob_id=stob,
                       fd=42,
                       opcode=4,
                       rc=2,
                       offset=0xBF,
                       size=100,
                       bshift=4)

    # Make sure that a real StobIoqError can be used as the payload
    # for a STOB_IOQ_ERROR bq message.
    parsed_stob = simplejson.loads(dump_json(msg))

    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {'message_type': 'STOB_IOQ_ERROR', 'payload': parsed_stob}
    b64_str = b64encode(simplejson.dumps(event_payload).encode()).decode()

    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        logging.getLogger('hax').debug('Response: %s', await resp.json())
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x103, 0x204), status=ObjHealth.FAILED)]))
예제 #4
0
async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    """A Consul health-watch payload with the given check status must be
    turned into a single BroadcastHAStates command with the given health."""
    node = {
        'Node': 'localhost',
        'Address': '10.1.10.12',
    }
    service = {
        'ID': '12',
        'Service': 'ios',
        'Tags': [],
        'Port': 8000,
    }
    check = {
        'Node': '12',
        'CheckID': 'service:ios',
        'Name': "Service 'ios' check",
        'Status': status,
        'Notes': '',
        'Output': '',
        'ServiceID': '12',
        'ServiceName': 'ios',
    }
    service_health = [{'Node': node, 'Service': service, 'Checks': [check]}]

    resp = await hax_client.post('/', json=service_health)

    assert resp.status == 200
    assert planner.add_command.called
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(
            states=[HAState(fid=Fid(0x7200000000000001, 12), status=health)],
            reply_to=None))
예제 #5
0
 def node_to_drive_fid(self, node_name: str, drive: str):
     """Return the drive fid for the given drive path on the given node.

     Raises RuntimeError if the node has no ioservice registered in the
     Consul KV (previously this surfaced as an UnboundLocalError).
     """
     sdev_fid: Fid = Fid(0, 0)
     # We extract the sdev fid as follows,
     # e.g. node_name=ssc-vm-c-0553.colo.seagate.com
     #      drive=/dev/vdf
     # 1. m0conf/nodes/ssc-vm-c-0553.colo.seagate.com/processes/41/
     #     services/ios:43
     # 2. Create ioservice motr fid
     # 3. fetch consul kv for ios fid,
     #    m0conf/nodes/0x6e00000000000001:0x20/processes/
     #    0x7200000000000001:0x29/services/0x7300000000000001:0x2b/
     #    sdevs/0x6400000000000001:0x2c:
     #    {"path": "/dev/vdf", "state": "M0_NC_UNKNOWN"}
     # 4. find drive name in the json value and extract sdev fid from the
     #    key 0x6400000000000001:0x2c
     # 5. Create sdev fid from sdev fid key.
     process_items = self.kv.kv_get(f'm0conf/nodes/{node_name}/processes',
                                    recurse=True)
     fidk_ios = None
     # Keep the last matching entry (preserves the original semantics
     # when several ios keys are present).
     for x in process_items:
         if '/ios' in x['Key']:
             fidk_ios = x['Value']
     if fidk_ios is None:
         raise RuntimeError(
             f'No ioservice found for node {node_name} in Consul KV')
     ios_fid = create_service_fid(int(fidk_ios))
     sdev_items = self.kv.kv_get('m0conf/nodes', recurse=True)
     for x in sdev_items:
         if f'/{ios_fid}/' in x['Key']:
             if json.loads(x['Value'])['path'] == drive:
                 # Using constant index 8 for the sdev fid.
                 # Fix this by changing the Consul schema to have
                 # mapping of drive path to sdev direct mapping.
                 sdev_fid_item = x['Key'].split('/')[8]
                 sdev_fidk = Fid.parse(sdev_fid_item).key
                 sdev_fid = create_sdev_fid(sdev_fidk)
                 break
     return self.sdev_to_drive_fid(sdev_fid)
예제 #6
0
def entrypoint():
    """Stub EntrypointRequest fixture with fixed, non-first-request data."""
    request = EntrypointRequest(
        reply_context='test',
        req_id=Uint128(1, 2),
        remote_rpc_endpoint='endpoint',
        process_fid=Fid(1, 2),
        git_rev='HEAD',
        pid=123,
        is_first_request=False)
    return request
예제 #7
0
 def get_svc_fids(self, svc_name: str) -> List[str]:
     """Return the process fids (as strings) of every instance of the
     given service name ('hax', 'confd', 'ios' or 's3')."""
     all_ids = self.get_all_svc_ids()
     selected = {
         'hax': all_ids['HAX_ID'],
         'confd': all_ids['CONFD_IDs'],
         'ios': all_ids['IOS_IDs'],
         's3': all_ids['S3_IDs'],
     }[svc_name]
     return [str(Fid(ObjT.PROCESS.value, int(i))) for i in selected]
예제 #8
0
 def generate_confd(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
     """Write the m0d sysconfig file for a confd process."""
     fid = Fid(ObjT.PROCESS.value, int(svc_id))
     ep = self.provider.get_service_ep(svc_id)
     lines = [
         f"MOTR_M0D_EP='{ep}'",
         f"MOTR_HA_EP='{hax_ep}'",
         f"MOTR_PROCESS_FID='{fid}'",
         f"MOTR_CONF_XC='{motr_conf_dir}/confd.xc'",
     ]
     target = motr_conf_dir + self.sysconf_dir + f'm0d-{fid}'
     self._write_file(target, '\n'.join(lines) + '\n')
예제 #9
0
 def generate_ios(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
     """Write the m0d sysconfig file for an ioservice process."""
     fid = Fid(ObjT.PROCESS.value, int(svc_id))
     ep = self.provider.get_service_ep(svc_id)
     meta_data = self.provider.get_ios_meta_data(svc_id)
     parts = [f"MOTR_M0D_EP='{ep}'\n",
              f"MOTR_HA_EP='{hax_ep}'\n",
              f"MOTR_PROCESS_FID='{fid}'\n"]
     if meta_data:
         # The BE segment path is written only when meta-data is known.
         parts.append(f'MOTR_BE_SEG_PATH={meta_data}\n')
     target = motr_conf_dir + self.sysconf_dir + f'm0d-{fid}'
     self._write_file(target, ''.join(parts))
예제 #10
0
 def generate_s3(self, svc_id: str, hax_ep: str, s3_port: int,
                 s3_conf_dir: str):
     """Write the s3server sysconfig file for an S3 process."""
     profile_fid = self.provider.get_profile_fid()
     fid = Fid(ObjT.PROCESS.value, int(svc_id))
     ep = self.provider.get_service_ep(svc_id)
     lines = [
         f'MOTR_PROFILE_FID={profile_fid}',
         f"MOTR_S3SERVER_EP='{ep}'",
         f"MOTR_HA_EP='{hax_ep}'",
         f"MOTR_PROCESS_FID='{fid}'",
         f'MOTR_S3SERVER_PORT={s3_port}',
     ]
     target = s3_conf_dir + self.sysconf_dir + f's3server-{fid}'
     self._write_file(target, '\n'.join(lines) + '\n')
예제 #11
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to a Motr entrypoint request through the hax.c FFI layer.

        Queries Consul for the leader session, principal RM node, confd
        list and RM fid, computes the confd quorum and passes everything
        to self._ffi.entrypoint_reply().  If any Consul query fails, an
        empty reply with EAGAIN is sent so the requester retries later.

        :param message: the entrypoint request received from Motr land.
        :raises RuntimeError: if none of the confds runs on the principal
            RM node.
        """
        reply_context = message.reply_context
        req_id = message.req_id
        remote_rpc_endpoint = message.remote_rpc_endpoint
        process_fid = message.process_fid

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(remote_rpc_endpoint,
                                                 str(process_fid)))
        sess = principal_rm = confds = None
        try:
            util = self.consul_util
            sess = util.get_leader_session_no_wait()
            principal_rm = util.get_session_node(sess)
            confds = util.get_confd_list()
            rm_fid = util.get_rm_fid()
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code.')
            # Empty reply: zero confds, no RM endpoint, EAGAIN return code.
            self._ffi.entrypoint_reply(reply_context, req_id.to_c(), EAGAIN, 0,
                                       make_array(FidStruct, []),
                                       make_array(c.c_char_p, []), 0,
                                       Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        # Simple majority of the confd count.
        rc_quorum = int(len(confds) / 2 + 1)

        # The RM endpoint is the address of the confd that runs on the
        # principal RM node.
        rm_eps = None
        for svc in confds:
            if svc.node == principal_rm:
                rm_eps = svc.address
                break
        if not rm_eps:
            raise RuntimeError('No RM node found in Consul')

        # Marshal the Python values into C-compatible arrays and strings.
        confd_fids = [x.fid.to_c() for x in confds]
        confd_eps = [make_c_str(x.address) for x in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0,
                                   len(confds),
                                   make_array(FidStruct, confd_fids),
                                   make_array(c.c_char_p,
                                              confd_eps), rc_quorum,
                                   rm_fid.to_c(), make_c_str(rm_eps))
        LOG.debug('Entrypoint request has been replied to')
예제 #12
0
async def test_bq_stob_message_type_recognized(hax_client, planner, herald,
                                               consul_util, mocker):
    """A bq message of type STOB_IOQ_ERROR must yield a FAILED HAState
    for the conf_sdev fid mentioned in its payload."""
    def fake_get(key, allow_null=False):
        # Accept the optional allow_null argument that the real kv_get
        # takes (keeps this stub consistent with the sibling bq test's
        # fake_get and avoids a TypeError if kv_get is called with it).
        ret = {'bq-delivered/192.168.0.28': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    # InboxFilter will try to read epoch - let's mock KV operations
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {
        'message_type': 'STOB_IOQ_ERROR',
        'payload': {
            'fid': '0x1:0x2',
            'conf_sdev': '0x1:0x4'
        }
    }
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()

    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x1, 0x4), status=ServiceHealth.FAILED)]))
예제 #13
0
 def drive_to_sdev_fid(self, drive_fid: Fid) -> Fid:
     """Map a drive fid to its sdev fid via the Consul KV tree.

     The KV stores entries like
     m0conf/sites/.../ctrls/.../drives/<drive_fid>:
     {"sdev": "0x6400000000000001:0x2c", "state": "M0_NC_UNKNOWN"},
     so we look up the drive's key under m0conf/sites, read the 'sdev'
     value and rebuild the sdev fid from its key part.  Returns
     Fid(0, 0) when the drive is not found.
     """
     pattern = re.compile(f'^m0conf\\/.*\\/drives/{drive_fid}$')
     for item in self.kv.kv_get('m0conf/sites', recurse=True):
         if not pattern.match(item['Key']):
             continue
         sdev_value = json.loads(item['Value'])['sdev']
         return create_sdev_fid(Fid.parse(sdev_value).key)
     return Fid(0, 0)
예제 #14
0
 def sdev_to_drive_fid(self, sdev_fid: Fid):
     """Map an sdev fid back to its drive fid via the Consul KV tree.

     The KV stores entries like
     m0conf/sites/.../ctrls/.../drives/<drive_fid>:
     {"sdev": "0x6400000000000001:0x2c", "state": "M0_NC_UNKNOWN"},
     so we scan m0conf/sites for the entry whose 'sdev' value matches
     and rebuild the drive fid from the key.  Returns Fid(0, 0) when
     no matching drive is found.
     """
     target = f'{sdev_fid}'
     for item in self.kv.kv_get('m0conf/sites', recurse=True):
         key = item['Key']
         if '/drives/' not in key:
             continue
         if json.loads(item['Value'])['sdev'] != target:
             continue
         # Using constant index 10 for the drive fid.
         # Fix this by changing the Consul schema to have
         # mapping of sdev fid to drive fid direct mapping.
         drive_fidk = Fid.parse(key.split('/')[10]).key
         return create_drive_fid(drive_fidk)
     return Fid(0, 0)
예제 #15
0
def mk_fid(obj_t: ObjT, key: int) -> Fid:
    """Build a Fid whose container is the given conf object type."""
    container = obj_t.value
    return Fid(container, key)
예제 #16
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to a Motr entrypoint request through the hax.c FFI layer.

        Waits for the Consul leader session (unless hax is stopping),
        collects the confd list, principal RM and quorum, and passes
        them to self._ffi.entrypoint_reply().  On any failure an empty
        reply is sent after a 1-second delay; the return code is EAGAIN
        unless the m0d processes are themselves stopping (then 0).
        """
        reply_context = message.reply_context
        req_id = message.req_id
        remote_rpc_endpoint = message.remote_rpc_endpoint
        process_fid = message.process_fid
        e_rc = EAGAIN

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(remote_rpc_endpoint,
                                                 str(process_fid)))
        sess = principal_rm = confds = None
        try:
            util = self.consul_util
            # When stopping, there's a possibility that hax may receive
            # an entrypoint request from motr land. In order to unblock
            # motr land, reply with entrypoint request with no confds
            # and RM endpoints as the processes might have already
            # stopped.
            rc_quorum = 0
            rm_fid = Fid(0, 0)
            if self.is_stopping:
                confds = []
            else:
                sess = util.get_leader_session()
                principal_rm = util.get_session_node(sess)
                confds = util.get_confd_list()

            # Hax may receive entrypoint requests multiple times during its
            # lifetime. Hax starts motr rconfc to invoke spiel commands. Motr
            # rconfc establishes connection with principal RM, in case of
            # principal RM failure, rconfc invalidates its confc and again
            # requests entrypoint in a hope that there will be another confd
            # and principal RM elected so that rconfc can resume its
            # functionality. During shutdown, when each motr process stops,
            # including confds, hax broadcasts M0_NC_FAILED event for every
            # STOPPED or FAILED motr process. Motr rconfc, on receiving the
            # failed events for confds, re-requests entrypoint information
            # and this goes on in a loop. In order to break this loop,
            # the entrypoint reply must only report alive confds and rm
            # endpoints. While doing this we need to handle the bootstrapping
            # case, so we wait until bootstrapping is done, that is, all the
            # motr services are up; we check the confd status and exclude
            # corresponding confd from the entrypoint reply.

            # EOS-25726: It seems that the confds were reported as started
            # and they failed later. This could be due to a Motr issue
            # EOS-25695.
            # In such a case, when processes start out of order, a wrong
            # quorum value is reported that leads to further issues in Motr
            # process startup. Thus commenting this for now. Need to verify
            # if this affects hax shutdown.
            # active_confds = []
            # if self.spiel_ready:
            #     for confd in confds:
            #         if not util.is_confd_failed(confd.fid):
            #             active_confds.append(confd)
            #     confds = active_confds

            if confds:
                rm_fid = util.get_rm_fid()
                # Simple majority of the confd count.
                rc_quorum = int(len(confds) / 2 + 1)
            rm_eps = None
            for svc in confds:
                if svc.node == principal_rm:
                    rm_eps = svc.address
                    break
            if confds and (not self.is_stopping) and (not rm_eps):
                # No confd runs on the principal RM node.  If m0ds are
                # stopping this is expected, so reply with rc 0 instead
                # of EAGAIN.
                if util.m0ds_stopping():
                    e_rc = 0
                raise RuntimeError('No RM node found in Consul')
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code, with a 1'
                          ' second delay.')
            # If replied EAGAIN, motr immediately sends a subsequent entrypoint
            # request and it is observed that several entrypoint requests are
            # received by hare in a second. This floods Hare, as an
            # intermediate solution, Hare dropped the requests in case of an
            # error preparing the same. But, motr does not send any subsequent
            # entrypoint requests as expected after a timeout. As per the
            # discussion, it is agreed upon to have a temporary fix in Hare.
            # https://jts.seagate.com/browse/EOS-27068 motr ticket is created
            # to track the same.
            sleep(1)
            self._ffi.entrypoint_reply(reply_context, req_id.to_c(), e_rc, 0,
                                       make_array(FidStruct, []),
                                       make_array(c.c_char_p, []), 0,
                                       Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        # Marshal the Python values into C-compatible arrays and strings.
        confd_fids = [x.fid.to_c() for x in confds]
        confd_eps = [make_c_str(x.address) for x in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0,
                                   len(confds),
                                   make_array(FidStruct, confd_fids),
                                   make_array(c.c_char_p,
                                              confd_eps), rc_quorum,
                                   rm_fid.to_c(), make_c_str(rm_eps))
        LOG.debug('Entrypoint request has been replied to')
예제 #17
0
def process_event():
    """Stub ProcessEvent fixture built from an all-zero ConfHaProcess."""
    conf_process = ConfHaProcess(chp_event=0,
                                 chp_type=0,
                                 chp_pid=0,
                                 fid=Fid(0, 0))
    return ProcessEvent(conf_process)
예제 #18
0
    def test_process_failure(self):
        """An ios process FAILED event must cascade: drive, process, node,
        enclosure and controller states are updated in Consul KV and the
        failures are broadcast to hax.
        """
        consul_util = ConsulUtil()
        consul_cache = InvocationCache()
        ffi = Mock(spec=['init_motr_api'])
        motr = Motr(ffi, None, None, consul_util)

        # Setup for the test: notification of a process failure
        # - failure here is an ios service and a disk
        # - dummy Consul reports all processes on the node are failed
        # - expect the node, enclosure, controller, drive,
        #   process, and service to all be marked as failed
        #
        # Static names and fids for the setup are given here.
        node_name = 'testnode'

        hax_fid = Fid(0x7200000000000001, 0x6)
        site_fid = Fid(0x5300000000000001, 0x1)
        rack_fid = Fid(0x6100000000000001, 0x2)
        node_fid = Fid(0x6e00000000000001, 0x3)
        encl_fid = Fid(0x6500000000000001, 0x4)
        ctrl_fid = Fid(0x6300000000000001, 0x5)
        process_fid = Fid(0x7200000000000001, 0x15)
        service_fid = Fid(0x7300000000000001, 0xe)
        service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
        drive_fid = Fid(0x6b00000000000001, 0x11)
        ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
            site_fid, rack_fid, encl_fid, ctrl_fid)
        ctrl_state = '{"state": "M0_NC_FAILED"}'

        # Set mock return values for the necessary Consul calls
        motr._is_mkfs = Mock(return_value=False)
        consul_util.get_hax_fid = Mock(return_value=hax_fid)
        consul_util.is_proc_client = Mock(return_value=False)
        consul_util.get_services_by_parent_process = Mock(
            return_value=[service_fid_typed])
        consul_util.get_disks_by_parent_process = Mock(
            return_value=[drive_fid])
        consul_util.get_process_node = Mock(return_value=node_name)
        consul_util.get_node_name_by_fid = Mock(return_value=node_name)
        consul_util.get_node_fid = Mock(return_value=node_fid)
        consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
        consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

        # These failure indications are here to trigger specific code paths for
        # node failure. Additional tests can cover different scenarios (e.g.
        # drive failure but node still up), which will set different results
        # for these calls.
        consul_util.all_io_services_failed = Mock(return_value=True)
        consul_util.get_sdev_state = Mock(
            return_value=HaNoteStruct.M0_NC_FAILED)
        consul_util.get_ctrl_state = Mock(
            return_value=m0HaObjState.M0_NC_FAILED)
        consul_util.get_ctrl_state_updates = Mock(
            return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

        # We'll use these mocks to check that expected updates are happening.
        consul_util.update_drive_state = Mock()
        consul_util.set_process_state = Mock()
        consul_util.set_node_state = Mock()
        consul_util.set_encl_state = Mock()
        motr._ha_broadcast = Mock()
        motr._write_updates = Mock()

        # Send the mock event.
        motr.broadcast_ha_states(
            [HAState(fid=process_fid, status=ObjHealth.FAILED)],
            notify_devices=True,
            broadcast_hax_only=False,
            kv_cache=consul_cache)

        # ConsulUtil is responsible for the actual KV updates, just check
        # here that the appropriate util function is called for each
        # component.
        consul_util.update_drive_state.assert_called_with([drive_fid],
                                                          ObjHealth.OFFLINE,
                                                          device_event=False)
        consul_util.set_process_state.assert_called_with(
            process_fid, ObjHealth.FAILED)
        consul_util.set_node_state.assert_called_with(node_fid,
                                                      ObjHealth.FAILED)
        consul_util.set_encl_state.assert_called_with(encl_fid,
                                                      ObjHealth.FAILED,
                                                      kv_cache=consul_cache)
        # This KV update is batched, so the check looks different.
        motr._write_updates.assert_any_call(
            [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

        # Check hax broadcast. We should see states updated to FAILED.
        broadcast_list = motr._ha_broadcast.call_args[0][0]
        self.assertTrue(_has_failed_note(broadcast_list, node_fid))
        self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, process_fid))
        self.assertTrue(_has_failed_note(broadcast_list, service_fid))
        self.assertTrue(_has_failed_note(broadcast_list, drive_fid))
예제 #19
0
def test_first_entrypoint_request_broadcasts_fail_first(
        mocker, planner, motr, consumer, consul_util):
    """The very first entrypoint request must cause an M0_NC_FAILED
    broadcast, and the entrypoint reply must be sent only after that
    broadcast.
    """
    def new_kv(key: str, val: str):
        # Shape of a single Consul KV response entry.
        return {
            'Key': key,
            'CreateIndex': 1793,
            'ModifyIndex': 1793,
            'LockIndex': 0,
            'Flags': 0,
            'Value': val,
            'Session': ''
        }

    # Stub for consul_util.kv.kv_get: serves the minimal KV subtree the
    # entrypoint path reads; any other key is an unexpected call.
    def my_get(key: str, recurse: bool = False):
        if key == 'm0conf/nodes' and recurse:
            return [
                new_kv(k, v) for k, v in [(
                    'm0conf/nodes/cmu/processes/6/services/ha',
                    '15'), (
                        'm0conf/nodes/cmu/processes/6/services/rm', '16'
                    ), ('m0conf/nodes/localhost/processes/7/services/rms',
                        '17')]
            ]
        elif key == 'm0conf/nodes/localhost/processes/7/services/rms':
            return new_kv('m0conf/nodes/localhost/processes/7/services/rms',
                          '17')
        raise RuntimeError(f'Unexpected call: key={key}, recurse={recurse}')

    # Stub for the Consul services catalog: one confd and one hax service.
    def my_services(name):
        if name == 'confd':
            return [{
                'Node': 'localhost',
                'Service': 'confd',
                'ServiceID': '7',
                'Address': '192.168.0.28',
                'ServiceAddress': '192.168.0.28',
                'ServicePort': '12345'
            }]
        if name == 'hax':
            return [{
                'Node': 'localhost',
                'Service': 'hax',
                'ServiceID': '45',
                'Address': '192.168.0.28',
                'ServiceAddress': '192.168.0.28',
                'ServicePort': '667'
            }]
        raise RuntimeError(f'Unexpected call: name={name}')

    mocker.patch.object(consul_util.kv, 'kv_get', side_effect=my_get)
    mocker.patch.object(consul_util,
                        'get_leader_session_no_wait',
                        return_value='localhost')
    mocker.patch.object(consul_util,
                        'get_session_node',
                        return_value='localhost')

    mocker.patch.object(consul_util.catalog,
                        'get_services',
                        side_effect=my_services)

    msg = FirstEntrypointRequest(reply_context='stub',
                                 req_id=Uint128(0, 1),
                                 remote_rpc_endpoint='ep',
                                 process_fid=Fid(1, 6),
                                 git_rev='deadbeef',
                                 pid=123,
                                 is_first_request=True)
    run_in_consumer(mocker, msg, planner, consumer, motr)
    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               ha_note_failed())).run(traces), 'M0_NC_FAILED not broadcast'
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               ha_note_failed())).and_then(
        tr_method('entrypoint_reply')).run(traces), \
        'entrypoint_reply should go after M0_NC_FAILED ' \
        'is broadcast'
예제 #20
0
    def send_entrypoint_request_reply(self, message: EntrypointRequest):
        """Reply to a Motr entrypoint request through the hax.c FFI layer.

        Collects (without waiting for a leader session) the confd list,
        principal RM and quorum from Consul, filtering out failed confds
        once spiel is ready, and passes them to
        self._ffi.entrypoint_reply().  On any failure an empty reply is
        sent; the return code is EAGAIN unless the m0d processes are
        themselves stopping (then 0).
        """
        reply_context = message.reply_context
        req_id = message.req_id
        remote_rpc_endpoint = message.remote_rpc_endpoint
        process_fid = message.process_fid
        e_rc = EAGAIN

        LOG.debug('Processing entrypoint request from remote endpoint'
                  " '{}', process fid {}".format(remote_rpc_endpoint,
                                                 str(process_fid)))
        sess = principal_rm = confds = None
        try:
            util = self.consul_util
            # When stopping, there's a possibility that hax may receive
            # an entrypoint request from motr land. In order to unblock
            # motr land, reply with entrypoint request with no confds
            # and RM endpoints as the processes might have already
            # stopped.
            rc_quorum = 0
            rm_fid = Fid(0, 0)
            if self.is_stopping:
                confds = []
            else:
                sess = util.get_leader_session_no_wait()
                principal_rm = util.get_session_node(sess)
                confds = util.get_confd_list()

            # Hax may receive entrypoint requests multiple times during its
            # lifetime. Hax starts motr rconfc to invoke spiel commands. Motr
            # rconfc establishes connection with principal RM, in case of
            # principal RM failure, rconfc invalidates its confc and again
            # requests entrypoint in a hope that there will be another confd
            # and principal RM elected so that rconfc can resume its
            # functionality. During shutdown, when each motr process stops,
            # including confds, hax broadcasts M0_NC_FAILED event for every
            # STOPPED or FAILED motr process. Motr rconfc, on receiving the
            # failed events for confds, re-requests entrypoint information
            # and this goes on in a loop. In order to break this loop,
            # the entrypoint reply must only report alive confds and rm
            # endpoints. While doing this we need to handle the bootstrapping
            # case, so we wait until bootstrapping is done, that is, all the
            # motr services are up; we check the confd status and exclude
            # corresponding confd from the entrypoint reply.
            active_confds = []
            if self.spiel_ready:
                for confd in confds:
                    if not util.is_confd_failed(confd.fid):
                        active_confds.append(confd)
                confds = active_confds

            if confds:
                rm_fid = util.get_rm_fid()
                # Simple majority of the confd count.
                rc_quorum = int(len(confds) / 2 + 1)
            rm_eps = None
            for svc in confds:
                if svc.node == principal_rm:
                    rm_eps = svc.address
                    break
            if confds and (not self.is_stopping) and (not rm_eps):
                # No confd runs on the principal RM node.  If m0ds are
                # stopping this is expected, so reply with rc 0 instead
                # of EAGAIN.
                if util.m0ds_stopping():
                    e_rc = 0
                raise RuntimeError('No RM node found in Consul')
        except Exception:
            LOG.exception('Failed to get the data from Consul.'
                          ' Replying with EAGAIN error code.')
            # Empty reply: zero confds, no RM endpoint.
            self._ffi.entrypoint_reply(reply_context, req_id.to_c(), e_rc, 0,
                                       make_array(FidStruct, []),
                                       make_array(c.c_char_p, []), 0,
                                       Fid(0, 0).to_c(), None)
            LOG.debug('Reply sent')
            return

        # Marshal the Python values into C-compatible arrays and strings.
        confd_fids = [x.fid.to_c() for x in confds]
        confd_eps = [make_c_str(x.address) for x in confds]

        LOG.debug('Passing the entrypoint reply to hax.c layer')
        self._ffi.entrypoint_reply(reply_context, req_id.to_c(), 0,
                                   len(confds),
                                   make_array(FidStruct, confd_fids),
                                   make_array(c.c_char_p,
                                              confd_eps), rc_quorum,
                                   rm_fid.to_c(), make_c_str(rm_eps))
        LOG.debug('Entrypoint request has been replied to')
예제 #21
0
def test_broadcast_io_service_failure(mocker, planner, motr, consumer,
                                      consul_util):
    """Check that an IO-service failure is broadcast but a node failure is not.

    The mocked Consul KV describes one node (0x6e00000000000001:0x3) with
    two m0_server processes: 0x7200000000000001:0x15 (failed, hosting a
    failed 'ios' service) and 0x7200000000000001:0xa (online).  Broadcasting
    a FAILED HA state for process 0x15 must therefore produce an
    'ha_broadcast' trace that mentions the failed IO service, while the
    node fid must NOT be reported as failed, because the sibling process
    0xa is still alive.
    """
    def new_kv(key: str, val: str):
        # Minimal stub of a single Consul KV entry in the shape that
        # consul_util.kv.kv_get returns it.
        return {
            'Key': key,
            'CreateIndex': 1793,
            'ModifyIndex': 1793,
            'LockIndex': 0,
            'Flags': 0,
            'Value': val,
            'Session': ''
        }

    def my_get(key: str, recurse: bool = False, **kwds):
        # Stub for consul_util.kv.kv_get: serves canned m0conf data for
        # exactly the keys this test expects to be queried.  Any other
        # lookup raises, so unexpected KV access fails the test loudly.
        if key == 'm0conf/nodes' and recurse:
            # Whole node tree: process 0x15 with its failed 'ios'
            # service, plus RM service entries under other node aliases.
            # NOTE(review): here 0x15 is "offline" while the per-node
            # queries below say "failed" -- looks inconsistent; confirm
            # whether the code under test cares about this value.
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "offline"
                  })), ('m0conf/nodes/cmu/processes/6/services/rm', '16'),
                 ('m0conf/nodes/localhost/processes/7/services/rms', '17'),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services'
                  '/0x7300000000000001:0x17',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001'
                  ':0xc', json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif key == 'm0conf/sites' and recurse:
            # Hardware tree: two controllers under one enclosure, both
            # with unknown state.
            return [
                new_kv(k, v) for k, v in
                [('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x5',
                  json.dumps({"state": "M0_NC_UNKNOWN"})),
                 ('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x6',
                  json.dumps({"state": "M0_NC_UNKNOWN"}))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3'
              '/processes' and recurse):
            # All processes of the node: 0x15 failed (with its failed
            # 'ios' service) and the 'ios' service of process 0xa failed.
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17', json.dumps({
                      "name": "ios",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001:0xc',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15' and recurse):
            # Recursive query for process 0x15 only: the process and its
            # single 'ios' service, both failed.
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17', json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15'):
            # Non-recursive lookup of process 0x15: failed.
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0x15',
                json.dumps({
                    "name": "m0_server",
                    "state": "failed"
                }))
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0xa'):
            # Process 0xa is online -- this is what must keep the node
            # from being reported as failed.
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0xa',
                json.dumps({
                    "name": "m0_server",
                    "state": "online"
                }))
        elif key == 'm0conf/nodes/localhost/processes/7/services/rms':
            return new_kv('m0conf/nodes/localhost/processes/7/services/rms',
                          '17')
        elif key == 'localhost/processes/0x7200000000000001:0x15':
            # NOTE(review): the returned Key omits the ':0x15' suffix of
            # the requested key -- looks unintentional, though apparently
            # harmless for this test; confirm against the consumer code.
            return new_kv(
                'localhost/processes/0x7200000000000001',
                json.dumps({
                    'type': 'M0_CONF_HA_PROCESS_OTHER',
                    'state': 'Unknown'
                }))
        elif key == 'm0conf/nodes/0x6e00000000000001:0x3':
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3',
                json.dumps({
                    "name": "localhost",
                    "state": "M0_NC_UNKNOWN"
                }))
        raise RuntimeError(f'Unexpected call: key={key}, recurse={recurse}')

    # Route all KV reads through the stub above; KV writes are no-ops.
    mocker.patch.object(consul_util.kv, 'kv_get', side_effect=my_get)
    # TODO: Handle 'kv_put' by updating kv returned by 'kv_get'
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    # Short-circuit fid lookups for the node and its enclosure.
    mocker.patch.object(consul_util,
                        'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util,
                        'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))

    # Trigger the broadcast for the failed process 0x15.
    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15), status=ServiceHealth.FAILED)
    ])

    # Inspect the recorded FFI calls: the IO-service failure must have been
    # broadcast, while the node fid must not have been reported as failed.
    traces = motr._ffi.traces
    assert AssertionPlan(tr_and(
        tr_method('ha_broadcast'),
        io_service_failed())).exists(traces), 'IOservice failure not broadcast'
    assert AssertionPlan(tr_and(tr_method('ha_broadcast'),
                                node_fid_failed())).not_exists(traces), \
        'Node failure should not be broadcast'