Пример #1
0
    def prepare_svc(self, svc_id: str, name: str):
        """
        Build the Service description, including its health check.

        Parameters:
            svc_id : identifier used to look up the service endpoint; for
                     'confd', 'ios' and 's3service' it is also converted
                     to int to form the process fid.
            name : service name; selects the extra check-service arguments.

        Raises RuntimeError when the provider returns no endpoint.
        """
        endpoint = self.provider.get_service_ep(svc_id)
        if not endpoint:
            raise RuntimeError('Cannot get service endpoint.')
        address = self.get_service_addr(endpoint)
        svc_port = self.get_service_port(endpoint)

        # Base command of the check; service-specific flags are appended.
        check_args = ['/opt/seagate/cortx/hare/libexec/check-service']
        if name == 'hax':
            check_args.append('--hax')
        elif name in ('confd', 'ios', 's3service'):
            # These services are addressed by their process fid.
            proc_fid = str(Fid(ObjT.PROCESS.value, int(svc_id)))
            if name == 's3service':
                check_args += ['--svc', 's3server@' + proc_fid]
            else:
                check_args += ['--fid', proc_fid]

        check: Dict[str, Any] = {
            'args': check_args,
            'interval': '1s',
            'status': 'warning',
        }
        return Service(id=svc_id,
                       name=name,
                       address=address,
                       port=svc_port,
                       checks=[check])
Пример #2
0
def test_nonmkfs_process_stop_causes_drive_offline(mocker, motr, consul_util):
    """
    Broadcasting FAILED for a process must update the drive state in
    Consul KV and produce an 'ha_broadcast' trace that contains a drive.

    KV reads are stubbed with create_stub_get('M0_CONF_HA_PROCESS_M0D').
    """
    patch = mocker.patch.object
    kv = consul_util.kv

    # Stub every Consul interaction the broadcast path goes through.
    patch(kv, 'kv_get', side_effect=create_stub_get('M0_CONF_HA_PROCESS_M0D'))
    patch(kv, 'kv_put', return_value=0)
    patch(consul_util, 'update_drive_state')
    patch(consul_util,
          'get_node_fid',
          return_value=Fid(0x6e00000000000001, 0x3))
    patch(consul_util,
          'get_node_encl_fid',
          return_value=Fid(0x6500000000000001, 0x4))

    failed_process = HAState(fid=Fid(0x7200000000000001, 0x15),
                             status=ServiceHealth.FAILED)
    motr.broadcast_ha_states([failed_process])

    assert consul_util.update_drive_state.called, \
        'The drive state should be updated in Consul KV'

    drive_broadcast = AssertionPlan(
        tr_and(tr_method('ha_broadcast'), contains_drive()))
    assert drive_broadcast.exists(motr._ffi.traces), \
        'DRIVE must be broadcast when non-MKFS process is stopped'
Пример #3
0
 def node_to_drive_fid(self, node_name: str, drive: str):
     """
     Returns the drive fid for the given drive path on the given node.

     Parameters:
         node_name : node name used in the
                     'm0conf/nodes/<node_name>/processes' KV prefix.
         drive : drive path to look for, e.g. /dev/vdf.

     Raises:
         RuntimeError : no key containing '/ios' exists under the node's
                        processes, so the ioservice fid cannot be built.

     If no sdev entry matches the drive path, the 0:0 sdev fid is passed
     to sdev_to_drive_fid unchanged.
     """
     sdev_fid: Fid = Fid(0, 0)
     # We extract the sdev fid as follows,
     # e.g. node_name=ssc-vm-c-0553.colo.seagate.com
     #      drive=/dev/vdf
     # 1. m0conf/nodes/ssc-vm-c-0553.colo.seagate.com/processes/41/
     #     services/ios:43
     # 2. Create ioservice motr fid
     # 3. fetch consul kv for ios fid,
     #    m0conf/nodes/0x6e00000000000001:0x20/processes/
     #    0x7200000000000001:0x29/services/0x7300000000000001:0x2b/
     #    sdevs/0x6400000000000001:0x2c:
     #    {"path": "/dev/vdf", "state": "M0_NC_UNKNOWN"}
     # 4. find drive name in the json value and extract sdev fid from the
     #    key 0x6400000000000001:0x2c
     # 5. Create sdev fid from sdev fid key.
     process_items = self.kv.kv_get(f'm0conf/nodes/{node_name}/processes',
                                    recurse=True)
     ios_values = [x['Value'] for x in process_items if '/ios' in x['Key']]
     if not ios_values:
         # Without this guard the missing service would only show up as
         # an UnboundLocalError a few lines below.
         raise RuntimeError(
             f'No ios service found in Consul KV for node {node_name}')
     # When several keys match, the last one wins.
     ios_fid = create_service_fid(int(ios_values[-1]))
     sdev_items = self.kv.kv_get('m0conf/nodes', recurse=True)
     for x in sdev_items:
         if f'/{ios_fid}/' not in x['Key']:
             continue
         if json.loads(x['Value'])['path'] != drive:
             continue
         # Using constant index 8 for the sdev fid.
         # Fix this by changing the Consul schema to have
         # mapping of drive path to sdev direct mapping.
         sdev_fid_item = x['Key'].split('/')[8]
         sdev_fidk = Fid.parse(sdev_fid_item).key
         sdev_fid = create_sdev_fid(sdev_fidk)
         break
     return self.sdev_to_drive_fid(sdev_fid)
Пример #4
0
async def test_bq_stob_message_deserialized(hax_client, planner, herald,
                                            consul_util, mocker):
    """
    A STOB_IOQ_ERROR message posted to '/watcher/bq' must yield exactly
    one planner command carrying the FAILED state for the conf_sdev fid.
    """
    def fake_get(key, allow_null):
        # Only this one key is known; any other key raises KeyError.
        return {'bq-delivered/localhost': ''}[key]

    mocker.patch.object(herald, 'wait_for_any')

    # Make sure that a real StobIoqError can be used as the payload
    # of a STOB_IOQ_ERROR bq message.
    error = StobIoqError(fid=Fid(5, 6),
                         conf_sdev=Fid(0x103, 0x204),
                         stob_id=StobId(Fid(12, 13), Fid(14, 15)),
                         fd=42,
                         opcode=4,
                         rc=2,
                         offset=0xBF,
                         size=100,
                         bshift=4)
    error_as_dict = simplejson.loads(dump_json(error))

    # InboxFilter will try to read epoch - let's mock KV operations
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)

    event_json = simplejson.dumps({
        'message_type': 'STOB_IOQ_ERROR',
        'payload': error_as_dict
    })
    # The 'Value' field carries the event as base64 text.
    encoded_event = b64encode(event_json.encode()).decode()

    kv_entry = {
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': encoded_event,
        'Session': ''
    }
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=[kv_entry])
    # Validate now
    if resp.status != 200:
        # Log the response body to make the failure easier to diagnose.
        logging.getLogger('hax').debug('Response: %s', await resp.json())
    assert resp.status == 200
    expected_state = HAState(fid=Fid(0x103, 0x204), status=ObjHealth.FAILED)
    planner.add_command.assert_called_once_with(
        ContainsStates([expected_state]))
Пример #5
0
 def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
           rm_service: Fid):
     """
     Start m0_halon_interface through the FFI layer.

     Parameters:
         rpc_endpoint : endpoint string, passed on as a C string.
         process : process fid; also remembered in self._process_fid.
         ha_service : fid of the HA service.
         rm_service : fid of the RM service.

     Raises RuntimeError when the FFI call returns a non-zero code.
     """
     LOG.debug('Starting m0_halon_interface')
     self._process_fid = process
     result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                              process.to_c(), ha_service.to_c(),
                              rm_service.to_c())
     if result:
         LOG.error(
             'Cannot start Motr API. m0_halon_interface::start'
             ' returned non-zero code (%s)', result)
         # Adjacent literals are concatenated verbatim, so the separating
         # space must be part of the first literal.
         raise RuntimeError('Cannot start m0_halon_interface. '
                            'Please check Motr logs for more details.')
Пример #6
0
    def get_node_encl_fid(self, node: str) -> Optional[Fid]:
        """
        Look up the fid of the enclosure that refers to the given node.

        Parameters:
            node : hostname of the node.

        Returns None when the node fid cannot be resolved or when no
        enclosure entry names that node.
        """
        # Sample KV entry,
        # {
        #    "key": "m0conf/sites/0x5300000000000001:0x1/
        #            racks/0x6100000000000001:0x2/encls/
        #            0x6500000000000001:0x4",
        #    "value": "{\"node\": \"0x6e00000000000001:0x3\",
        #               \"state\": \"M0_NC_UNKNOWN\"}"
        # },
        node_fid = self.get_node_fid(node)
        if not node_fid:
            return None
        site_items = self.kv.kv_get('m0conf/sites', recurse=True)
        # The only capture group is the enclosure fid (last key segment).
        pattern = re.compile('^m0conf\\/.*\\/racks\\/.*\\/encls\\/([^/]+)$')
        for item in site_items:
            found = pattern.match(item['Key'])
            if found is None:
                continue
            value = json.loads(item['Value'])
            if 'node' in value and value['node'] == str(node_fid):
                return Fid.parse(found.group(1))
        return None
Пример #7
0
    def get_node_ctrl_fid(self, node: str) -> Optional[Fid]:
        """
        Look up the fid of the first controller found under the node's
        enclosure.

        Parameters:
            node : hostname of the node.

        Returns None when the enclosure cannot be resolved or it has no
        controller key.
        """
        # Sample KV entry,
        # {
        #    "key": "m0conf/sites/0x5300000000000001:0x1/
        #            racks/0x6100000000000001:0x2/encls/
        #            0x6500000000000001:0x4/ctrls/0x6300000000000001:0x5",
        # },
        encl_fid = self.get_node_encl_fid(node)
        if not encl_fid:
            return None
        site_items = self.kv.kv_get('m0conf/sites', recurse=True)
        # The only capture group is the controller fid (last key segment).
        pattern = re.compile(
            f'^m0conf\\/.*\\/racks\\/.*\\/encls\\/{encl_fid}\\/ctrls\\/'
            '([^/]+)$')
        for item in site_items:
            found = pattern.match(item['Key'])
            if found:
                # The first matching key wins.
                return Fid.parse(found.group(1))
        return None
Пример #8
0
 def get_disks_by_parent_process(self,
                                 process_fid: Fid,
                                 svc_fid: Fid) -> List[Fid]:
     """
     Collect the drive fids found under the given process and service.

     Parameters:
         process_fid : fid of the enclosing Motr process.
         svc_fid : fid of the service within that process.

     Returns one drive fid per matching key, in the order the keys are
     returned by the KV store.
     """
     kv_items = self.kv.kv_get('m0conf/nodes', recurse=True)
     # The pattern selects the Consul KV keys located below the given
     # service of the given process.
     #
     # Note: process_fid is assumed to identify the process uniquely
     # within the whole cluster, which is why the hostname part of the
     # key is not constrained.
     #
     # A matching key starts with, for example:
     #   m0conf/nodes/0x6e00000000000001:0x3b/processes/
     #       0x7200000000000001:0x44/services/0x7300000000000001:0x46
     pattern = re.compile(
         f'^m0conf\\/.*\\/processes\\/{process_fid}\\/services\\/'
         f'{svc_fid}\\/(.+)$')

     def to_drive_fid(key: str) -> Fid:
         # The sdev fid is taken from the key segment at constant index 8.
         sdev_key = Fid.parse(key.split('/')[8]).key
         return self.sdev_to_drive_fid(create_sdev_fid(sdev_key))

     # The generator keeps the traversal single-pass and lazy.
     keys = (item['Key'] for item in kv_items)
     return [to_drive_fid(key) for key in keys if pattern.match(key)]
Пример #9
0
    def _generate_sub_disks(self,
                            note: HaNoteStruct,
                            services: List[FidWithType],
                            cns: ConsulUtil,
                            kv_cache=None) -> List[HaNoteStruct]:
        """
        Produce HA notes for the drives enclosed by the process in `note`.

        Parameters:
            note : HA note of the process; M0_NC_ONLINE maps to
                   ObjHealth.OK, any other state to ObjHealth.OFFLINE.
            services : services of the process whose drives are collected.
            cns : Consul helper used to find drives and access their state.
            kv_cache : not used by this method.

        Returns one note per collected drive, carrying the drive state
        read back from Consul KV.
        """
        note_state = note.no_state
        proc_fid = Fid.from_struct(note.no_id)

        if note_state == HaNoteStruct.M0_NC_ONLINE:
            health = ObjHealth.OK
        else:
            health = ObjHealth.OFFLINE

        # We don't touch the devices if the process is MKFS and it is
        # not online (see EOS-24124).
        mkfs_down = self._is_mkfs(proc_fid) and health != ObjHealth.OK

        drives = []
        if not mkfs_down:
            for svc in services:
                drives.extend(
                    cns.get_disks_by_parent_process(proc_fid, svc.fid))
            if drives:
                # XXX: Need to check the current state of the device,
                # transition to ONLINE only in case of an explicit request
                # or iff the prior state of the device is UNKNOWN/OFFLINE.
                cns.update_drive_state(drives, health, device_event=False)
        LOG.debug('proc fid=%s encloses %d disks as follows: %s', proc_fid,
                  len(drives), drives)
        # Report the state stored in Consul KV for each drive; the state
        # is fetched before the fid is converted.
        return [
            HaNoteStruct(no_state=cns.get_sdev_state(ObjT.DRIVE, drive.key),
                         no_id=drive.to_c()) for drive in drives
        ]
Пример #10
0
 def fn():
     """
     Turn process status items into HA states and hand them to the planner.

     `data` and `planner` are taken from the enclosing scope. Each item
     carries a base64-encoded JSON document in 'Value' and the process
     fid as the second '/'-separated segment of 'Key'. Only non-MKFS
     processes in the STARTED or STOPPED state are reported; every such
     item triggers a broadcast of all the states accumulated so far.
     """
     proc_state_to_objhealth = {
         'M0_CONF_HA_PROCESS_STARTING': ObjHealth.OFFLINE,
         'M0_CONF_HA_PROCESS_STARTED': ObjHealth.OK,
         'M0_CONF_HA_PROCESS_STOPPING': ObjHealth.OFFLINE,
         'M0_CONF_HA_PROCESS_STOPPED': ObjHealth.OFFLINE
     }
     reported_states = ('M0_CONF_HA_PROCESS_STARTED',
                        'M0_CONF_HA_PROCESS_STOPPED')
     ha_states: List[HAState] = []
     LOG.debug('process status: %s', data)
     for item in data:
         raw_value = base64.b64decode(item['Value'])
         proc_status = json.loads(raw_value.decode('utf-8'))
         fid_segment = item['Key'].split('/')[1]
         LOG.debug('process update item key %s item val: %s',
                   fid_segment, proc_status)
         proc_fid = Fid.parse(fid_segment)
         proc_state = proc_status['state']
         proc_type = proc_status['type']
         if (proc_type == 'M0_CONF_HA_PROCESS_M0MKFS'
                 or proc_state not in reported_states):
             continue
         health = proc_state_to_objhealth[proc_state]
         ha_states.append(HAState(fid=proc_fid, status=health))
         planner.add_command(
             BroadcastHAStates(states=ha_states, reply_to=None))
Пример #11
0
 def get_pver_status(self, pver_fid: Fid) -> PverInfo:
     """
     Fetch the status of the given pool version through the FFI layer.

     Parameters:
         pver_fid : fid of the pool version.

     Raises BytecountException when the fetched status is falsy.
     """
     pver_info: PverInfo = self._ffi.pver_status_fetch(self._ha_ctx,
                                                       pver_fid.to_c())
     if pver_info:
         LOG.debug('Pver status for pver %s: %s', pver_fid, pver_info.state)
         return pver_info
     raise BytecountException('Pool version status unavailable')
Пример #12
0
    def is_node_failed(self, proc_note: HaNoteStruct, kv_cache=None):
        """
        Report whether all IO services failed on the node that hosts the
        process described by `proc_note`.

        Parameters:
            proc_note : HA note whose no_id must be a process fid.
            kv_cache : forwarded to the Consul lookups.
        """
        proc_fid = Fid.from_struct(proc_note.no_id)
        assert ObjT.PROCESS.value == proc_fid.container

        host_node = self.consul_util.get_process_node(proc_fid,
                                                      kv_cache=kv_cache)
        all_failed = self.consul_util.all_io_services_failed(
            host_node, kv_cache=kv_cache)
        return all_failed
Пример #13
0
async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    """
    Posting a service health report to '/' must enqueue exactly one
    BroadcastHAStates command for the process with fid key 12, carrying
    the health that corresponds to the reported check status.
    """
    node_info = {
        'Node': 'localhost',
        'Address': '10.1.10.12',
    }
    service_info = {
        'ID': '12',
        'Service': 'ios',
        'Tags': [],
        'Port': 8000,
    }
    check_info = {
        'Node': '12',
        'CheckID': 'service:ios',
        'Name': "Service 'ios' check",
        'Status': status,
        'Notes': '',
        'Output': '',
        'ServiceID': '12',
        'ServiceName': 'ios',
    }
    report = [{
        'Node': node_info,
        'Service': service_info,
        'Checks': [check_info],
    }]

    resp = await hax_client.post('/', json=report)

    assert resp.status == 200
    assert planner.add_command.called
    expected_state = HAState(fid=Fid(0x7200000000000001, 12), status=health)
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(states=[expected_state], reply_to=None))
Пример #14
0
    def notify_node_status_by_process(
            self, proc_note: HaNoteStruct) -> List[HaNoteStruct]:
        """
        Derive node-level HA notes from a process-level HA note.

        Parameters:
            proc_note : HA note whose no_id must be a process fid and whose
                        no_state (an int) is converted to ServiceHealth.

        Returns the node note followed by the notes of the devices
        enclosed by that node.
        """
        health = ServiceHealth.from_ha_note_state(proc_note.no_state)
        proc_fid = Fid.from_struct(proc_note.no_id)
        assert ObjT.PROCESS.value == proc_fid.container
        LOG.debug('Notifying node status for process_fid=%s state=%s',
                  proc_fid, health)

        cns = self.consul_util
        node = cns.get_process_node(proc_fid)

        if health == ServiceHealth.OK:
            # A node can have multiple controllers and can be online
            # with a single controller running online. A process 'OK'
            # updates only the process state, so the corresponding
            # controller state has to be updated here.
            ctrl_fid = cns.get_ioservice_ctrl_fid(proc_fid)
            if ctrl_fid:
                cns.set_ctrl_state(ctrl_fid, health)

        node_fid = cns.get_node_fid(node)
        notes = self.add_node_state_by_fid(node_fid, health)
        notes.extend(
            self.add_enclosing_devices_by_node(node_fid, health, node=node))
        return notes
Пример #15
0
    def ha_nvec_set_process(self, event: HaNvecSetEvent) -> None:
        """
        Process an HaNvecSetEvent and broadcast the relevant HA states.

        REPAIRED and ONLINE notes are broadcast as received. A REPAIR
        note is broadcast as FAILED and a REBALANCE note as REPAIRED.
        Notes in any other state are only logged.
        """
        LOG.debug('Processing HaNvecSetEvent (nvec size = %s)',
                  len(event.nvec))
        self.consul_util.get_all_nodes()

        passthrough = {
            HaNoteStruct.M0_NC_REPAIRED, HaNoteStruct.M0_NC_ONLINE
        }
        # In case of failed repair, roll back to failed state.
        # In case of failed rebalance, roll back to repaired state.
        rollback = {
            HaNoteStruct.M0_NC_REPAIR: HaNoteStruct.M0_NC_FAILED,
            HaNoteStruct.M0_NC_REBALANCE: HaNoteStruct.M0_NC_REPAIRED,
        }

        received: List[HAState] = []
        to_broadcast: List[HAState] = []
        for entry in event.nvec:
            note = entry.note
            fid = Fid.from_struct(note.no_id)
            health = ObjHealth.from_ha_note_state(note.no_state)
            received.append(HAState(fid, health))
            if note.no_state in passthrough:
                to_broadcast.append(HAState(fid, health))
            elif note.no_state in rollback:
                rolled_back = ObjHealth.from_ha_note_state(
                    rollback[note.no_state])
                to_broadcast.append(HAState(fid, rolled_back))

        LOG.debug('got ha_states %s', received)
        if to_broadcast:
            self.broadcast_ha_states(to_broadcast)
Пример #16
0
 def pause_repair(self, pool_fid: Fid):
     """
     Pause repair of the given pool through the FFI layer.

     Parameters:
         pool_fid : fid of the pool.

     Raises RepairRebalanceException when the FFI call returns non-zero.
     """
     LOG.debug('Pausing repair for pool %s', pool_fid)
     rc: int = self._ffi.pause_repair(self._ha_ctx, pool_fid.to_c())
     if not rc:
         LOG.debug('Repairing paused for pool %s', pool_fid)
         return
     raise RepairRebalanceException(
         'Failed to send SPIEL request "sns_repair_pause", please'
         ' check Motr logs for more details.')
Пример #17
0
 def get_proc_bytecount(self, proc_fid: Fid) -> ByteCountStats:
     """
     Fetch the byte count statistics of a process through the FFI layer.

     Parameters:
         proc_fid : fid of the process.

     Raises BytecountException when the fetched statistics are falsy.
     """
     stats: ByteCountStats = self._ffi.proc_bytecount_fetch(
         self._ha_ctx, proc_fid.to_c())
     if stats:
         LOG.debug('Bytecount status for proc fid: %s, stats =%s',
                   str(stats.proc_fid), stats.pvers)
         return stats
     raise BytecountException('Bytecount stats unavailable')
Пример #18
0
 def resume_rebalance(self, pool_fid: Fid):
     """
     Resume rebalance of the given pool through the FFI layer.

     Parameters:
         pool_fid : fid of the pool.

     Raises RepairRebalanceException when the FFI call returns non-zero.
     """
     LOG.debug('Resuming rebalance for pool %s', pool_fid)
     result: int = self._ffi.resume_rebalance(self._ha_ctx, pool_fid.to_c())
     if result:
         # The separating space must be part of a literal, otherwise
         # the message reads '...",please check...'.
         raise RepairRebalanceException(
             'Failed to send SPIEL request "sns_rebalance_resume",'
             ' please check Motr logs for more details.')
     LOG.debug('Rebalancing resumed for pool %s', pool_fid)
Пример #19
0
 def stop_rebalance(self, pool_fid: Fid):
     """
     Stop rebalance of the given pool through the FFI layer.

     Parameters:
         pool_fid : fid of the pool.

     Raises RepairRebalanceException when the FFI call returns non-zero.
     """
     logging.debug('Stopping rebalance for pool %s', pool_fid)
     result: int = self._ffi.stop_rebalance(self._ha_ctx, pool_fid.to_c())
     if result:
         # The separating space must be part of a literal, otherwise
         # the message reads '...",please check...'.
         raise RepairRebalanceException(
             'Failed to send SPIEL request "sns_rebalance_stop",'
             ' please check Motr logs for more details.')
     logging.debug('Rebalancing stopped for pool %s', pool_fid)
Пример #20
0
 def start_repair(self, pool_fid: Fid):
     """
     Start repair of the given pool through the FFI layer.

     Parameters:
         pool_fid : fid of the pool.

     Raises RepairRebalanceException when the FFI call returns non-zero.
     """
     logging.debug('Initiating repair for pool %s', pool_fid)
     rc: int = self._ffi.start_repair(self._ha_ctx, pool_fid.to_c())
     if not rc:
         logging.debug('Repairing started for pool %s', pool_fid)
         return
     raise RepairRebalanceException(
         'Failed to send SPIEL request "sns_repair_start", please'
         ' check Motr logs for more details.')
Пример #21
0
 def get_service_process_fid(self, svc_fid: Fid) -> Fid:
     """
     Returns the fid of the process that encloses the given service.

     Parameters:
         svc_fid : fid of the service; its container must be the
                   SERVICE object type.

     Exactly one KV key is expected to describe the service; the
     process fid is the key segment at index 4.
     """
     assert ObjT.SERVICE.value == svc_fid.container
     kv_items = self.kv.kv_get('m0conf/nodes', recurse=True)
     service_keys = self.get_service_keys(kv_items, svc_fid.key)
     assert len(service_keys) == 1
     segments = service_keys[0].split('/')
     return Fid.parse(segments[4])
Пример #22
0
def entrypoint():
    """Build an EntrypointRequest populated with fixed sample values."""
    sample_fields = dict(
        reply_context='test',
        req_id=Uint128(1, 2),
        remote_rpc_endpoint='endpoint',
        process_fid=Fid(1, 2),
        git_rev='HEAD',
        pid=123,
        is_first_request=False,
    )
    return EntrypointRequest(**sample_fields)
Пример #23
0
 def get_rebalance_status(self, pool_fid: Fid) -> List[ReprebStatus]:
     """
     Fetch the rebalance status of the given pool through the FFI layer.

     Parameters:
         pool_fid : fid of the pool.

     Raises RepairRebalanceException when the FFI call returns None.
     """
     LOG.debug('Fetching rebalance status for pool %s', pool_fid)
     reb_status: List[ReprebStatus] = self._ffi.rebalance_status(
         self._ha_ctx, pool_fid.to_c())
     if reb_status is not None:
         LOG.debug('rebalance status for pool %s: %s', pool_fid, reb_status)
         return reb_status
     raise RepairRebalanceException('rebalance status unavailable')
Пример #24
0
    def add_node_state_by_fid(self, node_fid: Fid,
                              new_state: ObjHealth) -> List[HaNoteStruct]:
        """
        Store the node state in Consul KV and build the matching HA note.

        Parameters:
            node_fid : fid of the node.
            new_state : state to store and to report.

        Returns a single-element list with the note for the node.
        """
        # Persist first; the note is built only after the update.
        self.consul_util.set_node_state(node_fid, new_state)

        node_note = HaNoteStruct(no_state=new_state.to_ha_note_status(),
                                 no_id=node_fid.to_c())
        return [node_note]
Пример #25
0
 def generate_confd(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
     """
     Write the 'm0d-<fid>' configuration file of a confd service.

     Parameters:
         svc_id : service id; converted to int to form the process fid.
         hax_ep : value written as MOTR_HA_EP.
         motr_conf_dir : directory prefix of the target file, also used
                         to build the MOTR_CONF_XC path.
     """
     proc_fid = Fid(ObjT.PROCESS.value, int(svc_id))
     endpoint = self.provider.get_service_ep(svc_id)
     target = motr_conf_dir + self.sysconf_dir + f'm0d-{proc_fid}'
     settings = [
         f"MOTR_M0D_EP='{endpoint}'\n",
         f"MOTR_HA_EP='{hax_ep}'\n",
         f"MOTR_PROCESS_FID='{proc_fid}'\n",
         f"MOTR_CONF_XC='{motr_conf_dir}/confd.xc'\n",
     ]
     self._write_file(target, ''.join(settings))
Пример #26
0
 def get_svc_fids(self, svc_name: str) -> List[str]:
     """
     Returns the process fids, as strings, of the given kind of service.

     Parameters:
         svc_name : one of 'hax', 'confd', 'ios' or 's3'; any other
                    value raises KeyError.
     """
     all_ids = self.get_all_svc_ids()
     ids_by_service = {
         'hax': all_ids['HAX_ID'],
         'confd': all_ids['CONFD_IDs'],
         'ios': all_ids['IOS_IDs'],
         's3': all_ids['S3_IDs']
     }
     fids: List[str] = []
     for raw_id in ids_by_service[svc_name]:
         fids.append(str(Fid(ObjT.PROCESS.value, int(raw_id))))
     return fids
Пример #27
0
 def generate_ios(self, svc_id: str, hax_ep: str, motr_conf_dir: str):
     """
     Write the 'm0d-<fid>' configuration file of an ios service.

     Parameters:
         svc_id : service id; converted to int to form the process fid.
         hax_ep : value written as MOTR_HA_EP.
         motr_conf_dir : directory prefix of the target file.

     MOTR_BE_SEG_PATH is written only when the provider returns truthy
     meta data for the service.
     """
     proc_fid = Fid(ObjT.PROCESS.value, int(svc_id))
     endpoint = self.provider.get_service_ep(svc_id)
     be_seg_path = self.provider.get_ios_meta_data(svc_id)
     target = motr_conf_dir + self.sysconf_dir + f'm0d-{proc_fid}'
     settings = [
         f"MOTR_M0D_EP='{endpoint}'\n",
         f"MOTR_HA_EP='{hax_ep}'\n",
         f"MOTR_PROCESS_FID='{proc_fid}'\n",
     ]
     if be_seg_path:
         settings.append(f'MOTR_BE_SEG_PATH={be_seg_path}\n')
     self._write_file(target, ''.join(settings))
Пример #28
0
    def start(self, rpc_endpoint: str, process: Fid, ha_service: Fid,
              profile: Profile):
        """
        Start m0_halon_interface through the FFI layer.

        Parameters:
            rpc_endpoint : endpoint string, passed on as a C string.
            process : process fid; also remembered in self._process_fid.
            ha_service : fid of the HA service.
            profile : remembered in self._profile.

        The RM fid is obtained from Consul through a helper wrapped in
        repeat_if_fails().

        Raises RuntimeError when the FFI call returns a non-zero code.
        """
        LOG.debug('Starting m0_halon_interface')
        self._process_fid = process
        self._profile = profile

        @repeat_if_fails()
        def _get_rm_fid() -> Fid:
            return self.consul_util.get_rm_fid()

        rm_fid = _get_rm_fid()
        result = self._ffi.start(self._ha_ctx, make_c_str(rpc_endpoint),
                                 process.to_c(), ha_service.to_c(),
                                 rm_fid.to_c())
        if result:
            LOG.error(
                'Cannot start Motr API. m0_halon_interface::start'
                ' returned non-zero code (%s)', result)
            # Adjacent literals are concatenated verbatim, so the
            # separating space must be part of the first literal.
            raise RuntimeError('Cannot start m0_halon_interface. '
                               'Please check Motr logs for more details.')
Пример #29
0
 def _generate_sub_services(self, note: HaNoteStruct,
                            cns: ConsulUtil) -> List[HaNoteStruct]:
     """
     Produce HA notes for the services enclosed by the process in `note`.

     Parameters:
         note : HA note of the process; its state is copied to every
                service note.
         cns : Consul helper used to list the services of the process.
     """
     state = note.no_state
     proc_fid = Fid.from_struct(note.no_id)
     services = cns.get_services_by_parent_process(proc_fid)
     LOG.debug('Process fid=%s encloses %s services as follows: %s',
               proc_fid, len(services), services)
     svc_notes: List[HaNoteStruct] = []
     for svc in services:
         svc_notes.append(HaNoteStruct(no_id=svc.fid.to_c(), no_state=state))
     return svc_notes
Пример #30
0
    def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
        """
        Broadcast the FAILED state for the sdev named in the payload.

        Parameters:
            payload : message body; 'conf_sdev' holds the fid to parse.

        A null fid is skipped. Otherwise the call blocks until the
        planner's command handler replies with the message ids, which
        are then passed to the herald to wait on.
        """
        sdev_fid = Fid.parse(payload['conf_sdev'])
        if sdev_fid.is_null():
            LOG.debug('Fid is 0:0. Skipping the message.')
            return

        # One-slot queue through which the message ids are handed back.
        reply_queue: Queue = Queue(1)
        failed_state = HAState(sdev_fid, status=ObjHealth.FAILED)
        self.planner.add_command(
            BroadcastHAStates(states=[failed_state], reply_to=reply_queue))
        message_ids: List[MessageId] = reply_queue.get()
        self.herald.wait_for_any(HaLinkMessagePromise(message_ids))