예제 #1
0
 def _update_process_status(self, p: WorkPlanner, motr: Motr,
                            event: ConfHaProcess) -> None:
     """
     Store a process event in Consul and broadcast the matching HA state.

     The event is always passed to self.consul.update_process_status().
     Only STARTED and STOPPED events are additionally translated into a
     ServiceHealth value and broadcast through `motr`.

     `p` is not used by this method.

     NOTE(review): the previous comment here stated that Consul-related
     exceptions are handled by repeat_if_fails and that this thread stays
     blocked until such an intermittent error is resolved. That wrapping
     is not visible in this method - confirm at the callee.

     Raises KeyError if a STARTED/STOPPED event carries a process type
     which is missing from the table below.
     """
     kinds = m0HaProcessType
     started = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
     stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
     # A stopped m0mkfs maps to OFFLINE while the other stopped
     # process types map to FAILED.
     health_for = {
         (kinds.M0_CONF_HA_PROCESS_M0MKFS, started): ServiceHealth.OK,
         (kinds.M0_CONF_HA_PROCESS_M0MKFS, stopped): ServiceHealth.OFFLINE,
         (kinds.M0_CONF_HA_PROCESS_M0D, started): ServiceHealth.OK,
         (kinds.M0_CONF_HA_PROCESS_M0D, stopped): ServiceHealth.FAILED,
         (kinds.M0_CONF_HA_PROCESS_OTHER, started): ServiceHealth.OK,
         (kinds.M0_CONF_HA_PROCESS_OTHER, stopped): ServiceHealth.FAILED,
     }
     self.consul.update_process_status(event)
     if event.chp_event not in (started, stopped):
         # Other event kinds are stored in Consul only.
         return
     health = health_for[(event.chp_type, event.chp_event)]
     motr.broadcast_ha_states([HAState(fid=event.fid, status=health)])
예제 #2
0
def run_in_consumer(mocker, msg: BaseMessage, planner: WorkPlanner,
                    consumer: ConsumerThread, motr: Motr) -> None:
    """
    Feed `msg` to the consumer's work loop.

    planner.get_next_command is patched so that it is driven by the
    sequence [msg, Die()]; `motr` is started with fixed test values and
    then consumer._do_work() is invoked directly in the calling thread.
    """
    scripted_commands = [msg, Die()]
    mocker.patch.object(planner,
                        'get_next_command',
                        side_effect=scripted_commands)

    spiel_profile = Profile(fid=create_profile_fid(22),
                            name='the_pool',
                            pool_names=['name1'])
    fid_120 = create_process_fid(120)
    fid_15 = create_process_fid(15)
    motr.start('endpoint', fid_120, fid_15, spiel_profile)

    consumer._do_work(planner, motr)
예제 #3
0
    def _execute(self, motr: Motr):
        """
        Body of the byte-count updater thread.

        Until self.stopped becomes true, every iteration:
          1. waits and retries if self.consul.am_i_rc() is false or
             motr.is_spiel_ready() is false;
          2. asks Consul for the 'ios' processes and their health;
          3. for every process with ObjHealth.OK fetches the byte count
             from Motr and stores it with self.consul.update_pver_bc();
          4. recalculates the per-pver byte count and stores it with
             self.consul.update_bc_for_dg_category().

        Every iteration ends with wait_for_event(), including the ones
        that find nothing to do, so that the loop never spins without a
        pause between two polls.

        HAConsistencyException and BytecountException abort the current
        iteration only. InterruptedException ends the thread silently;
        any other exception is logged and ends the thread as well.
        """
        try:
            LOG.info('byte-count updater thread has started')
            while not self.stopped:
                if (not self.consul.am_i_rc()
                        or not motr.is_spiel_ready()):
                    wait_for_event(self.event, self.interval_sec)
                    continue
                processes: List[Tuple[Fid, ObjHealth]] = \
                    self.consul.get_proc_fids_with_status(['ios'])
                if not processes:
                    # Nothing to poll yet. Pause before asking Consul
                    # again instead of retrying immediately.
                    wait_for_event(self.event, self.interval_sec)
                    continue
                try:
                    for ios, status in processes:
                        if status == ObjHealth.OK:
                            byte_count: ByteCountStats = \
                                motr.get_proc_bytecount(ios)
                            LOG.debug('Received bytecount: %s', byte_count)
                            if not byte_count:
                                continue
                            self.consul.update_pver_bc(byte_count)

                    pver_items = self._get_pver_with_pver_status(motr)
                    # With no pvers there is nothing to aggregate; fall
                    # through to the common wait below.
                    if pver_items:
                        pver_bc = self._calculate_bc_per_pver(pver_items)
                        self.consul.update_bc_for_dg_category(
                            pver_bc, pver_items)
                except HAConsistencyException:
                    LOG.exception('Failed to update Consul KV '
                                  'due to an intermittent error. The '
                                  'error is swallowed since new attempts '
                                  'will be made timely')
                except BytecountException as e:
                    LOG.exception(
                        'Failed due to %s. Aborting this iteration.'
                        ' Waiting for next attempt.', e.message)
                wait_for_event(self.event, self.interval_sec)
        except InterruptedException:
            # No op. The wait has been interrupted before the timeout
            # exceeded: the application is shutting down.
            # There are no resources that we need to dispose specially.
            pass
        except Exception:
            LOG.exception('Aborting due to an error')
        finally:
            # This is a regular exit message, not an error report:
            # LOG.exception() is meant for exception handlers only.
            LOG.debug('byte-count updater thread exited')
예제 #4
0
    def _execute(self, motr: Motr):
        """
        Body of the filesystem stats updater thread.

        The thread is adopted as a Motr thread first. Then, until
        self.stopped becomes true, every iteration:
          1. sleeps and retries if self._am_i_rc() is false or not all
             values returned by self._ioservices_running() are true;
          2. starts rconfc, reads the filesystem stats and stops rconfc
             again (also when reading the stats raises);
          3. stores non-empty stats together with the current time via
             self.consul.update_fs_stats().

        Every iteration ends with self._sleep(), including the ones that
        obtained no stats, so that rconfc is not restarted in a tight
        loop.

        HAConsistencyException from the Consul update is swallowed.
        InterruptedException ends the thread silently; any other
        exception is logged and ends the thread as well. In all cases
        ffi.shun_motr_thread() is invoked on exit.
        """
        # Looked up ahead of the `try` so that the `finally` clause can
        # always refer to it.
        ffi = motr._ffi
        try:
            LOG.info('filesystem stats updater thread has started')
            ffi.adopt_motr_thread()
            self._ensure_motr_all_started()
            while not self.stopped:
                if not self._am_i_rc():
                    self._sleep(self.interval_sec)
                    continue

                started = self._ioservices_running()
                if not all(started):
                    self._sleep(self.interval_sec)
                    continue
                result: int = motr.start_rconfc()
                if result == 0:
                    try:
                        stats = motr.get_filesystem_stats()
                    finally:
                        # rconfc must not stay started when the query
                        # raises.
                        motr.stop_rconfc()
                    if stats:
                        LOG.debug('FS stats are as follows: %s', stats)
                        now_time = datetime.datetime.now()
                        data = FsStatsWithTime(
                            stats=stats,
                            timestamp=now_time.timestamp(),
                            date=now_time.isoformat())
                        try:
                            self.consul.update_fs_stats(data)
                        except HAConsistencyException:
                            LOG.debug('Failed to update Consul KV '
                                      'due to an intermittent error. The '
                                      'error is swallowed since new attempts '
                                      'will be made timely')
                self._sleep(self.interval_sec)
        except InterruptedException:
            # No op. _sleep() has interrupted before the timeout exceeded:
            # the application is shutting down.
            # There are no resources that we need to dispose specially.
            pass
        except Exception:
            LOG.exception('Aborting due to an error')
        finally:
            LOG.debug('Releasing motr-related resources for this thread')
            ffi.shun_motr_thread()
            LOG.debug('filesystem stats updater thread exited')
예제 #5
0
 def _execute(self, motr: Motr):
     """
     Body of the rconfc starter thread.

     After self.consul.ensure_motr_all_started() returns, the loop keeps
     trying to start rconfc until either self.stopped becomes true or
     motr.spiel_ready is set. motr.spiel_ready is set to True once
     motr.start_rconfc() returns 0.

     Both "not all ioservices are running" and "start_rconfc() failed"
     are followed by a wait before the next attempt, so that a failing
     start is not retried in a tight loop.

     InterruptedException ends the thread silently; any other exception
     is logged and ends the thread as well.
     """
     retry_interval_sec = 5
     try:
         LOG.debug('rconfc starter thread has started')
         self.consul.ensure_motr_all_started(self.event)
         while (not self.stopped) and (not motr.spiel_ready):
             started = self.consul.ensure_ioservices_running()
             if not all(started):
                 wait_for_event(self.event, retry_interval_sec)
                 continue
             result: int = motr.start_rconfc()
             if result == 0:
                 motr.spiel_ready = True
             else:
                 LOG.debug('rconfc start returned %s, will retry', result)
                 wait_for_event(self.event, retry_interval_sec)
     except InterruptedException:
         # No op. The wait has been interrupted before the timeout
         # exceeded: the application is shutting down.
         # There are no resources that we need to dispose specially.
         pass
     except Exception:
         LOG.exception('Aborting due to an error')
     finally:
         LOG.debug('rconfc starter thread exited')
예제 #6
0
def main():
    """
    Entry point: wire up the hax components and run the server.

    Order of the steps: logging setup, Consul session cleanup and fid
    lookup, creation of the Motr wrapper, start of four consumer
    threads, motr.start(), start of the helper threads and finally the
    server itself. motr.fini() is invoked on exit in any case.
    """
    # Nothing may be logged before this call, otherwise the log
    # configuration will not apply.
    setup_logging()

    # [KN] The planner receives work items when
    # 1. A callback is invoked from ha_link (this happens in a motr
    #    thread which must be freed ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The items are consumed by the Python threads created by
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    consul: ConsulUtil = ConsulUtil()
    _remove_stale_session(consul)
    fids: HL_Fids = _get_motr_fids(consul)

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {fids.hax_fid}, hax endpoint = {fids.hax_ep}, '
             f'HA fid = {fids.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=consul)

    # The consumer threads must be running before motr.start(..) is
    # invoked: hax process will send an entrypoint request and somebody
    # needs to reply to it.

    # TODO make the number of threads configurable
    consumers = []
    for idx in range(4):
        consumers.append(
            _run_qconsumer_thread(planner, motr, herald, consul, idx))

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(fids.hax_ep,
                   process=fids.hax_fid,
                   ha_service=fids.ha_fid,
                   profile=fids.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=consul)

        stats_updater = _run_stats_updater_thread(motr, consul_util=consul)
        event_poller = _run_thread(create_ha_thread(planner, consul))

        server = ServerRunner(planner, herald, consul_util=consul)
        background = consumers + [stats_updater, rconfc_starter, event_poller]
        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        server.run(threads_to_wait=background)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
예제 #7
0
 def _execute(self, motr: Motr):
     """
     Body of the filesystem stats updater thread.

     Until self.stopped becomes true, every iteration:
       1. waits and retries unless self.consul.am_i_rc() is true,
          motr.is_spiel_ready() is true and all values returned by
          self.consul.ensure_ioservices_running() are true;
       2. reads the filesystem stats from Motr;
       3. stores non-empty stats together with the current time via
          self.consul.update_fs_stats().

     Every iteration ends with wait_for_event(), including the ones that
     obtained no stats, so that Motr is not polled in a tight loop.

     HAConsistencyException from the Consul update is swallowed.
     InterruptedException ends the thread silently; any other exception
     is logged and ends the thread as well.
     """
     try:
         LOG.info('filesystem stats updater thread has started')
         while not self.stopped:
             if not self.consul.am_i_rc():
                 wait_for_event(self.event, self.interval_sec)
                 continue
             if (not motr.is_spiel_ready() or (
                     not all(self.consul.ensure_ioservices_running()))):
                 wait_for_event(self.event, self.interval_sec)
                 continue
             stats = motr.get_filesystem_stats()
             if stats:
                 LOG.debug('FS stats are as follows: %s', stats)
                 now_time = datetime.datetime.now()
                 data = FsStatsWithTime(stats=stats,
                                        timestamp=now_time.timestamp(),
                                        date=now_time.isoformat())
                 try:
                     self.consul.update_fs_stats(data)
                 except HAConsistencyException:
                     LOG.debug('Failed to update Consul KV '
                               'due to an intermittent error. The '
                               'error is swallowed since new attempts '
                               'will be made timely')
             # Reached with and without stats: always pause before the
             # next poll.
             wait_for_event(self.event, self.interval_sec)
     except InterruptedException:
         # No op. The wait has been interrupted before the timeout
         # exceeded: the application is shutting down.
         # There are no resources that we need to dispose specially.
         pass
     except Exception:
         LOG.exception('Aborting due to an error')
     finally:
         LOG.debug('filesystem stats updater thread exited')
예제 #8
0
def main():
    """
    Entry point: wire up the hax components and run the server.

    Order of the steps: logging setup, fid lookup in Consul, creation
    of the Motr wrapper, start of the queue consumer thread,
    motr.start(), start of the stats updater thread and finally the
    server itself. motr.close() is invoked on exit in any case.
    """
    # Nothing may be logged before this call, otherwise the log
    # configuration will not apply.
    _setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this happens in a motr
    #    thread which must be freed ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python thread created by
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    work_queue: Queue = Queue(maxsize=8)

    consul: ConsulUtil = ConsulUtil()
    fids = _get_motr_fids(consul)

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {fids.hax_fid}, hax endpoint = {fids.hax_ep}, '
             f'HA fid = {fids.ha_fid}, RM fid = {fids.rm_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(queue=work_queue,
                rm_fid=fids.rm_fid,
                ffi=ffi,
                herald=herald,
                consul_util=consul)

    # The consumer thread must be running before motr.start(..) is
    # invoked: hax process will send an entrypoint request and somebody
    # needs to reply to it.
    consumer = _run_qconsumer_thread(work_queue, motr, herald)

    try:
        motr.start(fids.hax_ep,
                   process=fids.hax_fid,
                   ha_service=fids.ha_fid,
                   rm_service=fids.rm_fid)
        LOG.info('Motr API has been started')
        stats_updater = _run_stats_updater_thread(motr, consul_util=consul)
        background = [consumer, stats_updater]
        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        run_server(work_queue,
                   herald,
                   consul_util=consul,
                   threads_to_wait=background)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.close()
예제 #9
0
    def _get_pver_with_pver_status(
            self, motr: Motr) -> Optional[Dict[str, PverInfo]]:
        '''
        Build a map of pver fid (as found in the Consul KV key) to the
        pver status reported by Motr.

        Ex of pver state:
        PverInfo(fid=0x7600000000000001:0x3e, state=0,
        data_units=1, parity_units=0, pool_width=10, unit_size=0)

        Pver data is stored in consul kv in format
        key = ioservices/0x7200000000000001:0x20/pvers/
              0x7600000000000001:0x6/users/1
        value = {"bc": 4096, "object_cnt": 1}

        The pver fid is the 4th '/'-separated component of the key.
        Keys which are too short to carry that component are skipped,
        so that a single unexpected key does not fail the whole scan.
        Motr is queried once per distinct pver fid.

        Returns an empty dict if there are no entries under
        'ioservices/'.
        '''
        pver_items: Dict[str, PverInfo] = {}
        iosservice_items = self.consul.kv.kv_get('ioservices/', recurse=True)
        if not iosservice_items:
            return pver_items

        for item in iosservice_items:
            key_parts = item['Key'].split('/')
            # ioservices/<ios fid>/pvers/<pver fid>/...
            if len(key_parts) < 4 or not key_parts[3]:
                LOG.debug('Skipping the key without pver fid: %s',
                          item['Key'])
                continue
            p_ver = key_parts[3]
            if p_ver not in pver_items:
                pver_info: PverInfo = motr.get_pver_status(
                    Fid.parse(p_ver))
                pver_items[p_ver] = pver_info
        LOG.debug('Received pool version and status: %s', pver_items)
        return pver_items
예제 #10
0
    def _do_work(self, q: Queue, motr: Motr):
        """
        Body of the handler thread: take messages from `q` one by one and
        dispatch them by type.

        The current thread is adopted as a Motr thread on entry. The queue
        is polled without blocking; while it is empty the thread sleeps
        for 0.2 seconds between polls and checks self.is_stopped after
        every sleep. Once self.is_stopped is true, StopIteration is raised
        internally, which ends the loop and makes the thread shun Motr.

        Any other exception raised while a message is processed is logged
        and swallowed, so that the thread proceeds with the next message.
        """
        ffi = motr._ffi
        LOG.info('Handler thread has started')
        ffi.adopt_motr_thread()

        def pull_msg():
            # Non-blocking read: None means "the queue is empty now".
            try:
                return q.get(block=False)
            except Empty:
                return None

        try:
            while True:
                try:
                    LOG.debug('Waiting for the next message')

                    item = pull_msg()
                    while item is None:
                        time.sleep(0.2)
                        # The stop flag is checked only while the queue is
                        # empty.
                        if self.is_stopped:
                            raise StopIteration()
                        item = pull_msg()

                    LOG.debug('Got %s message from queue', item)
                    # FirstEntrypointRequest is tested ahead of
                    # EntrypointRequest.
                    if isinstance(item, FirstEntrypointRequest):
                        LOG.debug('first entrypoint request, broadcast FAILED')
                        ids: List[MessageId] = motr.broadcast_ha_states([
                            HAState(fid=item.process_fid,
                                    status=ServiceHealth.FAILED)
                        ])
                        LOG.debug('waiting for broadcast of %s for ep: %s',
                                  ids, item.remote_rpc_endpoint)
                        # The reply is sent only after the wait for the
                        # broadcast messages has returned.
                        self.herald.wait_for_all(HaLinkMessagePromise(ids))
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(q, item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        ha_states = self.update_process_failure(q, item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            ha_states)
                        # The result is handed back only if the sender
                        # has provided a reply channel.
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)

                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    # Let the stop request reach the outer handler.
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
        except StopIteration:
            # Regular shutdown path.
            ffi.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
예제 #11
0
def motr(mocker, ffi, planner, herald, consul_util) -> Motr:
    """
    Return a Motr instance built from the given collaborators.

    `mocker` is accepted but not used in the body.
    NOTE(review): presumably a pytest fixture - the decorator is not
    visible here.
    """
    return Motr(ffi, planner, herald, consul_util)
예제 #12
0
    def _update_process_status(self, p: WorkPlanner, motr: Motr,
                               event: ConfHaProcess) -> None:
        """
        Broadcast the HA state for a process event, store the event in
        Consul and, for STARTED events, try to emit a node 'online' event.

        STARTED maps to ObjHealth.OK and STOPPED maps to ObjHealth.FAILED
        for the M0MKFS, M0D and OTHER process types; other event kinds are
        not broadcast. The Consul update happens for every event.

        `p` is not used by this method.

        NOTE(review): the previous comment here stated that Consul-related
        exceptions are handled by repeat_if_fails and that this thread
        stays blocked until such an intermittent error is resolved. That
        wrapping is not visible in this method - confirm at the callee.

        Raises KeyError if a STARTED/STOPPED event carries a process type
        other than the three listed above.
        """
        LOG.info('Updating process status: %s', event.fid)
        started = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
        stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
        mkfs = m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS
        known_types = (mkfs,
                       m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
                       m0HaProcessType.M0_CONF_HA_PROCESS_OTHER)
        health_of_event = {started: ObjHealth.OK, stopped: ObjHealth.FAILED}
        motr_to_svc_status = {
            (proc_type, proc_event): health
            for proc_type in known_types
            for proc_event, health in health_of_event.items()
        }

        if event.chp_event in (started, stopped):
            svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
            # Motr-mkfs processes do not require updates on their peer
            # mkfs processes. Motr-mkfs is an independent and typically a
            # one-time operation. So a motr-mkfs state is not broadcast to
            # the peer motr-mkfs processes, but hax still needs to be
            # notified in order to disconnect the hax-motr halink when the
            # motr-mkfs process stops. Events that carry the fid of hax
            # itself are restricted to hax in the same way.
            broadcast_hax_only = bool(
                event.chp_type == mkfs
                or event.fid == self.consul.get_hax_fid())

            LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                      broadcast_hax_only)
            state = HAState(fid=event.fid, status=svc_status)
            motr.broadcast_ha_states([state],
                                     broadcast_hax_only=broadcast_hax_only)
        self.consul.update_process_status(event)

        # On M0_CONF_HA_PROCESS_STARTED the producer is asked to check
        # the node and send the 'online' event to the MessageBus. This is
        # a best-effort step: any failure is logged as a warning only.
        if event.chp_event == started:
            try:
                producer = get_producer(ConsulUtil())
                if not producer:
                    LOG.warning('Could not sent an event as producer'
                                ' is not available')
                else:
                    producer.check_and_send(parent_resource_type=ObjT.NODE,
                                            fid=event.fid,
                                            resource_status='online')
            except Exception as e:
                LOG.warning("Send event failed due to '%s'", e)
예제 #13
0
    def _do_work(self, planner: WorkPlanner, motr: Motr):
        """
        Body of the handler thread: take commands from `planner` one by
        one and dispatch them by type.

        Every command that has been obtained from
        planner.get_next_command() is reported back with
        planner.notify_finished(), no matter whether its processing
        succeeded. If get_next_command() itself raises, there is no
        command to report, so notify_finished() is not invoked for that
        iteration.

        A Die command ends the loop; the consumer with self.idx == 0 then
        invokes motr.stop(). Any other exception raised while a command is
        processed is logged and swallowed, so that the thread proceeds
        with the next command.
        """
        LOG.info('Handler thread has started')

        # Marks "no command has been fetched in this iteration". A
        # dedicated object is used so that any value returned by the
        # planner is still reported back.
        nothing_fetched = object()

        try:
            while True:
                item = nothing_fetched
                try:
                    LOG.debug('Waiting for the next message')

                    item = planner.get_next_command()

                    LOG.debug('Got %s message from planner', item)
                    # FirstEntrypointRequest is tested ahead of
                    # EntrypointRequest.
                    if isinstance(item, FirstEntrypointRequest):
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(planner, motr, item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, HaNvecSetEvent):
                        fn = motr.ha_nvec_set_process
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        ha_states = self.update_process_failure(
                            planner, item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            ha_states)
                        # The result is handed back only if the sender
                        # has provided a reply channel.
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)
                    elif isinstance(item, Die):
                        raise StopIteration()
                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    # Let the stop request reach the outer handler.
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
                finally:
                    # Without this check a failure in get_next_command()
                    # would either hit an unbound `item` (first iteration)
                    # or acknowledge the previous command once more.
                    if item is not nothing_fetched:
                        planner.notify_finished(item)
        except StopIteration:
            LOG.info('Consumer Stopped')
            if self.idx == 0:
                motr.stop()
        finally:
            LOG.info('Handler thread has exited')
예제 #14
0
def main():
    """Bootstrap the hax process and serve until it is terminated.

    The steps run in a fixed order: logging, locale and dependency
    injection setup; SIGINT handler installation; Consul lookups; start
    of the queue consumer threads; Motr API start; start of the auxiliary
    threads; and finally the blocking HTTP server. Any exception raised
    after the consumer threads are started is logged, and `motr.fini()`
    is invoked regardless of the outcome.
    """
    # Logging has to be configured before the very first log record is
    # emitted, otherwise the configuration will not take effect.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)

    hax_state = inject.instance(HaxGlobalState)

    # [KN] Work items land in the planner in two ways:
    # 1. A callback is invoked from ha_link (this happens in a motr
    #    thread, which must be released ASAP).
    # 2. A new HA notification arrives from Consul via HTTP.
    # [KN] The items are consumed by the Python threads created by
    # _run_qconsumer_thread.
    #
    # [KN] Note: the server itself is launched in the main thread.
    planner = WorkPlanner()

    def handle_signal(sig, frame):
        # Flag the global state first, then release the planner.
        hax_state.set_stopping()
        planner.shutdown()

    # Installed this early so that hax can exit even when Consul is not
    # available (otherwise _get_motr_fids() may keep retrying forever
    # although the process has been asked to shut down).
    signal.signal(signal.SIGINT, handle_signal)

    consul_util: ConsulUtil = ConsulUtil()
    # NOTE(review): the historical comment at this spot argued for NOT
    # removing the session on hax start (it happens on every node, so
    # leader election keeps re-triggering until the last hax node starts,
    # which delays further bootstrapping). The call is present
    # nevertheless - confirm which of the two is intended.
    _remove_stale_session(consul_util)
    cfg: HL_Fids = _get_motr_fids(consul_util)
    hax_http_port = consul_util.get_hax_http_port()
    consul_util.init_motr_processes_status()

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner,
                ffi=ffi,
                herald=herald,
                consul_util=consul_util)

    # The consumer threads must be running before motr.start(..) is
    # invoked: the hax process sends an entrypoint request and somebody
    # has to reply to it.

    # TODO make the number of threads configurable
    consumer_threads = []
    for idx in range(32):
        consumer_threads.append(
            _run_qconsumer_thread(planner, motr, herald, consul_util, idx))

    try:
        # [KN] Only the first profile is used for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')

        rconfc_starter = _run_rconfc_starter_thread(motr,
                                                    consul_util=consul_util)
        stats_updater = _run_stats_updater_thread(motr,
                                                  consul_util=consul_util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=consul_util)
        event_poller = _run_thread(create_ha_thread(planner, consul_util))

        server = ServerRunner(planner,
                              herald,
                              consul_util=consul_util,
                              hax_state=hax_state)

        # Same ordering as before: the consumers first, then the helpers.
        background_threads = consumer_threads + [
            stats_updater, bc_updater, rconfc_starter, event_poller
        ]
        # [KN] This is a blocking call. It keeps working until the
        # program is terminated by a signal.
        server.run(threads_to_wait=background_threads, port=hax_http_port)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
예제 #15
0
    def test_process_failure(self):
        # Purpose: feed a single FAILED state for a process fid into
        # Motr.broadcast_ha_states() and verify that (a) the expected
        # ConsulUtil update functions are invoked for the drive, process,
        # node and enclosure, (b) the controller KV update is passed to
        # motr._write_updates, and (c) the list handed to
        # motr._ha_broadcast passes _has_failed_note() for the node,
        # enclosure, controller, process, service and drive fids.
        consul_util = ConsulUtil()
        consul_cache = InvocationCache()
        # The FFI layer is replaced by a Mock restricted to 'init_motr_api'.
        ffi = Mock(spec=['init_motr_api'])
        # NOTE(review): the two None positionals are presumably the planner
        # and the herald - confirm against Motr.__init__.
        motr = Motr(ffi, None, None, consul_util)

        # Setup for the test: notification of a process failure
        # - failure here is an ios service and a disk
        # - dummy Consul reports all processes on the node are failed
        # - expect the node, enclosure, controller, drive,
        #   process, and service to all be marked as failed
        #
        # Static names and fids for the setup are given here.
        node_name = 'testnode'

        hax_fid = Fid(0x7200000000000001, 0x6)
        site_fid = Fid(0x5300000000000001, 0x1)
        rack_fid = Fid(0x6100000000000001, 0x2)
        node_fid = Fid(0x6e00000000000001, 0x3)
        encl_fid = Fid(0x6500000000000001, 0x4)
        ctrl_fid = Fid(0x6300000000000001, 0x5)
        process_fid = Fid(0x7200000000000001, 0x15)
        service_fid = Fid(0x7300000000000001, 0xe)
        service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
        drive_fid = Fid(0x6b00000000000001, 0x11)
        # KV key/value pair that the controller state update is expected to
        # carry; it is reused both as a mock return value and in the
        # assertion on motr._write_updates below.
        ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
            site_fid, rack_fid, encl_fid, ctrl_fid)
        ctrl_state = '{"state": "M0_NC_FAILED"}'

        # Set mock return values for the necessary Consul calls
        motr._is_mkfs = Mock(return_value=False)
        consul_util.get_hax_fid = Mock(return_value=hax_fid)
        consul_util.is_proc_client = Mock(return_value=False)
        consul_util.get_services_by_parent_process = Mock(
            return_value=[service_fid_typed])
        consul_util.get_disks_by_parent_process = Mock(
            return_value=[drive_fid])
        consul_util.get_process_node = Mock(return_value=node_name)
        consul_util.get_node_name_by_fid = Mock(return_value=node_name)
        consul_util.get_node_fid = Mock(return_value=node_fid)
        consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
        consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

        # These failure indications are here to trigger specific code paths for
        # node failure. Additional tests can cover different scenarios (e.g.
        # drive failure but node still up), which will set different results
        # for these calls.
        consul_util.all_io_services_failed = Mock(return_value=True)
        consul_util.get_sdev_state = Mock(
            return_value=HaNoteStruct.M0_NC_FAILED)
        consul_util.get_ctrl_state = Mock(
            return_value=m0HaObjState.M0_NC_FAILED)
        consul_util.get_ctrl_state_updates = Mock(
            return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

        # We'll use these mocks to check that expected updates are happening.
        consul_util.update_drive_state = Mock()
        consul_util.set_process_state = Mock()
        consul_util.set_node_state = Mock()
        consul_util.set_encl_state = Mock()
        motr._ha_broadcast = Mock()
        motr._write_updates = Mock()

        # Send the mock event.
        motr.broadcast_ha_states(
            [HAState(fid=process_fid, status=ObjHealth.FAILED)],
            notify_devices=True,
            broadcast_hax_only=False,
            kv_cache=consul_cache)

        # ConsulUtil is responsible for the actual KV updates, just check
        # here that the appropriate util function is called for each
        # component.
        # Note that the drive update is expected with ObjHealth.OFFLINE,
        # whereas the process, node and enclosure are expected with FAILED.
        consul_util.update_drive_state.assert_called_with([drive_fid],
                                                          ObjHealth.OFFLINE,
                                                          device_event=False)
        consul_util.set_process_state.assert_called_with(
            process_fid, ObjHealth.FAILED)
        consul_util.set_node_state.assert_called_with(node_fid,
                                                      ObjHealth.FAILED)
        consul_util.set_encl_state.assert_called_with(encl_fid,
                                                      ObjHealth.FAILED,
                                                      kv_cache=consul_cache)
        # This KV update is batched, so the check looks different.
        motr._write_updates.assert_any_call(
            [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

        # Check hax broadcast. We should see states updated to FAILED.
        # call_args[0][0] is the first positional argument of the most
        # recent motr._ha_broadcast invocation.
        broadcast_list = motr._ha_broadcast.call_args[0][0]
        self.assertTrue(_has_failed_note(broadcast_list, node_fid))
        self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, process_fid))
        self.assertTrue(_has_failed_note(broadcast_list, service_fid))
        self.assertTrue(_has_failed_note(broadcast_list, drive_fid))