Exemplo n.º 1
0
def main() -> None:
    """Wire up the HaX components and run the server in the calling thread.

    The sequence is: configure logging, create the ``WorkPlanner`` and the
    ``ConsulUtil`` helper, call ``_remove_stale_session``, fetch the Motr
    fids, start four queue-consumer threads, start the Motr API, launch the
    rconfc-starter and stats-updater threads plus the thread built by
    ``create_ha_thread``, and finally hand control to ``ServerRunner.run``.

    Any ``Exception`` raised from ``motr.start`` onwards is logged and not
    re-raised; ``motr.fini()`` is called in every case before returning.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python threads created by
    # the _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    util: ConsulUtil = ConsulUtil()
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that the consumer threads must be started before we invoke
    # motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply to it.

    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i) for i in range(4)
    ]

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)

        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] server.run() below is a blocking call. It will work until the
        # program is terminated by signal

        server = ServerRunner(planner, herald, consul_util=util)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, rconfc_starter, event_poller
        ])
    except Exception:
        # Top-level boundary: the failure is logged with its traceback and
        # not re-raised, so control falls through to the cleanup below.
        LOG.exception('Exiting due to an exception')
    finally:
        # Runs on both the normal and the exceptional path.
        motr.fini()
Exemplo n.º 2
0
def main() -> None:
    """Wire up the HaX components and run the server in the calling thread.

    The sequence is: configure logging, create a bounded ``Queue``
    (``maxsize=8``) and the ``ConsulUtil`` helper, fetch the Motr fids,
    start one queue-consumer thread, start the Motr API, launch the
    stats-updater thread and finally call ``run_server``.

    Any ``Exception`` raised from ``motr.start`` onwards is logged and not
    re-raised; ``motr.close()`` is called in every case before returning.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    _setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python thread created by
    # the _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    q: Queue = Queue(maxsize=8)

    util: ConsulUtil = ConsulUtil()
    cfg = _get_motr_fids(util)

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}, RM fid = {cfg.rm_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(queue=q,
                rm_fid=cfg.rm_fid,
                ffi=ffi,
                herald=herald,
                consul_util=util)

    # Note that the consumer thread must be started before we invoke
    # motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply to it.
    consumer = _run_qconsumer_thread(q, motr, herald)

    try:
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   rm_service=cfg.rm_fid)
        LOG.info('Motr API has been started')
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        run_server(q,
                   herald,
                   consul_util=util,
                   threads_to_wait=[consumer, stats_updater])
    except Exception:
        # Top-level boundary: the failure is logged with its traceback and
        # not re-raised, so control falls through to the cleanup below.
        LOG.exception('Exiting due to an exception')
    finally:
        # Runs on both the normal and the exceptional path.
        motr.close()
Exemplo n.º 3
0
def main() -> None:
    """Wire up the HaX components and run the server in the calling thread.

    The sequence is: configure logging, locale and dependency injection,
    obtain the ``HaxGlobalState`` instance, create the ``WorkPlanner``,
    install a SIGINT handler, create the ``ConsulUtil`` helper, call
    ``_remove_stale_session``, fetch the Motr fids and the HTTP port,
    start 32 queue-consumer threads, start the Motr API, launch the
    rconfc-starter, stats-updater and bc-updater threads plus the thread
    built by ``create_ha_thread``, and finally hand control to
    ``ServerRunner.run``.

    Any ``Exception`` raised from ``motr.start`` onwards is logged and not
    re-raised; ``motr.fini()`` is called in every case before returning.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)

    state = inject.instance(HaxGlobalState)

    # [KN] The elements in the work planner will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by the Python threads created by
    # the _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    # SIGINT handler: flags the global state as stopping and shuts the
    # work planner down. Both positional arguments are unused.
    def handle_signal(sig, frame):
        state.set_stopping()
        planner.shutdown()

    # This is necessary to allow hax to exit early if Consul is not available
    # (otherwise _get_motr_fids() may be retrying forever even if the hax
    # process needs to shutdown).
    signal.signal(signal.SIGINT, handle_signal)

    util: ConsulUtil = ConsulUtil()
    # Avoid removing session on hax start as this will happen
    # on every node, thus leader election will keep re-triggering
    # until the final hax node starts, this will delay further
    # bootstrapping operations.
    # NOTE(review): the comment above argues against removing the session
    # at start-up, yet the call below is still made - confirm which of the
    # two reflects the intended behaviour.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    hax_http_port = util.get_hax_http_port()
    util.init_motr_processes_status()

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that the consumer threads must be started before we invoke
    # motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply to it.

    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i)
        for i in range(32)
    ]

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)

        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] server.run() below is a blocking call. It will work until the
        # program is terminated by signal

        server = ServerRunner(planner,
                              herald,
                              consul_util=util,
                              hax_state=state)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, bc_updater, rconfc_starter,
            event_poller
        ],
                   port=hax_http_port)
    except Exception:
        # Top-level boundary: the failure is logged with its traceback and
        # not re-raised, so control falls through to the cleanup below.
        LOG.exception('Exiting due to an exception')
    finally:
        # Runs on both the normal and the exceptional path.
        motr.fini()
Exemplo n.º 4
0
def motr(mocker, ffi, planner, herald, consul_util) -> Motr:
    """Return a ``Motr`` built from the supplied collaborators.

    ``mocker`` is accepted but not used in the body.
    NOTE(review): presumably a pytest fixture whose decorator sits outside
    this excerpt - confirm.
    """
    return Motr(ffi, planner, herald, consul_util)
Exemplo n.º 5
0
    def test_process_failure(self):
        """Broadcast a process failure and check the resulting state updates.

        Scenario: the failed process owns one ios service and one drive, and
        the stubbed Consul helper reports every IO service on the node as
        failed. The test expects the node, enclosure, controller, drive,
        process and service to all end up marked as failed.
        """
        consul_util = ConsulUtil()
        consul_cache = InvocationCache()
        motr = Motr(Mock(spec=['init_motr_api']), None, None, consul_util)

        # Static names and fids used throughout the scenario.
        node_name = 'testnode'

        hax_fid = Fid(0x7200000000000001, 0x6)
        site_fid = Fid(0x5300000000000001, 0x1)
        rack_fid = Fid(0x6100000000000001, 0x2)
        node_fid = Fid(0x6e00000000000001, 0x3)
        encl_fid = Fid(0x6500000000000001, 0x4)
        ctrl_fid = Fid(0x6300000000000001, 0x5)
        process_fid = Fid(0x7200000000000001, 0x15)
        service_fid = Fid(0x7300000000000001, 0xe)
        service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
        drive_fid = Fid(0x6b00000000000001, 0x11)
        ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
            site_fid,
            rack_fid,
            encl_fid,
            ctrl_fid,
        )
        ctrl_state = '{"state": "M0_NC_FAILED"}'

        # Canned answers for the Consul lookups the code under test performs.
        #
        # The last four entries are failure indications that steer execution
        # into the node-failure code paths. Additional tests can cover other
        # scenarios (e.g. drive failure while the node is still up) by
        # returning different results from these calls.
        canned_answers = {
            'get_hax_fid': hax_fid,
            'is_proc_client': False,
            'get_services_by_parent_process': [service_fid_typed],
            'get_disks_by_parent_process': [drive_fid],
            'get_process_node': node_name,
            'get_node_name_by_fid': node_name,
            'get_node_fid': node_fid,
            'get_node_encl_fid': encl_fid,
            'get_node_ctrl_fids': [ctrl_fid],
            'all_io_services_failed': True,
            'get_sdev_state': HaNoteStruct.M0_NC_FAILED,
            'get_ctrl_state': m0HaObjState.M0_NC_FAILED,
            'get_ctrl_state_updates': [
                PutKV(key=ctrl_path, value=ctrl_state)
            ],
        }
        for method_name, answer in canned_answers.items():
            setattr(consul_util, method_name, Mock(return_value=answer))

        # Plain spies: used only to verify that the expected updates happen.
        for method_name in ('update_drive_state', 'set_process_state',
                            'set_node_state', 'set_encl_state'):
            setattr(consul_util, method_name, Mock())

        motr._is_mkfs = Mock(return_value=False)
        motr._ha_broadcast = Mock()
        motr._write_updates = Mock()

        # Fire the mock event.
        failure_event = HAState(fid=process_fid, status=ObjHealth.FAILED)
        motr.broadcast_ha_states([failure_event],
                                 notify_devices=True,
                                 broadcast_hax_only=False,
                                 kv_cache=consul_cache)

        # The actual KV writes belong to ConsulUtil; here we only confirm
        # that the matching helper was invoked for each component.
        consul_util.update_drive_state.assert_called_with(
            [drive_fid], ObjHealth.OFFLINE, device_event=False)
        consul_util.set_process_state.assert_called_with(
            process_fid, ObjHealth.FAILED)
        consul_util.set_node_state.assert_called_with(
            node_fid, ObjHealth.FAILED)
        consul_util.set_encl_state.assert_called_with(
            encl_fid, ObjHealth.FAILED, kv_cache=consul_cache)
        # The controller update goes through the batched writer, hence the
        # different shape of this check.
        expected_batch = [PutKV(key=ctrl_path, value=ctrl_state)]
        motr._write_updates.assert_any_call(expected_batch, consul_cache)

        # Inspect the hax broadcast: every affected object must carry a
        # FAILED note.
        broadcast_list = motr._ha_broadcast.call_args[0][0]
        expected_failed = (node_fid, encl_fid, ctrl_fid, process_fid,
                           service_fid, drive_fid)
        for failed_fid in expected_failed:
            self.assertTrue(_has_failed_note(broadcast_list, failed_fid))