async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    """Verify that a Consul health-watch payload POSTed to hax turns into
    exactly one BroadcastHAStates command queued to the planner."""
    # Minimal Consul health-check entry for the 'ios' service.
    check = {
        'Node': '12',
        'CheckID': 'service:ios',
        'Name': "Service 'ios' check",
        'Status': status,
        'Notes': '',
        'Output': '',
        'ServiceID': '12',
        'ServiceName': 'ios',
    }
    payload = [{
        'Node': {
            'Node': 'localhost',
            'Address': '10.1.10.12',
        },
        'Service': {
            'ID': '12',
            'Service': 'ios',
            'Tags': [],
            'Port': 8000,
        },
        'Checks': [check],
    }]
    resp = await hax_client.post('/', json=payload)
    assert resp.status == 200
    assert planner.add_command.called
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(
            states=[HAState(fid=Fid(0x7200000000000001, 12), status=health)],
            reply_to=None))
def fn():
    """Forward Consul service-health data to the planner as a single
    BroadcastHAStates command.

    Uses ``data``, ``consul_util`` and ``planner`` from the enclosing
    scope (this is a closure).
    """
    # Removed commented-out pudb remote-debugging lines (dead code).
    LOG.debug('Service health from Consul: %s', data)
    planner.add_command(
        BroadcastHAStates(states=to_ha_states(data, consul_util),
                          reply_to=None))
def fn():
    """Translate Consul KV process-status updates into HAState broadcasts.

    Only terminal states (STARTED / STOPPED) of non-mkfs processes are
    broadcast; intermediate STARTING/STOPPING updates are ignored.
    Uses ``data`` and ``planner`` from the enclosing scope (closure).
    """
    proc_state_to_objhealth = {
        'M0_CONF_HA_PROCESS_STARTING': ObjHealth.OFFLINE,
        'M0_CONF_HA_PROCESS_STARTED': ObjHealth.OK,
        'M0_CONF_HA_PROCESS_STOPPING': ObjHealth.OFFLINE,
        'M0_CONF_HA_PROCESS_STOPPED': ObjHealth.OFFLINE
    }
    # Removed commented-out pudb remote-debugging lines (dead code).
    ha_states: List[HAState] = []
    LOG.debug('process status: %s', data)
    for item in data:
        proc_val = base64.b64decode(item['Value'])
        # bytes.decode() already returns str; the redundant str() wrapper
        # was dropped.
        proc_status = json.loads(proc_val.decode('utf-8'))
        # KV key has the form '<prefix>/<process fid>'; compute the fid
        # string once instead of splitting twice.
        proc_fid_str = item['Key'].split('/')[1]
        LOG.debug('process update item key %s item val: %s',
                  proc_fid_str, proc_status)
        proc_fid = Fid.parse(proc_fid_str)
        proc_state = proc_status['state']
        proc_type = proc_status['type']
        if (proc_type != 'M0_CONF_HA_PROCESS_M0MKFS'
                and proc_state in ('M0_CONF_HA_PROCESS_STARTED',
                                   'M0_CONF_HA_PROCESS_STOPPED')):
            ha_states.append(
                HAState(fid=proc_fid,
                        status=proc_state_to_objhealth[proc_state]))
    planner.add_command(
        BroadcastHAStates(states=ha_states, reply_to=None))
async def _process(request):
    """Parse the request body as JSON and enqueue a BroadcastHAStates
    command without blocking the event loop."""
    payload = await request.json()

    def _enqueue() -> None:
        queue.put(
            BroadcastHAStates(states=to_ha_states(payload), reply_to=None))

    # queue.put is potentially a blocking call, so it is dispatched to
    # the default executor instead of being awaited inline.
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, _enqueue)
    return web.Response()
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Motr process-event callback.

    Always enqueues the raw ProcessEvent; additionally broadcasts an
    'offline' HA state for the process when the event code is 3.
    """
    logging.info('fid=%s, chp_event=%s', fid, chp_event)
    self.queue.put(
        ProcessEvent(
            ConfHaProcess(chp_event=chp_event,
                          chp_type=chp_type,
                          chp_pid=chp_pid,
                          fid=fid)))
    # NOTE(review): magic number 3 presumably corresponds to the Motr
    # 'process stopped' event — confirm against the m0 process-event enum.
    if chp_event == 3:
        self.queue.put(
            BroadcastHAStates(states=[HAState(fid=fid, status='offline')],
                              reply_to=None))
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    """Handle an IOQ stob error: broadcast the affected storage device
    as 'offline' and block until the broadcast is delivered.

    Messages with a null fid (0:0) are skipped.
    """
    fid = Fid.parse(payload['conf_sdev'])
    if fid.is_null():
        logging.debug('Fid is 0:0. Skipping the message.')
        return
    # Bounded reply queue: the broadcaster posts the message ids here.
    q: Queue = Queue(1)
    self.queue.put(
        BroadcastHAStates(states=[HAState(fid, status='offline')],
                          reply_to=q))
    ids: List[MessageId] = q.get()
    # Wait until ha_link confirms delivery of any of the messages.
    self.herald.wait_for_any(HaLinkMessagePromise(ids))
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    """Handle an IOQ stob error: broadcast the affected storage device
    as FAILED and block until the broadcast is delivered.

    Messages with a null fid (0:0) are skipped.
    """
    sdev_fid = Fid.parse(payload['conf_sdev'])
    if sdev_fid.is_null():
        LOG.debug('Fid is 0:0. Skipping the message.')
        return
    # Bounded reply queue: the broadcaster posts the message ids here.
    reply_q: Queue = Queue(1)
    failed_state = HAState(sdev_fid, status=ObjHealth.FAILED)
    self.planner.add_command(
        BroadcastHAStates(states=[failed_state], reply_to=reply_q))
    message_ids: List[MessageId] = reply_q.get()
    # Wait until ha_link confirms delivery of any of the messages.
    self.herald.wait_for_any(HaLinkMessagePromise(message_ids))
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Motr process-event callback.

    Always enqueues the raw ProcessEvent; additionally broadcasts a
    FAILED service health for the process when the event code is 3.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    self.queue.put(
        ProcessEvent(
            ConfHaProcess(chp_event=chp_event,
                          chp_type=chp_type,
                          chp_pid=chp_pid,
                          fid=fid)))
    # NOTE(review): magic number 3 presumably corresponds to the Motr
    # 'process stopped' event — confirm against the m0 process-event enum.
    if chp_event == 3:
        self.queue.put(
            BroadcastHAStates(
                states=[HAState(fid=fid, status=ServiceHealth.FAILED)],
                reply_to=None))
def handle(self, msg: Event) -> None:
    """Handle an HA node event: resolve the node id to a fid and queue a
    health-state broadcast for it.

    Events carrying an unknown node id are logged and ignored.
    """
    node_fid = self.cns.get_node_fid(msg.node_id)
    if not node_fid:
        # Logger.warn() is deprecated in favor of Logger.warning().
        LOG.warning('Unknown [node_id=%s] provided. HA event is ignored',
                    msg.node_id)
        return
    health = self._get_status_by_text(msg.event_type)
    self.planner.add_command(
        BroadcastHAStates(states=[HAState(fid=node_fid, status=health)],
                          reply_to=None))
def handle_device_state_set(self, payload: Dict[str, Any]) -> None:
    """Translate a device-state-set payload into one HAState broadcast
    and block until it has been delivered."""
    # TODO: support multiple object entries in a single payload.
    hastate: Optional[HAState] = self.to_ha_state(payload)
    if hastate is None:
        LOG.debug('No ha states to broadcast.')
        return
    LOG.debug('HA broadcast, node: %s device: %s state: %s',
              payload['node'], payload['device'], payload['state'])
    # Bounded reply queue: the broadcaster posts the message ids here.
    reply_q: Queue = Queue(1)
    self.queue.put(BroadcastHAStates(states=[hastate], reply_to=reply_q))
    delivered: List[MessageId] = reply_q.get()
    # Wait until ha_link confirms delivery of any of the messages.
    self.herald.wait_for_any(HaLinkMessagePromise(delivered))
def _update_process_status(self, q: Queue, event: ConfHaProcess) -> None:
    """Persist a process status update to Consul and, for m0d processes
    only, broadcast the corresponding service health to the cluster."""
    # A consul-related exception here is processed by repeat_if_fails:
    # this thread stays blocked until the intermittent error resolves.
    self.consul.update_process_status(event)
    svc_status = m0HaProcessEvent.event_to_svchealth(event.chp_event)
    if event.chp_type != m0HaProcessType.M0_CONF_HA_PROCESS_M0D:
        return
    # Let the other motr processes in the cluster know about this
    # process's status.
    q.put(
        BroadcastHAStates(
            states=[HAState(fid=event.fid, status=svc_status)],
            reply_to=None))
def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
    """Motr process-event callback.

    Always enqueues the raw ProcessEvent; additionally broadcasts OK
    health when an m0d process reports that it has started.
    """
    LOG.info('fid=%s, chp_event=%s', fid, chp_event)
    raw_event = ProcessEvent(
        ConfHaProcess(chp_event=chp_event,
                      chp_type=chp_type,
                      chp_pid=chp_pid,
                      fid=fid))
    self.queue.put(raw_event)
    m0d_started = (
        chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D
        and chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED)
    if m0d_started:
        self.queue.put(
            BroadcastHAStates(
                states=[HAState(fid=fid, status=ServiceHealth.OK)],
                reply_to=None))
def handle(self, msg: Event) -> None:
    """Handle an HA event keyed by machine id: resolve it to a node fid
    and queue a health-state broadcast for it.

    Events for unknown machine ids or node names are logged and ignored.
    """
    node_name = self.cns.get_node_name_by_machineid(msg.resource_id,
                                                    allow_null=True)
    if not node_name:
        # Logger.warn() is deprecated in favor of Logger.warning().
        LOG.warning('Unknown [resource_id=%s] provided. HA event is ignored',
                    msg.resource_id)
        return
    node_fid = self.cns.get_node_fid(node_name)
    if not node_fid:
        LOG.warning('Unknown [node_name=%s] provided. HA event is ignored',
                    node_name)
        return
    health = self._get_status_by_text(msg.event_type)
    self.planner.add_command(
        BroadcastHAStates(states=[HAState(fid=node_fid, status=health)],
                          reply_to=None))
def update_process_failure(self, q: Queue,
                           ha_states: List[HAState]) -> List[HAState]:
    """Re-validate process HA states against Consul before reporting.

    Non-process states pass through unchanged.  For each process state,
    the real-time status is fetched from Consul and substituted for the
    reported one; a confirmed FAILED status is also persisted to the
    m0d status KV.
    """
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.FAILED:
                self.consul.service_health_to_m0dstatus_update(
                    state.fid, current_status)
            elif current_status == ServiceHealth.UNKNOWN:
                # We got service status as UNKNOWN, that means hax was
                # notified about process failure but hax couldn't
                # confirm if the process is in failed state or have
                # failed and restarted. So, we will not loose the
                # event and try again to confirm the real time
                # process status by enqueing a broadcast event
                # specific to this process.
                # It is expected that the process status gets
                # eventually confirmed as either failed or passing (OK).
                # This situation typically arises due to delay
                # in receiving failure notification during which the
                # corresponding process might be restarting or have
                # already restarted. Thus it is important to confirm
                # the real time status of the process before
                # broadcasting failure.
                current_status = ServiceHealth.OK
                q.put(
                    BroadcastHAStates(states=[
                        HAState(fid=state.fid, status=ServiceHealth.FAILED)
                    ],
                                      reply_to=None))
            new_ha_states.append(
                HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states
def _broadcast(self, state_list: List[HAState]) -> None:
    """Enqueue a BroadcastHAStates command for the given states.

    An empty list is ignored silently.
    """
    if state_list:
        LOG.debug('Changes in statuses: %s', state_list)
        self.q.put(BroadcastHAStates(states=state_list, reply_to=None))
def broadcast():
    """Build a BroadcastHAStates command with no states and no reply queue."""
    return BroadcastHAStates(reply_to=None, states=[])
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    """Re-validate process HA states against Consul before reporting.

    Non-process states pass through unchanged.  For each process state
    the real-time status is fetched from Consul; duplicate notifications
    (states already recorded locally) are dropped, UNKNOWN statuses are
    re-queued for later confirmation, and confirmed statuses are
    persisted to Consul and returned.
    """
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.OK:
                # Already recorded as started locally — nothing new to
                # report.
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                    continue
            if current_status in (ServiceHealth.FAILED,
                                  ServiceHealth.STOPPED):
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                    # Consul may report failure of a process multiple
                    # times, so we don't want to send duplicate failure
                    # notifications, it may cause delay in cleanup
                    # activities.
                    continue
            if current_status == ServiceHealth.UNKNOWN:
                # We got service status as UNKNOWN, that means hax was
                # notified about process failure but hax couldn't
                # confirm if the process is in failed state or have
                # failed and restarted. So, we will not loose the
                # event and try again to confirm the real time
                # process status by enqueing a broadcast event
                # specific to this process.
                # It is expected that the process status gets
                # eventually confirmed as either failed or passing (OK).
                # This situation typically arises due to delay
                # in receiving failure notification during which the
                # corresponding process might be restarting or have
                # already restarted. Thus it is important to confirm
                # the real time status of the process before
                # broadcasting failure.
                # NOTE(review): this re-assignment is a no-op
                # (current_status is already UNKNOWN here).
                current_status = ServiceHealth.UNKNOWN
                planner.add_command(
                    BroadcastHAStates(states=[
                        HAState(fid=state.fid, status=ServiceHealth.FAILED)
                    ],
                                      reply_to=None))
            if current_status not in (ServiceHealth.UNKNOWN,
                                      ServiceHealth.OFFLINE):
                # We also need to account and report the failure of remote
                # Motr processes to this node's hax and motr processes.
                # When Consul reports a remote process failure, hax
                # confirms its current status from Consul KV and updates
                # the list of failed services and also adds it to the
                # broadcast list.
                if current_status != ServiceHealth.OK:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                else:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                self.consul.update_process_status(
                    ConfHaProcess(
                        chp_event=event,
                        chp_type=int(
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                        chp_pid=0,
                        fid=state.fid))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states