def _generate_sub_disks(self,
                        note: HaNoteStruct,
                        services: List[FidWithType],
                        cns: ConsulUtil,
                        kv_cache=None) -> List[HaNoteStruct]:
    """Build HA notes for the disks enclosed by the process in `note`.

    Collects the disks of every given service, updates their state in
    Consul KV (unless the process is MKFS going down), and returns one
    HaNoteStruct per disk with the state read back from Consul KV.

    :param note: HA note of the parent process; `no_state` drives the
        target drive state (ONLINE -> OK, anything else -> OFFLINE).
    :param services: services enclosed by the process whose disks are
        to be reported.
    :param cns: ConsulUtil used for disk lookup and state update.
    :param kv_cache: accepted for interface symmetry.
    :return: HA notes for all collected disks.
    """
    # NOTE(review): kv_cache is accepted but not forwarded to any of the
    # Consul calls below — confirm whether the cache should be used here.
    disk_list = []
    new_state = note.no_state
    proc_fid = Fid.from_struct(note.no_id)
    state = (ObjHealth.OK
             if new_state == HaNoteStruct.M0_NC_ONLINE else ObjHealth.OFFLINE)
    is_mkfs = self._is_mkfs(proc_fid)
    # MKFS stopping is a normal lifecycle event, not a failure; in that
    # case the disks are neither collected nor marked offline.
    mkfs_down = is_mkfs and state != ObjHealth.OK
    if not mkfs_down:
        for svc in services:
            disk_list += cns.get_disks_by_parent_process(proc_fid, svc.fid)
    if disk_list:
        # XXX: Need to check the current state of the device, transition
        # to ONLINE only in case of an explicit request or iff the prior
        # state of the device is UNKNOWN/OFFLINE.
        if not mkfs_down:
            # We don't mark the devices as failed if the process is MKFS
            # and if its effective status is STOPPED (see EOS-24124).
            cns.update_drive_state(disk_list, state, device_event=False)
    LOG.debug('proc fid=%s encloses %d disks as follows: %s',
              proc_fid, len(disk_list), disk_list)
    drive_ha_notes: List[HaNoteStruct] = []
    for drive_id in disk_list:
        # Get the drive state from Consul KV (it may differ from `state`
        # since update_drive_state is conditional above).
        dstate = cns.get_sdev_state(ObjT.DRIVE, drive_id.key)
        drive_ha_notes.append(
            HaNoteStruct(no_id=drive_id.to_c(), no_state=dstate))
    return drive_ha_notes
def is_node_failed(self, proc_note: HaNoteStruct, kv_cache=None):
    """Report whether the node hosting the given process has failed.

    The node counts as failed when all of its IO services have failed.

    :param proc_note: HA note of a process (must be a PROCESS fid).
    :param kv_cache: optional Consul KV cache forwarded to the lookups.
    """
    process_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == process_fid.container
    cns = self.consul_util
    hosting_node = cns.get_process_node(process_fid, kv_cache=kv_cache)
    return cns.all_io_services_failed(hosting_node, kv_cache=kv_cache)
def ha_nvec_set_process(self, event: HaNvecSetEvent) -> None:
    """Process an HA nvec SET event and broadcast the resulting states.

    Every note is recorded as an HAState. ONLINE and REPAIRED states
    are broadcast as-is; transient SNS states are rolled back to their
    stable predecessor before broadcasting (failed repair -> FAILED,
    failed rebalance -> REPAIRED).

    :param event: the HaNvecSetEvent to process.
    """
    LOG.debug('Processing HaNvecSetEvent (nvec size = %s)', len(event.nvec))
    self.consul_util.get_all_nodes()
    # States that are broadcast unchanged.
    passthrough = {HaNoteStruct.M0_NC_REPAIRED, HaNoteStruct.M0_NC_ONLINE}
    # Transient state -> stable state to roll back to on broadcast.
    rollback = {
        # In case of failed repair, roll back to failed state.
        HaNoteStruct.M0_NC_REPAIR: HaNoteStruct.M0_NC_FAILED,
        # In case of failed rebalance, roll back to repaired state.
        HaNoteStruct.M0_NC_REBALANCE: HaNoteStruct.M0_NC_REPAIRED,
    }
    ha_states: List[HAState] = []
    bcast_ss: List[HAState] = []
    for item in event.nvec:
        note = item.note
        fid = Fid.from_struct(note.no_id)
        health = ObjHealth.from_ha_note_state(note.no_state)
        ha_states.append(HAState(fid, health))
        if note.no_state in passthrough:
            bcast_ss.append(HAState(fid, health))
        elif note.no_state in rollback:
            rolled_back = ObjHealth.from_ha_note_state(
                rollback[note.no_state])
            bcast_ss.append(HAState(fid, rolled_back))
    LOG.debug('got ha_states %s', ha_states)
    if bcast_ss:
        self.broadcast_ha_states(bcast_ss)
def notify_node_status_by_process(
        self, proc_note: HaNoteStruct) -> List[HaNoteStruct]:
    """Build HA notes for the node that hosts the given process.

    If the process reports OK, the state of the controller that backs
    the IO service is also updated in Consul KV, since a process 'OK'
    note alone only updates the process state.

    :param proc_note: HA note of a process (must be a PROCESS fid).
    :return: HA notes for the node and its enclosing devices.
    """
    # proc_note.no_state is of int type
    new_state = ServiceHealth.from_ha_note_state(proc_note.no_state)
    proc_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == proc_fid.container
    LOG.debug('Notifying node status for process_fid=%s state=%s',
              proc_fid, new_state)
    node = self.consul_util.get_process_node(proc_fid)
    if new_state == ServiceHealth.OK:
        # Node can have multiple controllers. Node can be online, with
        # a single controller running online.
        # If we receive process 'OK', only the process state is
        # updated. So, we need to update the corresponding
        # controller state.
        ctrl_fid = self.consul_util.get_ioservice_ctrl_fid(proc_fid)
        if ctrl_fid:
            self.consul_util.set_ctrl_state(ctrl_fid, new_state)
    node_fid = self.consul_util.get_node_fid(node)
    notes = self.add_node_state_by_fid(node_fid, new_state)
    notes += self.add_enclosing_devices_by_node(node_fid, new_state,
                                                node=node)
    return notes
def _generate_sub_services(self, note: HaNoteStruct,
                           cns: ConsulUtil) -> List[HaNoteStruct]:
    """Create an HA note for each service enclosed by the given process.

    Every resulting note carries the same state as the process note.

    :param note: HA note of the parent process.
    :param cns: ConsulUtil used to look up the enclosed services.
    :return: one HaNoteStruct per enclosed service.
    """
    parent_fid = Fid.from_struct(note.no_id)
    services = cns.get_services_by_parent_process(parent_fid)
    LOG.debug('Process fid=%s encloses %s services as follows: %s',
              parent_fid, len(services), services)
    notes = []
    for svc in services:
        notes.append(
            HaNoteStruct(no_id=svc.fid.to_c(), no_state=note.no_state))
    return notes
def _generate_sub_disks(self, note: HaNoteStruct, services: List,
                        cns: ConsulUtil) -> List[HaNoteStruct]:
    """Create an HA note for each disk owned by the given services.

    Every resulting note carries the same state as the process note.

    :param note: HA note of the parent process.
    :param services: services whose disks are to be reported.
    :param cns: ConsulUtil used to look up the disks.
    :return: one HaNoteStruct per disk.
    """
    parent_fid = Fid.from_struct(note.no_id)
    disks = []
    for svc in services:
        disks.extend(cns.get_disks_by_parent_process(parent_fid, svc.fid))
    LOG.debug('proc fid=%s encloses %d disks as follows: %s',
              parent_fid, len(disks), disks)
    return [
        HaNoteStruct(no_id=disk.to_c(), no_state=note.no_state)
        for disk in disks
    ]
def ha_nvec_get_reply(self, event: HaNvecGetEvent, kv_cache=None) -> None:
    """Reply to an HA nvec GET request with the current object states.

    For every note in the request, the state is resolved from Consul
    (by the fid's object type) and the filled nvec is sent back via FFI.

    :param event: the HaNvecGetEvent being answered.
    :param kv_cache: optional Consul KV cache forwarded to the lookups.
    """
    LOG.debug('Preparing the reply for HaNvecGetEvent (nvec size = %s)',
              len(event.nvec))
    self.consul_util.get_all_nodes()
    notes: List[HaNoteStruct] = []
    for n in event.nvec:
        fid = Fid.from_struct(n.note.no_id)
        n.note.no_state = self.consul_util.get_conf_obj_status(
            FidTypeToObjT[fid.container], fid.key, kv_cache=kv_cache)
        notes.append(n.note)
    # Fix: use lazy %-style args (consistent with the other log calls)
    # instead of eager string concatenation, so the message is only
    # built when DEBUG logging is enabled.
    LOG.debug('Replying ha nvec of length %s', len(event.nvec))
    self._ffi.ha_nvec_reply(event.hax_msg, make_array(HaNoteStruct, notes),
                            len(notes))
def get_ctrl_status(self,
                    proc_note: HaNoteStruct) -> Optional[HaNoteStruct]:
    """Build an HA note for the controller backing the given IO process.

    The controller state is also updated in Consul KV with the state
    taken from the process note.

    :param proc_note: HA note of a process (must be a PROCESS fid).
    :return: the controller's HA note, or None if the process has no
        associated IO service controller.
    """
    state = proc_note.no_state
    process_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == process_fid.container
    LOG.debug('Notifying ctrl status for process_fid=%s state=%s',
              process_fid, state)
    ctrl_fid = self.consul_util.get_ioservice_ctrl_fid(process_fid)
    if not ctrl_fid:
        return None
    # Update controller state in consul kv.
    self.consul_util.set_ctrl_state(
        ctrl_fid, ServiceHealth.from_ha_note_state(state))
    return HaNoteStruct(no_id=ctrl_fid.to_c(), no_state=state)
def _generate_sub_services(self,
                           note: HaNoteStruct,
                           cns: ConsulUtil,
                           notify_devices=True,
                           kv_cache=None) -> List[HaNoteStruct]:
    """Create HA notes for the services enclosed by the given process.

    Every service note carries the same state as the process note.
    When `notify_devices` is True, notes for the disks attached to
    those services are appended as well.

    :param note: HA note of the parent process.
    :param cns: ConsulUtil used to look up the enclosed services.
    :param notify_devices: whether to include disk notes.
    :param kv_cache: optional Consul KV cache for the service lookup.
    :return: service notes, optionally followed by disk notes.
    """
    parent_fid = Fid.from_struct(note.no_id)
    services = cns.get_services_by_parent_process(parent_fid,
                                                  kv_cache=kv_cache)
    LOG.debug('Process fid=%s encloses %s services as follows: %s',
              parent_fid, len(services), services)
    notes = []
    for svc in services:
        notes.append(
            HaNoteStruct(no_id=svc.fid.to_c(), no_state=note.no_state))
    if notify_devices:
        notes += self._generate_sub_disks(note, services, cns)
    return notes
def get_ctrl_status(
        self,
        proc_note: HaNoteStruct,
        kv_cache=None) -> Optional[Tuple[HaNoteStruct, List[PutKV]]]:
    """Build an HA note for the controller backing the given IO process.

    Instead of writing the controller state immediately, the pending
    KV operations are returned alongside the note so the caller can
    apply them later.

    :param proc_note: HA note of a process (must be a PROCESS fid).
    :param kv_cache: optional Consul KV cache forwarded to the lookups.
    :return: (controller HA note, pending KV updates), or None if the
        process has no associated IO service controller.
    """
    state = proc_note.no_state
    process_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == process_fid.container
    LOG.debug('Notifying ctrl status for process_fid=%s state=%s',
              process_fid, state)
    ctrl_fid = self.consul_util.get_ioservice_ctrl_fid(process_fid,
                                                       kv_cache=kv_cache)
    if not ctrl_fid:
        return None
    # Update controller state in consul kv.
    kv_updates = self.consul_util.get_ctrl_state_updates(
        ctrl_fid,
        ObjHealth.from_ha_note_state(state),
        kv_cache=kv_cache)
    return (HaNoteStruct(no_id=ctrl_fid.to_c(), no_state=state),
            kv_updates)
def _generate_sub_disks(self, note: HaNoteStruct, services: List,
                        cns: ConsulUtil) -> List[HaNoteStruct]:
    """Build HA notes for the disks owned by the given services.

    The drive states are first updated in Consul KV according to the
    process note (ONLINE -> OK, anything else -> OFFLINE), then each
    disk's state is read back from Consul KV for the returned notes.

    :param note: HA note of the parent process.
    :param services: services whose disks are to be reported.
    :param cns: ConsulUtil used for disk lookup and state update.
    :return: one HaNoteStruct per disk.
    """
    parent_fid = Fid.from_struct(note.no_id)
    disks = []
    for svc in services:
        disks.extend(cns.get_disks_by_parent_process(parent_fid, svc.fid))
    if disks:
        target = (ServiceHealth.OK
                  if note.no_state == HaNoteStruct.M0_NC_ONLINE
                  else ServiceHealth.OFFLINE)
        # XXX: Need to check the current state of the device, transition
        # to ONLINE only in case of an explicit request or iff the prior
        # state of the device is UNKNOWN/OFFLINE.
        cns.update_drive_state(disks, target, device_event=False)
    LOG.debug('proc fid=%s encloses %d disks as follows: %s',
              parent_fid, len(disks), disks)
    result: List[HaNoteStruct] = []
    for disk in disks:
        # Get the drive state from Consul KV.
        kv_state = cns.get_sdev_state(ObjT.DRIVE, disk.key)
        result.append(HaNoteStruct(no_id=disk.to_c(), no_state=kv_state))
    return result
def notify_node_status_by_process(self,
                                  proc_note: HaNoteStruct,
                                  kv_cache=None) -> List[HaNoteStruct]:
    """Build HA notes for the node that hosts the given process.

    If the process reports OK, KV updates for the backing controller
    are collected and written at the end (after the node/device notes
    are built), so that the KV cache stays usable in between.

    :param proc_note: HA note of a process (must be a PROCESS fid).
    :param kv_cache: optional Consul KV cache forwarded to the lookups.
    :return: HA notes for the node and its enclosing devices.
    """
    # proc_note.no_state is of int type
    new_state = ObjHealth.from_ha_note_state(proc_note.no_state)
    proc_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == proc_fid.container
    LOG.debug('Notifying node status for process_fid=%s state=%s',
              proc_fid, new_state)
    node = self.consul_util.get_process_node(proc_fid, kv_cache=kv_cache)
    updates: List[PutKV] = []
    if new_state == ObjHealth.OK:
        # Node can have multiple controllers. Node can be online, with
        # a single controller running online.
        # If we receive process 'OK', only the process state is
        # updated. So, we need to update the corresponding
        # controller state.
        ctrl_fid = self.consul_util.get_ioservice_ctrl_fid(
            proc_fid, kv_cache=kv_cache)
        if ctrl_fid:
            updates = self.consul_util.get_ctrl_state_updates(
                ctrl_fid, new_state, kv_cache=kv_cache)
    node_fid = self.consul_util.get_node_fid(node, kv_cache=kv_cache)
    # FIXME make these two functions to return List[PutKV] so that the
    # write operations can be delayed to reuse the cache as long as
    # possible
    notes = self.add_node_state_by_fid(node_fid, new_state)
    notes += self.add_enclosing_devices_by_node(node_fid, new_state,
                                                node=node,
                                                kv_cache=kv_cache)
    # Flush the collected controller-state updates only now, so the
    # reads above could still be served from the cache.
    self._write_updates(updates, kv_cache)
    return notes
def notify_node_status(self,
                       proc_note: HaNoteStruct) -> List[HaNoteStruct]:
    """Build HA notes for the node, enclosure and controller that host
    the given process, all carrying the process note's state.

    :param proc_note: HA note of a process (must be a PROCESS fid).
    :return: notes for [node, enclosure, controller], or an empty list
        if any of the three fids cannot be resolved.
    """
    state = proc_note.no_state
    process_fid = Fid.from_struct(proc_note.no_id)
    assert ObjT.PROCESS.value == process_fid.container
    LOG.debug('Notifying node status for process_fid=%s state=%s',
              process_fid, state)
    cns = self.consul_util
    node = cns.get_process_node(process_fid)
    node_fid = cns.get_node_fid(node)
    encl_fid = cns.get_node_encl_fid(node)
    ctrl_fid = cns.get_node_ctrl_fid(node)
    LOG.debug('node_fid: %s encl_fid: %s ctrl_fid: %s with state: %s',
              node_fid, encl_fid, ctrl_fid, state)
    if not (node_fid and encl_fid and ctrl_fid):
        return []
    return [
        HaNoteStruct(no_id=fid.to_c(), no_state=state)
        for fid in (node_fid, encl_fid, ctrl_fid)
    ]
def is_node_failed(self, proc_note: HaNoteStruct, kv_cache=None): proc_fid = Fid.from_struct(proc_note.no_id)