def enter(service):
    """Handle a service entering the disabled state.

    Logs a warning that the service will no longer be monitored, drops it
    from the class-level tracking sets, and detaches its change-notification
    signal handler if one is registered.
    """
    logger.warning(
        f"{service.name} service is disabled, it will not be monitored")
    # The service is neither monitored nor pending; drop it from both sets.
    Service.non_active.discard(service.name)
    Service.monitoring_disabled.discard(service.name)
    # Detach the signal handler, if any, so no further callbacks fire.
    if service.properties_changed_signal:
        service.properties_changed_signal.remove()
def _get_bmc_info(self):
    """Collect BMC network info (IP and reachability) for 'ebmc0'.

    nwCableConnStatus stays 'UNKNOWN' until a solution to find the BMC
    eth port cable connection status is found.

    Returns:
        dict: keys ifId, ipV4Prev, ipV4, nwStatus, nwCableConnStatus.
              Defaults are returned when the BMC IP cannot be fetched.
    """
    # Defaults are defined outside the try so the return value is always
    # well-formed, even when fetching the BMC IP fails.
    bmcdata = {'ifId': 'ebmc0', 'ipV4Prev': "", 'ipV4': "",
               'nwStatus': "DOWN", 'nwCableConnStatus': 'UNKNOWN'}
    try:
        ipdata = sp.Popen("sudo ipmitool lan print", shell=True,
                          stdout=sp.PIPE, stderr=sp.PIPE
                          ).communicate()[0].decode().strip()
        # Bug fix: use a raw string — "\d" in a normal string literal is an
        # invalid escape sequence (DeprecationWarning since Python 3.6).
        # IndexError on no match is handled by the except below.
        bmcip = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ipdata)[0]
        if bmcip:
            pingbmchost = "ping -c1 -W1 -q " + bmcip
            child = sp.Popen(pingbmchost.split(), stdout=sp.PIPE)
            # child must be communicated before fetching return code.
            child.communicate()
            retcode = child.returncode
            if self.prev_bmcip is not None and self.prev_bmcip != bmcip:
                # BMC IP changed since the last poll: report both values.
                bmcdata['ipV4Prev'] = self.prev_bmcip
                bmcdata['ipV4'] = bmcip
                self.prev_bmcip = bmcip
            else:
                # First poll or unchanged IP: previous == current.
                self.prev_bmcip = bmcdata['ipV4Prev'] = bmcdata['ipV4'] = bmcip
            if retcode == 0:
                bmcdata['nwStatus'] = "UP"
            else:
                logger.warning("BMC Host:{0} is not reachable".format(bmcip))
    except Exception as e:
        logger.error("Exception occurs while fetching bmc_info:{}".format(e))
    return bmcdata
def check_and_send_alert(self):
    """Check SAS port alert conditions and send alerts when they are met.

    Alerts will be sent if -
    1. All 4 phys of a sas port go up -> down : fault alert
    2. All 4 phys of a sas port come down -> up : fault_resolved alert

    Sensor data stored in persistent storage is a dict of
    { sas_port_number : alert_type }.
    """
    # Refresh port status before evaluating the stored alert data.
    self.update_sas_ports_status()

    # Determine which data-format version the persisted data carries.
    # Accessing ['version'] raises when nothing is stored yet or the
    # stored format predates versioning; both cases trigger a rebuild.
    version = None
    try:
        version = self.sas_phy_stored_alert['version']
    except Exception:
        logger.warning(
            f"Found no data or old data format for SASPortSensor, \
updating data format to version {self.CURRENT_DATA_VERSION}"
        )
        # Rebuild the data in the current format with a dummy
        # fault_resolved entry for the connection and every sas port.
        fresh_data = {
            'version': self.CURRENT_DATA_VERSION,
            'conn': 'fault_resolved',
        }
        for port in range(self.NUM_SAS_PORTS):
            fresh_data[port] = 'fault_resolved'
        self.sas_phy_stored_alert = fresh_data
        # Persist the rebuilt data.
        store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    if version == self.CURRENT_DATA_VERSION:
        self.handle_current_version_data()
def get_sensor_list_by_type(self, fru_type):
    """Return the sensor list for a FRU type via `ipmitool sdr type '<FRU>'`.

    Example line from `sdr type 'Fan'`:
        Sys Fan 2B | 33h | ok | 29.4 | 5332 RPM
        (sensor_id | sensor_num | status | entity_id | <FRU specific attr>)

    Params : self, fru_type
    Output Format : List of Tuple
    Output Example : [(HDD 1 Status, F1, ok, 4.2, Drive Present),]
    Returns None (after logging) when the ipmitool command fails.
    """
    raw_out, rc = self._run_ipmitool_subcommand(
        f"sdr type '{fru_type.title()}'")
    if rc != 0:
        if isinstance(raw_out, tuple):
            # Keep only non-empty chunks before joining for the log line.
            raw_out = [chunk for chunk in raw_out if chunk]
        logger.warning(
            "ipmitool sdr type command failed: {0}".format(b''.join(raw_out)))
        return

    sensors = []
    for line in b''.join(raw_out).decode("utf-8").split("\n"):
        # The first empty line marks the end of the sensor listing.
        if line == "":
            break
        # e.g. "Sys Fan 2B | 33h | ok | 29.4 | 5332 RPM"
        #      "PS1 1a Fan Fail | A0h | ok | 29.13 |"
        #      "HDD 1 Status | F1h | ok | 4.2 | Drive Present"
        sensor_id, raw_num, status, entity_id, reading = (
            field.strip() for field in line.split("|"))
        # Normalize "33h" -> "33".
        sensors.append(
            (sensor_id, raw_num.strip("h").lower(), status, entity_id, reading))
    return sensors
def _raid_health_monitor(self): try: devices = self._get_devices() if len(devices) == 0: return logger.debug("Fetched devices:{}".format(devices)) for device in devices: # Update the state as 'check' for RAID device file result = self._update_raid_device_file(device) if result == "failed": self._retry_execution(self._update_raid_device_file, device) logger.info("RAID device state is changed to 'check'") # Check RAID device array state is 'idle' or not result = self._check_raid_state(device) if result == "failed": logger.warning( "'Idle' state not found for RAID device:{}".format( device)) # Retry to check RAID state self._retry_execution(self._check_raid_state, device) logger.info( "'idle' state is found in Raid device:{}.".format(device)) # Check Mismatch count in RAID device files. result = self._check_mismatch_count(device) if result == "failed": # Persist RAID device fault state and send alert fault_status_file = self.DEFAULT_RAID_DATA_PATH + device + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value if os.path.exists(fault_status_file): with open(fault_status_file, 'r') as fs: data = fs.read().rstrip() if self.FAULT_RESOLVED in data: self.alert_type = self.FAULT self._alert_msg = RaidAlertMsgs.MISMATCH_MSG.value self._send_json_msg(self.alert_type, device, self._alert_msg) self._update_fault_state_file( device, self.FAULT, fault_status_file) else: self.alert_type = self.FAULT self._alert_msg = RaidAlertMsgs.MISMATCH_MSG.value self._send_json_msg(self.alert_type, device, self._alert_msg) self._update_fault_state_file(device, self.FAULT, fault_status_file) # Retry to check mismatch_cnt self._retry_execution(self._check_mismatch_count, device) logger.debug( "No mismatch count is found in Raid device:{}".format( device)) except Exception as ae: logger.error("Failed in monitoring RAID health. ERROR:{}".format( str(ae)))
def _nw_cable_alert_exists(self, interfaces):
    """Checks cable connection status with physical link(carrier) state
    and avoids duplicate alert reporting by comparing with its previous state.

    Fault detection is identified by physical link state Down.
    Fault resolved is identified by physical link state changed from Down to Up.

    Returns a dict mapping interface name -> alert type
    (self.FAULT / self.FAULT_RESOLVED) for interfaces whose state changed.
    """
    identified_cables = {}
    for interface in interfaces:
        interface_name = interface.get("ifId")
        phy_link_status = interface.get("nwCableConnStatus")
        # fault detected (Down, Up to Down)
        if phy_link_status == 'DOWN':
            if self.prev_cable_cnxns.get(
                    interface_name) != phy_link_status:
                # NOTE(review): a fault is recorded only when a previous
                # state exists; the first observation of an interface is
                # only tracked, not alerted — confirm this is intended.
                if self.prev_cable_cnxns.get(interface_name):
                    logger.warning(
                        f"Cable connection fault is detected with '{interface_name}'"
                    )
                    identified_cables[interface_name] = self.FAULT
                self.prev_cable_cnxns[interface_name] = phy_link_status
        # fault resolved (Down to Up)
        elif phy_link_status == 'UP':
            if self.prev_cable_cnxns.get(
                    interface_name) != phy_link_status:
                if self.prev_cable_cnxns.get(interface_name):
                    logger.info(
                        f"Cable connection fault is resolved with '{interface_name}'"
                    )
                    identified_cables[interface_name] = self.FAULT_RESOLVED
                    if self.interface_fault_state and interface_name in self.interface_fault_state:
                        # After the cable fault is resolved, unset the flag for interface
                        # So that, it can be tracked further for any failure
                        self.INTERFACE_FAULT_DETECTED = False
                        self.interface_fault_state[
                            interface_name] = self.INTERFACE_FAULT_DETECTED
                        # Also clear the global nw interface dictionary
                        self.prev_nw_status[
                            interface_name] = phy_link_status
                self.prev_cable_cnxns[interface_name] = phy_link_status
        else:
            # Neither 'UP' nor 'DOWN' (e.g. the default 'UNKNOWN').
            logger.debug(
                f"Cable connection state is unknown with '{interface_name}'"
            )
    return identified_cables
def _get_nwalert(self, interfaces):
    """
    Get network interfaces with fault/OK state for each interface.

    Parameters:
        interfaces(list) : List of available network interfaces
                           (dicts with at least "ifId" and "nwStatus" keys).

    Returns:
        dict: interface name -> alert state (self.FAULT / self.FAULT_RESOLVED).
    """
    nw_alerts = {}
    try:
        for interface in interfaces:
            interface_name = interface.get("ifId")
            nw_status = interface.get("nwStatus")
            logger.debug("{0}:{1}".format(interface_name, nw_status))
            # fault detected (Down/UNKNOWN, Up/UNKNOWN to Down, Up/Down to UNKNOWN)
            if nw_status == 'DOWN' or nw_status == 'UNKNOWN':
                if self.prev_nw_status.get(interface_name) != nw_status:
                    # Alert only on a transition out of a known-UP state;
                    # a first observation is tracked silently.
                    if self.prev_nw_status.get(
                            interface_name) and self.prev_nw_status.get(
                            interface_name) == 'UP':
                        logger.warning(
                            f"Network connection fault is detected for interface:'{interface_name}'"
                        )
                        nw_alerts[interface_name] = self.FAULT
                    self.prev_nw_status[interface_name] = nw_status
            # fault resolved (Down to Up)
            elif nw_status == 'UP':
                if self.prev_nw_status.get(interface_name) != nw_status:
                    # Resolved alert requires that some previous state exists.
                    if self.prev_nw_status.get(interface_name):
                        logger.info(
                            f"Network connection fault is resolved for interface:'{interface_name}'"
                        )
                        nw_alerts[interface_name] = self.FAULT_RESOLVED
                    self.prev_nw_status[interface_name] = nw_status
            else:
                # Unexpected status value: log it, raise no alert.
                logger.warning(
                    f"Network connection state is:'{nw_status}', for interface:'{interface_name}'"
                )
    except Exception as e:
        logger.error(
            f"Exception occurs while checking for network alert condition:'{e}'"
        )
    logger.debug("nw_alerts existed for:{}".format(nw_alerts))
    return nw_alerts
def get_sensor_props(self, sensor_id):
    """Return one sensor's properties via `ipmitool sensor get '<id>'`.

    Params : self, sensor_id
    Output Format : Tuple of two dictionaries, (common, specific)
    Output Example : ({common dict data},{specific dict data})
    On command failure returns (False, {sensor_id: {"ERROR": msg}}).
    """
    raw_out, rc = self._run_ipmitool_subcommand(
        "sensor get '{0}'".format(sensor_id))
    if rc != 0:
        if isinstance(raw_out, tuple):
            raw_out = [chunk for chunk in raw_out if chunk]
        msg = "ipmitool sensor get command failed: {0}".format(
            b''.join(raw_out))
        logger.warning(msg)
        return (False, {sensor_id: {"ERROR": msg}})

    # Drop the leading 'Locating sensor record...' banner line.
    lines = b''.join(raw_out).decode("utf-8").split("\n")[1:]
    specific = {}
    last_key = None
    for line in lines:
        if line == '':
            continue
        if ':' in line:
            last_key, value = (part.strip() for part in line.split(":"))
            specific[last_key] = value
        else:
            # Continuation line: append to the most recent key's value.
            specific[last_key] += "\n" + line

    # Move the well-known keys into the 'common' dict.
    common = {}
    for key in ({'Sensor ID', 'Entity ID'} & set(specific)):
        common[key] = specific.pop(key)
    return (common, specific)
def check_notactive_services(self):
    """
    Monitor non-active services.

    Raise a FAULT alert for any not-active service that has exceeded
    the threshold time for inactivity.
    """
    # Iterate over a snapshot so the tracking dict can be mutated in-loop.
    for service, (start_time, prev_state, prev_substate) in \
            list(self.not_active_services.items()):
        if self.current_time() - start_time <= self.max_wait_time:
            continue
        status = self.service_status[service]
        state = status["state"]
        substate = status["substate"]
        pid = status["pid"]
        # Stop waiting for the service and mark it failed.
        self.not_active_services.pop(service)
        self.failed_services.append(service)
        self.raise_alert(service, prev_state, state, prev_substate,
                         substate, pid, pid, 1)
        logger.warning(f"{service} in {state}:{substate} for "
                       f"more than {self.max_wait_time} seconds.")
def is_env_vm(self):
    """Return True if running in a VM environment, False otherwise.

    Runs `facter is_virtual` and parses its first output line; on any
    read error or exception the environment is assumed to be a VM.
    """
    is_vm = True
    CMD = "facter is_virtual"
    try:
        subout = subprocess.Popen(CMD, shell=True, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        result = subout.stdout.readlines()
        # readlines() always returns a list, so the original extra
        # `result == ""` comparison was dead code.
        if not result:
            logger.warning(
                "Not able to read whether env is vm or not, assuming VM env."
            )
        elif 'false' in result[0].decode():
            is_vm = False
    except Exception as e:
        # Bug fix: the message was missing its f-prefix, so '{e}' was
        # logged literally instead of the exception text.
        logger.warning(
            f"Error while reading whether env is vm or not, assuming VM env : {e}"
        )
    return is_vm
def action_per_transition(self, service, prev_state, state,
                          prev_substate, substate, prev_pid, pid):
    """Take action according to the state change of the service."""
    # alert_info_index : index pointing to alert_info table from
    # ServiceMonitor:raise_alerts() representing alert
    # description, type, impact etc. to be sent.
    # -1 means "no alert"; 3 means "unhandled transition" (warning only).
    alert_info_index = -1
    logger.debug(f"ServiceMonitor:action_per_transition for {service} : " + \
        f"({prev_state}:{prev_substate}) -> ({state}:{substate})")
    if prev_state in ["active", "reloading"]:
        if state == "active":
            # reloading -> active
            # NOTE(review): pop() without a membership check will raise
            # KeyError if the service was never tracked as not-active —
            # confirm callers guarantee this.
            self.not_active_services.pop(service)
            if service in self.failed_services:
                self.failed_services.remove(service)
                alert_info_index = 2
        elif state != "failed":
            # active -> deactivating/inactive/reloading/activating
            # or
            # reloading -> deactivating/inactive/activating
            self.not_active_services[service] = \
                [self.current_time(), prev_state, prev_substate]
        elif state == "failed":
            # active/reloading -> failed
            if service not in self.failed_services:
                self.failed_services.append(service)
                alert_info_index = 0
    elif prev_state == "deactivating":
        if state in ["inactive", "activating"]:
            # deactivating -> inactive/activating
            if service not in self.not_active_services:
                self.not_active_services[service] = \
                    [self.current_time(), prev_state, prev_substate]
        elif state == "failed":
            # deactivating -> failed
            if service not in self.failed_services:
                self.failed_services.append(service)
                alert_info_index = 0
        elif state == "active":
            # deactivating -> active
            if service in self.not_active_services:
                self.not_active_services.pop(service)
            if service in self.failed_services:
                self.failed_services.remove(service)
                alert_info_index = 2
        else:
            alert_info_index = 3
    elif prev_state in ["inactive", "failed"]:
        if state == "activating":
            # inactive/failed -> activating
            if service not in self.not_active_services:
                self.not_active_services[service] = \
                    [self.current_time(), prev_state, prev_substate]
        elif state == "active":
            # inactive/failed -> active
            if service in self.failed_services:
                self.failed_services.remove(service)
                alert_info_index = 2
            if service in self.not_active_services:
                self.not_active_services.pop(service)
        elif state == "failed":
            # inactive -> failed
            if service not in self.failed_services:
                self.failed_services.append(service)
                alert_info_index = 0
        else:
            alert_info_index = 3
    elif prev_state == "activating":
        if service in self.not_active_services:
            self.not_active_services.pop(service)
        if state in ["inactive", "deactivating"]:
            # activating -> inactive/deactivating
            self.failed_services.append(service)
            alert_info_index = 0
        elif state == "active":
            # activating -> active
            if service in self.failed_services:
                self.failed_services.remove(service)
                alert_info_index = 2
            else:
                # its a restart.
                pass
        elif state == "failed":
            # activating -> failed
            if service not in self.failed_services:
                self.failed_services.append(service)
                alert_info_index = 0
        else:
            alert_info_index = 3
    if alert_info_index == 3:
        logger.warning(f"{service} service state transition from "\
            f"{prev_state} to {state} is not handled.")
    if alert_info_index != -1:
        self.raise_alert(service, prev_state, state, prev_substate,
                         substate, prev_pid, pid, alert_info_index)
def _generate_disk_space_alert(self):
    """Create & transmit a disk_space_alert message as defined by the
    sensor response json schema"""
    # Notify the node sensor to update its data required for the disk_space_data message
    successful = self._node_sensor.read_data("disk_space_alert",
                                             self._get_debug(), self._units)
    if not successful:
        logger.error(
            "NodeDataMsgHandler, _generate_disk_space_alert was NOT successful."
        )
        return

    # Changing disk_usage_threshold type according to what value type entered in config file
    self._disk_usage_threshold = str(self._disk_usage_threshold)
    try:
        if self._disk_usage_threshold.isdigit():
            self._disk_usage_threshold = int(self._disk_usage_threshold)
        else:
            self._disk_usage_threshold = float(self._disk_usage_threshold)
    except ValueError:
        logger.warning(
            "Disk Space Alert, Invalid disk_usage_threshold value are entered in config."
        )
        # Assigning default value to _disk_usage_threshold
        self._disk_usage_threshold = self.DEFAULT_DISK_USAGE_THRESHOLD

    # NOTE(review): fault fires at >= threshold and resolve at <= threshold,
    # so usage exactly equal to the threshold triggers both paths in one
    # call — confirm the boundary semantics are intended.
    if self._node_sensor.disk_used_percentage >= self._disk_usage_threshold:
        # Only alert on the transition into the fault state.
        if not self.disk_fault:
            self.disk_fault = True
            # Create the disk space data message and hand it over to the egress processor to transmit
            fault_event = "Disk usage increased to %s, beyond configured threshold of %s" \
                %(self._node_sensor.disk_used_percentage, self._disk_usage_threshold)
            logger.warning(fault_event)
            diskSpaceAlertMsg = DiskSpaceAlertMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.total_space, self._node_sensor.free_space,
                self._node_sensor.disk_used_percentage, self._units,
                self.site_id, self.rack_id, self.node_id, self.cluster_id,
                self.FAULT, fault_event)
            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                diskSpaceAlertMsg.set_uuid(self._uuid)
            jsonMsg = diskSpaceAlertMsg.getJson()
            self.disk_sensor_data = jsonMsg
            self.os_sensor_type["disk_space"] = self.disk_sensor_data
            # Transmit it out over rabbitMQ channel
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

    if (self._node_sensor.disk_used_percentage <= self._disk_usage_threshold) and (self.disk_fault == True):
        # Create the disk space data message and hand it over to the egress processor to transmit
        fault_resolved_event = "Disk usage decreased to %s, lesser than configured threshold of %s" \
            %(self._node_sensor.disk_used_percentage, self._disk_usage_threshold)
        logger.warning(fault_resolved_event)
        diskSpaceAlertMsg = DiskSpaceAlertMsg(
            self._node_sensor.host_id, self._epoch_time,
            self._node_sensor.total_space, self._node_sensor.free_space,
            self._node_sensor.disk_used_percentage, self._units,
            self.site_id, self.rack_id, self.node_id, self.cluster_id,
            self.FAULT_RESOLVED, fault_resolved_event)
        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            diskSpaceAlertMsg.set_uuid(self._uuid)
        jsonMsg = diskSpaceAlertMsg.getJson()
        self.disk_sensor_data = jsonMsg
        self.os_sensor_type["disk_space"] = self.disk_sensor_data
        # Transmit it out over rabbitMQ channel
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
        self.disk_fault = False
def _generate_cpu_data(self):
    """Create & transmit a cpu_data message as defined by the
    sensor response json schema"""
    # Notify the node sensor to update its data required for the cpu_data message
    successful = self._node_sensor.read_data("cpu_data", self._get_debug())
    if not successful:
        logger.error(
            "NodeDataMsgHandler, _generate_cpu_data was NOT successful.")
        # NOTE(review): unlike _generate_disk_space_alert there is no early
        # return here, so the code below may use stale sensor data — confirm
        # this is intended before adding one.

    # Normalize the configured threshold (int or float) via its string form.
    self._cpu_usage_threshold = str(self._cpu_usage_threshold)
    try:
        if self._cpu_usage_threshold.isdigit():
            self._cpu_usage_threshold = int(self._cpu_usage_threshold)
        else:
            self._cpu_usage_threshold = float(self._cpu_usage_threshold)
    except ValueError:
        # Bug fix: the message previously referred to
        # host_memory_usage_threshold (copy-paste from _generate_host_update).
        logger.warning(
            "CPU Usage Alert, Invalid cpu_usage_threshold value are entered in config."
        )
        # Assigning default value to _cpu_usage_threshold
        self._cpu_usage_threshold = self.DEFAULT_CPU_USAGE_THRESHOLD

    if self._node_sensor.cpu_usage >= self._cpu_usage_threshold:
        # Only alert on the transition into the fault state.
        if not self.cpu_fault:
            self.cpu_fault = True
            # Create the cpu usage data message and hand it over to the egress processor to transmit
            fault_event = "CPU usage increased to %s, beyond configured threshold of %s" \
                %(self._node_sensor.cpu_usage, self._cpu_usage_threshold)
            logger.warning(fault_event)
            cpuDataMsg = CPUdataMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.csps, self._node_sensor.idle_time,
                self._node_sensor.interrupt_time, self._node_sensor.iowait_time,
                self._node_sensor.nice_time, self._node_sensor.softirq_time,
                self._node_sensor.steal_time, self._node_sensor.system_time,
                self._node_sensor.user_time, self._node_sensor.cpu_core_data,
                self._node_sensor.cpu_usage,
                self.site_id, self.rack_id, self.node_id, self.cluster_id,
                self.FAULT, fault_event)
            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                cpuDataMsg.set_uuid(self._uuid)
            jsonMsg = cpuDataMsg.getJson()
            self.cpu_sensor_data = jsonMsg
            self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data
            # Transmit it out over rabbitMQ channel
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

    if (self._node_sensor.cpu_usage <= self._cpu_usage_threshold) and (self.cpu_fault == True):
        # Create the cpu usage data message and hand it over to the egress processor to transmit
        fault_resolved_event = "CPU usage decreased to %s, lesser than configured threshold of %s" \
            %(self._node_sensor.cpu_usage, self._cpu_usage_threshold)
        logger.warning(fault_resolved_event)
        cpuDataMsg = CPUdataMsg(
            self._node_sensor.host_id, self._epoch_time,
            self._node_sensor.csps, self._node_sensor.idle_time,
            self._node_sensor.interrupt_time, self._node_sensor.iowait_time,
            self._node_sensor.nice_time, self._node_sensor.softirq_time,
            self._node_sensor.steal_time, self._node_sensor.system_time,
            self._node_sensor.user_time, self._node_sensor.cpu_core_data,
            self._node_sensor.cpu_usage,
            self.site_id, self.rack_id, self.node_id, self.cluster_id,
            self.FAULT_RESOLVED, fault_resolved_event)
        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            cpuDataMsg.set_uuid(self._uuid)
        jsonMsg = cpuDataMsg.getJson()
        self.cpu_sensor_data = jsonMsg
        self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data
        # Transmit it out over rabbitMQ channel
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
        self.cpu_fault = False
def _generate_host_update(self):
    """Create & transmit a host update message as defined by the
    sensor response json schema"""
    # Notify the node sensor to update its data required for the host_update message
    successful = self._node_sensor.read_data("host_update",
                                             self._get_debug(), self._units)
    if not successful:
        logger.error(
            "NodeDataMsgHandler, _generate_host_update was NOT successful.")
        # NOTE(review): no early return here, so the code below may use
        # stale sensor data — confirm this is intended.

    # Normalize the configured threshold (int or float) via its string form.
    self._host_memory_usage_threshold = str(
        self._host_memory_usage_threshold)
    try:
        if self._host_memory_usage_threshold.isdigit():
            self._host_memory_usage_threshold = int(
                self._host_memory_usage_threshold)
        else:
            self._host_memory_usage_threshold = float(
                self._host_memory_usage_threshold)
    except ValueError:
        logger.warning(
            "Host Memory Alert, Invalid host_memory_usage_threshold value are entered in config."
        )
        # Assigning default value to _host_memory_usage_threshold
        self._host_memory_usage_threshold = self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD

    if self._node_sensor.total_memory[
            "percent"] >= self._host_memory_usage_threshold:
        # Only alert on the transition into the fault state.
        if not self.host_fault:
            self.host_fault = True
            fault_event = "Host memory usage increased to %s, beyond configured threshold of %s" \
                %(self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold)
            logger.warning(fault_event)
            # Bug fix: removed the dead `logged_in_users = []` local — the
            # message is built from self._node_sensor.logged_in_users.
            # Create the host update message and hand it over to the egress processor to transmit
            hostUpdateMsg = HostUpdateMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.boot_time, self._node_sensor.up_time,
                self._node_sensor.uname, self._units,
                self.site_id, self.rack_id, self.node_id, self.cluster_id,
                self._node_sensor.total_memory,
                self._node_sensor.logged_in_users,
                self._node_sensor.process_count,
                self._node_sensor.running_process_count,
                self.FAULT, fault_event)
            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                hostUpdateMsg.set_uuid(self._uuid)
            jsonMsg = hostUpdateMsg.getJson()
            # Transmit it out over rabbitMQ channel
            self.host_sensor_data = jsonMsg
            self.os_sensor_type["memory_usage"] = self.host_sensor_data
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

    if (self._node_sensor.total_memory["percent"] < self._host_memory_usage_threshold) and (self.host_fault == True):
        fault_resolved_event = "Host memory usage decreased to %s, lesser than configured threshold of %s" \
            %(self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold)
        logger.warning(fault_resolved_event)
        # Create the host update message and hand it over to the egress processor to transmit
        hostUpdateMsg = HostUpdateMsg(
            self._node_sensor.host_id, self._epoch_time,
            self._node_sensor.boot_time, self._node_sensor.up_time,
            self._node_sensor.uname, self._units,
            self.site_id, self.rack_id, self.node_id, self.cluster_id,
            self._node_sensor.total_memory,
            self._node_sensor.logged_in_users,
            self._node_sensor.process_count,
            self._node_sensor.running_process_count,
            self.FAULT_RESOLVED, fault_resolved_event)
        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            hostUpdateMsg.set_uuid(self._uuid)
        jsonMsg = hostUpdateMsg.getJson()
        # Transmit it out over rabbitMQ channel
        self.host_sensor_data = jsonMsg
        self.os_sensor_type["memory_usage"] = self.host_sensor_data
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
        self.host_fault = False