def initialize(self, conf_reader, msgQlist, product):
    """initialize configuration reader and internal msg queues"""

    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(CPUFaultSensor, self).initialize(conf_reader)
    super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

    # Get the cpu fault implementor from configuration
    cpu_fault_utility = Conf.get(
        SSPL_CONF, f"{self.name().upper()}>{self.PROBE}", 'sysfs')

    # Creating the instance of ToolFactory class
    self.tool_factory = ToolFactory()
    try:
        # Get the instance of the utility using ToolFactory
        self._utility_instance = self._utility_instance or \
            self.tool_factory.get_instance(cpu_fault_utility)
    except Exception as err:
        raise Exception(
            "Error while initializing. "
            f"Unable to get the instance of {cpu_fault_utility} Utility, {err}")

    self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
    cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.CPU_FAULT_SENSOR_DATA = os.path.join(
        cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')
    return True
def initialize(self, conf_reader, msgQlist, product):
    """initialize configuration reader and internal msg queues"""

    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(CPUFaultSensor, self).initialize(conf_reader)
    super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

    self._site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, 'DC01')
    self._rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, 'RC01')
    self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
    self._cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, 'CC01')

    # Get the cpu fault implementor from configuration
    cpu_fault_utility = Conf.get(
        SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", 'sysfs')

    # Creating the instance of ToolFactory class
    self.tool_factory = ToolFactory()
    try:
        # Get the instance of the utility using ToolFactory
        self._utility_instance = self._utility_instance or \
            self.tool_factory.get_instance(cpu_fault_utility)
    except Exception as e:
        logger.error(f"Error while initializing, shutting down CPUFaultSensor : {e}")
        self.shutdown()

    cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.CPU_FAULT_SENSOR_DATA = os.path.join(
        cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')
    return True
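
# The two initialize() variants above differ only in how they derive the config
# key ('upper()' vs 'capitalize()') and in their failure mode (raise vs log and
# shut down). Both rely on the "SECTION>key" lookup convention. Below is a
# minimal, hedged sketch of that convention against a nested dict; conf_get and
# sspl_conf are illustrative names only, not the real cortx Conf API.
def conf_get(index: dict, key: str, default=None):
    """Resolve a 'SECTION>key' style key against a nested dict (illustrative)."""
    node = index
    for part in key.split(">"):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

sspl_conf = {"CPUFAULTSENSOR": {"probe": "sysfs"}}
assert conf_get(sspl_conf, "CPUFAULTSENSOR>probe", "sysfs") == "sysfs"
assert conf_get(sspl_conf, "CPUFAULTSENSOR>missing", "sysfs") == "sysfs"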
def __init__(self):
    """Initialize server."""
    super().__init__()
    self.log = CustomLog(const.HEALTH_SVC_NAME)
    self.validate_server_type_support()
    self.sysfs = ToolFactory().get_instance('sysfs')
    self.sysfs.initialize()
    self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
    self.cpu_path = self.sysfs_base_path + const.CPU_PATH
    hw_resources = {
        'cpu': self.get_cpu_info,
        'platform_sensors': self.get_platform_sensors_info,
        'memory': self.get_mem_info,
        'fans': self.get_fans_info,
        'nw_ports': self.get_nw_ports_info,
        'sas_hba': self.get_sas_hba_info,
        'sas_ports': self.get_sas_ports_info,
        'disks': self.get_disks_info,
        'psus': self.get_psu_info
    }
    sw_resources = {
        'cortx_sw_services': self.get_cortx_service_info,
        'external_sw_services': self.get_external_service_info,
        'raid': self.get_raid_info
    }
    self.server_resources = {"hw": hw_resources, "sw": sw_resources}
    self._ipmi = IpmiFactory().get_implementor("ipmitool")
    self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']
def __init__(self):
    """Initialize server."""
    super().__init__()
    self.log = CustomLog(const.HEALTH_SVC_NAME)
    server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
    Platform.validate_server_type_support(self.log, ResourceMapError, server_type)
    self.sysfs = ToolFactory().get_instance('sysfs')
    self.sysfs.initialize()
    self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
    self.cpu_path = self.sysfs_base_path + const.CPU_PATH
    hw_resources = {
        'cpu': self.get_cpu_info,
        'platform_sensor': self.get_platform_sensors_info,
        'memory': self.get_mem_info,
        'fan': self.get_fans_info,
        'nw_port': self.get_nw_ports_info,
        'sas_hba': self.get_sas_hba_info,
        'sas_port': self.get_sas_ports_info,
        'disk': self.get_disks_info,
        'psu': self.get_psu_info
    }
    sw_resources = {
        'cortx_sw_services': self.get_cortx_service_info,
        'external_sw_services': self.get_external_service_info,
        'raid': self.get_raid_info
    }
    self.server_resources = {"hw": hw_resources, "sw": sw_resources}
    self._ipmi = IpmiFactory().get_implementor("ipmitool")
    self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']
    self.service = Service()
    self.resource_indexing_map = ServerResourceMap.resource_indexing_map["health"]
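
# server_resources above is a dispatch table from resource names to collector
# methods. A health scan presumably walks it as in the hedged sketch below;
# collect_all and the per-resource error shape are assumptions for illustration,
# not the actual ServerHealth API.
def collect_all(server_resources: dict) -> dict:
    """Walk the {"hw": {...}, "sw": {...}} dispatch table and gather results."""
    health = {}
    for group, resources in server_resources.items():
        health[group] = {}
        for name, collector in resources.items():
            try:
                health[group][name] = collector()
            except Exception as err:
                # Record the failure instead of aborting the whole scan
                health[group][name] = {"error": str(err)}
    return health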
class MemFaultSensor(SensorThread, InternalMsgQ):
    """Memory fault sensor which runs on its own thread once every power cycle
       and is responsible for identifying the total RAM memory on the node and
       any errors in it using the available tool/utility"""

    SENSOR_NAME = "MemFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:memory"

    # Section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    SITE_ID_KEY = "site_id"
    CLUSTER_ID_KEY = "cluster_id"
    NODE_ID_KEY = "node_id"
    RACK_ID_KEY = "rack_id"
    POLLING_INTERVAL_KEY = "polling_interval"
    CACHE_DIR_NAME = "server"
    RESOURCE_ID = "0"
    DEFAULT_POLLING_INTERVAL = '0'
    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return MemFaultSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(MemFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
        # Initialize the utility instance
        self._utility_instance = utility_instance
        self.total_mem = None
        self.mem_path_file = None
        self.prev_mem = None
        self.fault_alert_state = "Neutral State"
        # Flag to indicate suspension of module
        self._suspended = False

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(MemFaultSensor, self).initialize(conf_reader)
        super(MemFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.SITE_ID_KEY),
            '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.CLUSTER_ID_KEY),
            '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.RACK_ID_KEY),
            '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.NODE_ID_KEY),
            '001')

        # Get the mem fault implementor from configuration
        mem_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, "procfs")

        self.polling_interval = int(
            self._conf_reader._get_value_with_default(
                self.SENSOR_NAME.upper(), self.POLLING_INTERVAL_KEY,
                self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()
        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                self.tool_factory.get_instance(mem_fault_utility)
            # self._utility_instance.initialize()
        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. "
                "Hence shutting down the sensor {}".format(
                    mem_fault_utility, MemFaultSensor.SENSOR_NAME))
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.MEM_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'MEM_FAULT_SENSOR_DATA_{self._node_id}')
        return True

    def get_stored_mem_info(self):
        """Get the memory info from consul"""
        if store.exists(self.MEM_FAULT_SENSOR_DATA):
            consul_data = (store.get(self.MEM_FAULT_SENSOR_DATA)).split(":")
            self.prev_mem = consul_data[0].strip()
            self.fault_alert_state = consul_data[1].strip()

    def put_mem_info(self, total_memory_size):
        """Store the current memory in Consul"""
        store.put(f"{total_memory_size}:{self.fault_alert_state}",
                  self.MEM_FAULT_SENSOR_DATA)

    def run(self):
        """Run the sensor on its own thread"""
        alert_type = "fault"

        mem_path = self._utility_instance.get_proc_memory('meminfo')
        if mem_path.is_file():
            self.mem_path_file = mem_path.read_text()
            mem_info_fields = self.mem_path_file.split()

            if mem_info_fields[0] == 'MemTotal:':
                self.total_mem = mem_info_fields[1]

                # Get data from the store if available and compare to the current value
                self.get_stored_mem_info()

                if self.prev_mem is not None:
                    # Both fault and fault_resolved conditions are handled.
                    if int(self.prev_mem) > int(self.total_mem):
                        # Update the store with the new value and raise an alert
                        # of type "fault"
                        if self.fault_alert_state == "Neutral State":
                            self.fault_alert_state = "Fault Generated"
                            self._generate_alert(alert_type)
                            self.put_mem_info(self.prev_mem)
                    elif (int(self.prev_mem) <= int(self.total_mem)) and \
                            (self.fault_alert_state == "Fault Generated"):
                        self.fault_alert_state = "Neutral State"
                        alert_type = "fault_resolved"
                        self._generate_alert(alert_type)
                        self.put_mem_info(self.total_mem)
                else:
                    self.put_mem_info(self.total_mem)
            else:
                logger.error("MemFaultSensor: invalid file, shutting down the sensor")
                self.shutdown()
                return True
        else:
            logger.error("MemFaultSensor: file does not exist, shutting down the sensor")
            self.shutdown()
            return True

        # Do not proceed if the module is suspended.
        # The memory sensor triggers only during SSPL reboot; at reboot time a
        # sensor cannot be in suspended state.
        # Commented code is retained in case the sensor is made periodic in future.
        # if self._suspended is True:
        #     self._scheduler.enter(self.polling_interval, self._priority, self.run, ())
        #     return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # Self-scheduling is commented out so that the process runs only once per
        # SSPL reboot. Enable with the correct polling_interval if the memory
        # sensor needs to run periodically in future.
        # self._scheduler.enter(self.polling_interval, self._priority, self.run, ())

    def _create_json_message(self, alert_type):
        """Creates a defined json message structure which can flow inside SSPL modules"""
        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))
        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []

        if alert_type == "fault":
            specific_info["event"] = \
                "Total available main memory value decreased from {} kB to {} kB".format(
                    self.prev_mem, self.total_mem)
        elif alert_type == "fault_resolved":
            specific_info["event"] = \
                "Total main memory value available {} kB".format(self.total_mem)

        # Populate all the data from /proc/meminfo
        split_strs = [s.split(maxsplit=1) for s in self.mem_path_file.splitlines()]
        specific_info["meminfo"] = dict(split_strs)
        specific_info_list.append(specific_info)
        alert_specific_info = specific_info_list

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID,
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })
        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of epoch_time and salt value"""
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type):
        """Queues the message to NodeData Message Handler"""
        json_msg = self._create_json_message(alert_type)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(MemFaultSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(MemFaultSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(MemFaultSensor, self).shutdown()
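
# The fault logic in MemFaultSensor.run() reduces to a two-state machine keyed
# on the stored "total:state" string. A self-contained sketch of that transition
# table follows; next_state is an illustrative name, not part of the sensor's API.
def next_state(prev_kb: int, curr_kb: int, state: str):
    """Return (new_state, alert_or_None), mirroring MemFaultSensor.run()."""
    if prev_kb > curr_kb and state == "Neutral State":
        return "Fault Generated", "fault"
    if prev_kb <= curr_kb and state == "Fault Generated":
        return "Neutral State", "fault_resolved"
    return state, None

assert next_state(16384, 8192, "Neutral State") == ("Fault Generated", "fault")
assert next_state(8192, 16384, "Fault Generated") == ("Neutral State", "fault_resolved")
assert next_state(16384, 16384, "Neutral State") == ("Neutral State", None)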
class CPUFaultSensor(SensorThread, InternalMsgQ):
    """CPU Fault Sensor which runs on its own thread on each boot up and
       is responsible for sensing changes in online CPUs using
       the available tool/utility"""

    SENSOR_NAME = "CPUFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:cpu:core"

    # Section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    SITE_ID_KEY = "site_id"
    CLUSTER_ID_KEY = "cluster_id"
    NODE_ID_KEY = "node_id"
    RACK_ID_KEY = "rack_id"
    CACHE_DIR_NAME = "server"
    RESOURCE_ID = "CPU-"
    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return CPUFaultSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(CPUFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
        # Initialize the utility instance
        self._utility_instance = utility_instance
        # CPU info
        self.stored_cpu_info = None
        self.prev_cpu_info = None
        self.current_cpu_info = None

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(CPUFaultSensor, self).initialize(conf_reader)
        super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.SITE_ID_KEY),
            '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.CLUSTER_ID_KEY),
            '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.RACK_ID_KEY),
            '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(self.NODE_ID_KEY),
            '001')

        # Get the cpu fault implementor from configuration
        cpu_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, 'sysfs')

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()
        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                self.tool_factory.get_instance(cpu_fault_utility)
        except Exception as e:
            logger.error(f"Error while initializing, shutting down CPUFaultSensor : {e}")
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.CPU_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')
        return True

    def read_stored_cpu_info(self):
        """Read the most recent stored cpu info"""
        try:
            if self.stored_cpu_info is None:
                self.stored_cpu_info = store.get(self.CPU_FAULT_SENSOR_DATA)
            if self.stored_cpu_info is not None and self._node_id in self.stored_cpu_info:
                self.prev_cpu_info = self.stored_cpu_info[self._node_id]['CPU_LIST']
        except Exception as e:
            logger.error(
                f"Error while reading stored cpu info, shutting down CPUFaultSensor : {e}")
            self.shutdown()

    def read_current_cpu_info(self):
        """Read current cpu info"""
        try:
            self.current_cpu_info = self._utility_instance.get_cpu_info()
        except Exception as e:
            logger.error(
                f"Error while reading current cpu info, shutting down CPUFaultSensor : {e}")
            self.shutdown()

    def run(self):
        """Run the sensor on its own thread"""

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # Read recent stored cpu info
        self.read_stored_cpu_info()
        # Store alerts to be sent here
        self.alerts_for = {}
        # Specific info field for alerts
        self.specific_info = []
        # Read current cpu info
        self.read_current_cpu_info()

        to_update = False
        # Compare with previous cpu info:
        # - if a cpu is present in prev_cpu_info and not in current_cpu_info,
        #   a fault alert is generated
        # - if a cpu is present in current_cpu_info and not in prev_cpu_info,
        #   there are two possibilities:
        #   1) the cpu has an outstanding fault alert: it is a repaired cpu,
        #      hence generate fault_resolved
        #   2) the cpu has no outstanding alert: it is a newly added cpu, do nothing
        try:
            if self.prev_cpu_info:
                if self.current_cpu_info != self.prev_cpu_info:
                    # Create a set of all relevant cpus
                    cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
                    # Iterate through the set
                    for cpu in cpu_list:
                        if cpu not in self.current_cpu_info and \
                                cpu not in self.stored_cpu_info[self._node_id]['FAULT_LIST']:
                            # This is a failed cpu
                            self.stored_cpu_info[self._node_id]['FAULT_LIST'].append(cpu)
                            self.alerts_for[cpu] = "fault"
                        elif cpu not in self.prev_cpu_info and \
                                cpu in self.stored_cpu_info[self._node_id]['FAULT_LIST']:
                            # This is a repaired cpu
                            self.alerts_for[cpu] = "fault_resolved"
                    # Update stored cpu info for the next run
                    self.stored_cpu_info[self._node_id]['CPU_LIST'] = self.current_cpu_info
                    to_update = True
            else:
                # Previous cpu info not available, need to store current info
                if not self.stored_cpu_info:
                    # No info is available
                    self.stored_cpu_info = {}
                # Add info for the current node
                self.stored_cpu_info[self._node_id] = {}
                self.stored_cpu_info[self._node_id]['CPU_LIST'] = self.current_cpu_info
                self.stored_cpu_info[self._node_id]['FAULT_LIST'] = []
                # Update stored cpu info
                to_update = True
        except Exception as e:
            logger.error(
                f"Error while processing cpu info, shutting down CPUFaultSensor : {e}")
            self.shutdown()

        # Send alerts
        for cpu, alert_type in self.alerts_for.items():
            if self._generate_alert(cpu, alert_type) and alert_type == "fault_resolved":
                # Delete from the FAULT_LIST
                self.stored_cpu_info[self._node_id]['FAULT_LIST'].remove(cpu)

        # Update stored cpu info
        if to_update:
            store.put(self.stored_cpu_info, self.CPU_FAULT_SENSOR_DATA)

    def fill_specific_info(self):
        """Fills the specific info to be sent via alert"""
        if not self.specific_info:
            # Create a set of all relevant cpus
            cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
            # Iterate through the set
            for cpu in cpu_list:
                item = {}
                item['resource_id'] = self.RESOURCE_ID + str(cpu)
                # Keep default state online
                item['state'] = "online"
                if cpu in self.alerts_for:
                    if self.alerts_for[cpu] == "fault":
                        item['state'] = "offline"
                self.specific_info.append(item)

    def _create_json_message(self, cpu, alert_type):
        """Creates a defined json message structure which can flow inside SSPL modules"""
        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))
        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        # Populate specific info
        self.fill_specific_info()
        alert_specific_info = self.specific_info

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID + str(cpu),
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })
        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of epoch_time and salt value"""
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, cpu, alert_type):
        """Queues the message to NodeData Message Handler"""
        try:
            json_msg = self._create_json_message(cpu, alert_type)
            if json_msg:
                # RAAL stands for - RAise ALert
                logger.info(f"RAAL: {json_msg}")
                self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)
            return True
        except Exception as e:
            logger.error(f"Exception while sending alert : {e}")
        return False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(CPUFaultSensor, self).shutdown()
class NodeData(Debug):
    """Obtains data about the node and makes it available"""

    SENSOR_NAME = "NodeData"

    # Conf attribute initialization
    PROBE = 'probe'

    @staticmethod
    def name():
        """@return: name of the module."""
        return NodeData.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return ("Server CPU, network, disk space, process and local mount "
                "data cannot be monitored.")

    def __init__(self):
        super(NodeData, self).__init__()

        self.os_utils = OSUtils()
        self._epoch_time = str(int(time.time()))
        # Total number of CPUs
        self.cpus = psutil.cpu_count()
        self.host_id = self.os_utils.get_fqdn()

        # Calculate the load averages on separate blocking threads
        self.load_1min_average = []
        self.load_5min_average = []
        self.load_15min_average = []
        self.prev_bmcip = None
        threading.Thread(target=self._load_1min_avg).start()
        threading.Thread(target=self._load_5min_avg).start()
        threading.Thread(target=self._load_15min_avg).start()

        self.conf_reader = ConfigReader()

        nw_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self._utility_instance = None
        try:
            # Creating the instance of ToolFactory class
            self.tool_factory = ToolFactory()
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                self.tool_factory.get_instance(nw_fault_utility)
            if self._utility_instance:
                # Initialize the path as /sys/class/net/
                self.nw_interface_path = self._utility_instance.get_sys_dir_path('net')
        except KeyError as key_error:
            logger.error(
                f'NodeData, Unable to get the instance of {nw_fault_utility} Utility')
        except Exception as err:
            logger.error(
                f'NodeData, Problem occurred while getting the instance of {nw_fault_utility}')

    def read_data(self, subset, debug, units="MB"):
        """Updates data based on a subset"""
        self._set_debug(debug)
        self._log_debug("read_data, subset: %s, units: %s" % (subset, units))

        try:
            # Determine the units factor value
            self.units_factor = 1
            if units == "GB":
                self.units_factor = 1000000000
            elif units == "MB":
                self.units_factor = 1000000
            elif units == "KB":
                self.units_factor = 1000

            # get_fqdn() checks socket.gethostname() for the host name; if that
            # is not available, it tries to find the host name from
            # socket.gethostbyaddr(socket.gethostname())[0] and returns a
            # meaningful host name.
            self.host_id = self.os_utils.get_fqdn()
            self.local_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')

            # Branch off and gather data based upon value sent into subset
            if subset == "host_update":
                self._get_host_update_data()
            elif subset == "local_mount_data":
                self._get_local_mount_data()
            elif subset == "cpu_data":
                self._get_cpu_data()
            elif subset == "if_data":
                self._get_if_data()
            elif subset == "disk_space_alert":
                self._get_disk_space_alert_data()
        except Exception as e:
            raise Exception(f"Failed to read data, {e}")
        return True

    def _get_host_update_data(self):
        """Retrieves node information for the host_update json message"""
        logged_in_users = []
        uname_keys = ("sysname", "nodename", "version", "release", "machine")
        self.up_time = int(psutil.boot_time())
        self.boot_time = self._epoch_time
        self.uname = dict(zip(uname_keys, os.uname()))
        self.total_memory = dict(psutil.virtual_memory()._asdict())
        self.process_count = len(psutil.pids())
        for users in psutil.users():
            logged_in_users.append(dict(users._asdict()))
        self.logged_in_users = logged_in_users

        # Calculate the current number of running processes at this moment
        total_running_proc = 0
        for proc in psutil.process_iter():
            try:
                pinfo = proc.as_dict(attrs=['status'])
                if pinfo['status'] not in (psutil.STATUS_ZOMBIE,
                                           psutil.STATUS_DEAD,
                                           psutil.STATUS_STOPPED,
                                           psutil.STATUS_IDLE,
                                           psutil.STATUS_SLEEPING):
                    total_running_proc += 1
            except psutil.NoSuchProcess:
                logger.warn(
                    f"(psutil) Process '{proc.name()}' exited unexpectedly.")
        self.running_process_count = total_running_proc

    def _get_local_mount_data(self):
        """Retrieves node information for the local_mount_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(self.units_factor)
        self.total_swap = int(psutil.swap_memory()[0]) // int(self.units_factor)
        self.free_swap = int(psutil.swap_memory()[2]) // int(self.units_factor)
        self.free_inodes = int(
            100 - math.ceil((float(os.statvfs("/").f_files - os.statvfs("/").f_ffree)
                             / os.statvfs("/").f_files) * 100))

    def _get_cpu_data(self):
        """Retrieves node information for the cpu_data json message"""
        cpu_core_usage_dict = dict()
        cpu_data = psutil.cpu_times_percent()
        self._log_debug(
            "_get_cpu_data, cpu_data: %s %s %s %s %s %s %s %s %s %s" % cpu_data)

        self.csps = 0  # csps (cycles per second?) is not measured; default to 0
        self.user_time = int(cpu_data[0])
        self.nice_time = int(cpu_data[1])
        self.system_time = int(cpu_data[2])
        self.idle_time = int(cpu_data[3])
        self.iowait_time = int(cpu_data[4])
        self.interrupt_time = int(cpu_data[5])
        self.softirq_time = int(cpu_data[6])
        self.steal_time = int(cpu_data[7])

        self.cpu_usage = psutil.cpu_percent(interval=1, percpu=False)
        # Array to hold data about each CPU core
        self.cpu_core_data = []
        index = 0
        while index < self.cpus:
            self._log_debug(
                "_get_cpu_data, index: %s, 1 min: %s, 5 min: %s, 15 min: %s" %
                (index, self.load_1min_average[index],
                 self.load_5min_average[index], self.load_15min_average[index]))
            cpu_core_data = {
                "coreId": index,
                "load1MinAvg": int(self.load_1min_average[index]),
                "load5MinAvg": int(self.load_5min_average[index]),
                "load15MinAvg": int(self.load_15min_average[index]),
                "ips": 0
            }
            self.cpu_core_data.append(cpu_core_data)
            index += 1

    def _get_if_data(self):
        """Retrieves node information for the if_data json message"""
        net_data = psutil.net_io_counters(pernic=True)

        # Array to hold data about each network interface
        self.if_data = []
        bmc_data = self._get_bmc_info()
        for interface, if_data in net_data.items():
            self._log_debug("_get_if_data, interface: %s %s" % (interface, net_data))
            nw_status = self._fetch_nw_status()
            nw_cable_conn_status = self.fetch_nw_cable_conn_status(interface)
            if_data = {
                "ifId": interface,
                "networkErrors": (net_data[interface].errin +
                                  net_data[interface].errout),
                "droppedPacketsIn": net_data[interface].dropin,
                "packetsIn": net_data[interface].packets_recv,
                "trafficIn": net_data[interface].bytes_recv,
                "droppedPacketsOut": net_data[interface].dropout,
                "packetsOut": net_data[interface].packets_sent,
                "trafficOut": net_data[interface].bytes_sent,
                "nwStatus": nw_status[interface][0],
                "ipV4": nw_status[interface][1],
                "nwCableConnStatus": nw_cable_conn_status
            }
            self.if_data.append(if_data)
        self.if_data.append(bmc_data)

    def _fetch_nw_status(self):
        nw_dict = {}
        nws = os.popen("ip --br a | awk '{print $1, $2, $3}'").read().split('\n')[:-1]
        for nw in nws:
            if nw.split(' ')[2]:
                ip = nw.split(' ')[2].split("/")[0]
            else:
                ip = ""
            nw_dict[nw.split(' ')[0]] = [nw.split(' ')[1], ip]
        logger.debug("network info being sent: {}".format(nw_dict))
        return nw_dict

    def fetch_nw_cable_conn_status(self, interface):
        carrier_status = None
        try:
            carrier_status = Network().get_link_state(interface)
        except NetworkError as err:
            # NetworkError i.e. all OSError exceptions indicate that
            # the carrier file is not available to access, which
            # constitutes the UNKNOWN status for the network cable.
            logger.debug(err)
            carrier_status = "UNKNOWN"
        except Exception as e:
            # All other exceptions are unexpected and are logged as errors.
            logger.exception(
                "Problem occurred while reading from the nw carrier file:"
                f" {self.nw_interface_path}/{interface}/carrier. Error: {e}")
        return carrier_status

    def _get_bmc_info(self):
        """
        nwCableConnection will default to UNKNOWN until a way to find the
        bmc eth port cable connection status is found.
        """
        try:
            bmcdata = {
                'ifId': 'ebmc0',
                'ipV4Prev': "",
                'ipV4': "",
                'nwStatus': "DOWN",
                'nwCableConnStatus': 'UNKNOWN'
            }
            ipdata = sp.Popen(
                "sudo ipmitool lan print",
                shell=True, stdout=sp.PIPE, stderr=sp.PIPE
            ).communicate()[0].decode().strip()
            bmcip = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ipdata)
            if bmcip:
                bmcip = bmcip[0]
                pingbmchost = "ping -c1 -W1 -q " + bmcip
                child = sp.Popen(pingbmchost.split(), stdout=sp.PIPE)
                # child must be communicated with before fetching the return code
                streamdata = child.communicate()[0]
                retcode = child.returncode
                if self.prev_bmcip is not None and self.prev_bmcip != bmcip:
                    bmcdata['ipV4Prev'] = self.prev_bmcip
                    bmcdata['ipV4'] = bmcip
                    self.prev_bmcip = bmcip
                else:
                    self.prev_bmcip = bmcdata['ipV4Prev'] = bmcdata['ipV4'] = bmcip
                if retcode == 0:
                    bmcdata['nwStatus'] = "UP"
                else:
                    logger.warn("BMC Host:{0} is not reachable".format(bmcip))
        except Exception as e:
            logger.error("Exception occurred while fetching bmc_info: {}".format(e))
        return bmcdata

    def _get_disk_space_alert_data(self):
        """Retrieves node information for the disk_space_alert_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(self.units_factor)
        self.disk_used_percentage = psutil.disk_usage("/")[3]

    def _load_1min_avg(self):
        """Loop forever calculating the one minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_1min_average.append(-1)
            index += 1

        while True:
            # The API call blocks for the sampling interval and then returns the value
            self.load_1min_average = psutil.cpu_percent(interval=1, percpu=True)

    def _load_5min_avg(self):
        """Loop forever calculating the five minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_5min_average.append(-1)
            index += 1

        while True:
            # The API call blocks for the sampling interval and then returns the value
            self.load_5min_average = psutil.cpu_percent(interval=5, percpu=True)

    def _load_15min_avg(self):
        """Loop forever calculating the fifteen minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_15min_average.append(-1)
            index += 1

        while True:
            # The API call blocks for the sampling interval and then returns the value
            self.load_15min_average = psutil.cpu_percent(interval=15, percpu=True)
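
# NodeData._fetch_nw_status() shells out to `ip --br a` and reduces each row to
# name, state, and first address. A hedged, standalone sketch of that parsing on
# canned output follows; the sample lines and parse_ip_brief are illustrative.
def parse_ip_brief(output: str) -> dict:
    """Parse `ip --br a`-style rows into {ifname: [state, ipv4]}."""
    nw_dict = {}
    for line in output.strip().splitlines():
        fields = line.split()
        name, state = fields[0], fields[1]
        # Strip the prefix length from the first address, if any
        ip = fields[2].split("/")[0] if len(fields) > 2 else ""
        nw_dict[name] = [state, ip]
    return nw_dict

sample = "lo UNKNOWN 127.0.0.1/8\neth0 UP 10.0.0.5/24\neth1 DOWN"
assert parse_ip_brief(sample)["eth0"] == ["UP", "10.0.0.5"]
assert parse_ip_brief(sample)["eth1"] == ["DOWN", ""]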
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
       is responsible for sensing changes in SAS ports/cables using
       the available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # Section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    SITE_ID = "site_id"
    CLUSTER_ID = "cluster_id"
    NODE_ID = "node_id"
    RACK_ID = "rack_id"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"
    RESOURCE_ID = "SASHBA-0"
    DEFAULT_POLLING_INTERVAL = '30'
    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    # Number of SAS Ports
    NUM_SAS_PORTS = 4
    # Number of Phys in a Port
    NUM_PHYS_PER_PORT = 4
    # Current Data Version
    CURRENT_DATA_VERSION = 1

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
        # Initialize the utility instance
        self._utility_instance = utility_instance
        self.phy_dir_to_linkrate_mapping = None
        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0
        self.sas_ports_status = {}
        self.port_phy_list_dict = {}
        self.sas_phy_stored_alert = None

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)
        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, 'CC01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self.polling_interval = int(
            Conf.get(SSPL_CONF,
                     f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
                     self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()
        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')
        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None
            link_value_phy_status_collection = ()

            # Call to the sas phy directory, which returns a dictionary
            # mapping phy_name to negotiated link rate
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over the populated dictionary and restructure it
            # Ex: if phy-0:0 is 12.0/6.0/3.0, it is considered UP.
            # {"phy-0:0": ("link_rate", <Up/Down>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'Gbit'.lower() in value.strip().lower():
                    phy_status = 'up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert()
        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. "
                "Hence shutting down the sensor".format(sas_port_utility))
            self.shutdown()
        except Exception as e:
            err_no = getattr(e, 'errno', None)
            if err_no == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from the sas_phy directory: "
                    "the directory path doesn't exist. Hence shutting down the sensor")
            elif err_no == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from the sas_phy directory: "
                    "not enough permission to read from the directory. "
                    "Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from the sas_phy directory. "
                    "{0}. Hence shutting down the sensor".format(e))
            self.shutdown()
        return True

    def update_sas_ports_status(self):
        """
        Reads the current phy status and updates port connectivity status.
        Assumption: phys will be present in multiples of 4.
        """
        phy_list = [*self.phy_dir_to_linkrate_mapping]
        phy_list = sort_phy_list(phy_list)

        # Now we have a sorted list of phys.
        # Phys 0-3 form the 0th sas port, and so on in groups of 4 phys.
        # List containing status of all phys
        hba = []
        for phy in phy_list:
            if self.phy_dir_to_linkrate_mapping[phy][1] == 'up':
                hba.append(1)
            else:
                hba.append(0)

        for i in range(0, self.NUM_SAS_PORTS):
            # Save phy names forming this port for future use
            self.port_phy_list_dict[i] = phy_list[
                self.NUM_PHYS_PER_PORT * i:
                self.NUM_PHYS_PER_PORT * i + self.NUM_PHYS_PER_PORT]
            # Check port status
            s = set(hba[self.NUM_PHYS_PER_PORT * i:
                        self.NUM_PHYS_PER_PORT * i + self.NUM_PHYS_PER_PORT])
            if len(s) == 1 and 0 in s:
                port_status = 'down'
            elif len(s) == 1 and 1 in s:
                port_status = 'up'
            else:
                port_status = 'degraded'
            # Store the data
            self.sas_ports_status[i] = port_status

    def check_and_send_conn_alert(self):
        """
        Sends a conn fault alert if all phys go down.
        Sends a conn fault_resolved alert if at least one sas port (4 phys) comes up.
        """
        # Case 1 : all fault for fault alert
        cur_all_fault = True
        # Case 2 : all fault_resolved for fault_resolved alert
        cur_all_fault_resolved = True

        # Previous conn alert that was sent
        prev_conn_alert = self.sas_phy_stored_alert['conn']

        # Compute the current state across ports
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # This is the key for the conn alert itself, skip
                continue
            # Case 1 : All faults in current status
            if value != 'fault':
                cur_all_fault = False
            # Case 2 : All fault_resolved in current status
            elif value != 'fault_resolved':
                cur_all_fault_resolved = False

        if prev_conn_alert == 'fault_resolved' and cur_all_fault:
            # Send conn fault alert
            alert_type = 'fault'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type
        elif prev_conn_alert == 'fault' and cur_all_fault_resolved:
            # Send conn fault_resolved alert
            alert_type = 'fault_resolved'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type

    def handle_current_version_data(self):
        """Contains logic to check and send alert if data has version == 1."""
        # Compare the current status of each port with the previous alert_type
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # Skip
                continue
            if value == 'fault_resolved' and \
                    self.sas_ports_status[port] == 'down':
                alert_type = 'fault'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
            elif value == 'fault' and \
                    self.sas_ports_status[port] == 'up':
                alert_type = 'fault_resolved'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
        # See if a conn failure/conn resolved alert needs to be sent
        self.check_and_send_conn_alert()
        # Save data to store
        store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    def check_and_send_alert(self):
        """Checks whether conditions are met and sends alert if required.
        Alerts will be sent if -
        1. All 4 phys of a sas port go up -> down : fault alert
        2. All 4 phys of a sas port come down -> up : fault_resolved alert
        Sensor data stored in persistent storage is a dict of
        {sas_port_number: alert_type}
        """
        # Update sas ports status
        self.update_sas_ports_status()

        # Check the version of the stored alert
        version = None
        try:
            # An exception will be raised if the stored alert is None
            # or no version is available
            version = self.sas_phy_stored_alert['version']
        except Exception:
            logger.warning(
                f"Found no data or old data format for SASPortSensor, "
                f"updating data format to version {self.CURRENT_DATA_VERSION}")
            # Versioning is not implemented or there is no data, write new data.
            # Initialize dummy fault_resolved for all sas ports and conn.
            self.sas_phy_stored_alert = {}
            self.sas_phy_stored_alert['version'] = self.CURRENT_DATA_VERSION
            self.sas_phy_stored_alert['conn'] = 'fault_resolved'
            for i in range(0, self.NUM_SAS_PORTS):
                self.sas_phy_stored_alert[i] = 'fault_resolved'
            # Save data to store
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

        if version == self.CURRENT_DATA_VERSION:
            self.handle_current_version_data()

    def run(self):
        """Run the sensor on its own thread"""
        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if the module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority, self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare the local dict with the global dictionary for a
                    # change in the negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If the current link rate has no value like 12/6/3 Gbit
                        # and it was previously up, it's a fault condition
                        if 'Gbit'.lower() not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment count of phys newly down which were up previously
                            new_phy_down += 1
                            # Mark the respective phy_status as fault
                            status = 'fault'
                        # If 12/6/3 Gbit is in the current link rate and the
                        # previous alert_type is fault, the phy is up again
                        elif 'Gbit'.lower() in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':
                            # Mark the respective phy_status as up
                            status = 'up'
                            # Increment count of newly up phys
                            new_phy_up += 1

                    # Finally update the global dict with the current link rate
                    # and respective phy status
                    self.phy_dir_to_linkrate_mapping[key] = (link_rate, status)

                # Get current phy status i.e. number of up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert()
                # Update the current active phy count for the next iteration
                self.phy_link_count = new_phy_link_count
        except Exception as ae:
            logger.exception(ae)

        # Fire every 30 seconds to see if there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority, self.run, ())

    def _create_json_message(self, alert_type, port):
        """Creates a defined json message structure which can flow inside SSPL modules"""
        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))
        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []
        description = "N/A"

        # specific_info will contain all 16 phys for a conn level alert,
        # but only 4 phys for a port level alert
        for key, val in self.phy_dir_to_linkrate_mapping.items():
            if port != -1:
                # This is a port level alert, skip phys that are not relevant
                if key not in self.port_phy_list_dict[port]:
                    # Skip adding this phy
                    continue
            # Key will be phy-0:0, so split it using ':'
            # The structure will be SASHBA-0:phy-0
            phy_number = key.split(":")[1]
            specific_info["resource_id"] = self.RESOURCE_ID + ':' + "phy-" + phy_number
            specific_info["negotiated_link_rate"] = \
                self.phy_dir_to_linkrate_mapping[key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        if port == -1:
            # This is a SAS HBA level connection alert
            if alert_type == 'fault':
                description = ("SAS connection error detected in SAS HBA %s."
                               % self.RESOURCE_ID)
            elif alert_type == 'fault_resolved':
                description = ("SAS connection re-established in SAS HBA %s."
                               % self.RESOURCE_ID)

            info = {
                "site_id": self._site_id,
                "cluster_id": self._cluster_id,
                "rack_id": self._rack_id,
                "node_id": self._node_id,
                "resource_type": self.RESOURCE_TYPE,    # node:interface:sas
                "resource_id": self.RESOURCE_ID,        # SASHBA-0
                "event_time": epoch_time,
                "description": description
            }
        else:
            # This is a port level alert
            if alert_type == 'fault':
                description = (
                    "No connectivity detected on the SAS port %s, possible "
                    "causes could be missing SAS cable, bad cable connection, "
                    "faulty cable or SAS port failure." % port)
            elif alert_type == 'fault_resolved':
                description = "Connection established on SAS port."
            info = {
                "site_id": self._site_id,
                "cluster_id": self._cluster_id,
                "rack_id": self._rack_id,
                "node_id": self._node_id,
                "resource_type": self.RESOURCE_TYPE + ':port',      # node:interface:sas:port
                "resource_id": self.RESOURCE_ID + f'-port-{port}',  # SASHBA-0-port-0
                "event_time": epoch_time,
                "description": description
            }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })
        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of epoch_time and salt value"""
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type, port):
        """Queues the message to NodeData Message Handler"""
        json_msg = self._create_json_message(alert_type, port)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()
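
# update_sas_ports_status() above reduces each group of four phys to a single
# port status. A hedged, standalone sketch of that reduction follows; it assumes
# the documented 4-phys-per-port layout, and port_statuses is an illustrative name.
NUM_PHYS_PER_PORT = 4

def port_statuses(phy_up_flags: list) -> dict:
    """Reduce per-phy up(1)/down(0) flags to per-port up/degraded/down."""
    statuses = {}
    for port in range(len(phy_up_flags) // NUM_PHYS_PER_PORT):
        group = set(phy_up_flags[NUM_PHYS_PER_PORT * port:
                                 NUM_PHYS_PER_PORT * (port + 1)])
        if group == {1}:
            statuses[port] = 'up'
        elif group == {0}:
            statuses[port] = 'down'
        else:
            statuses[port] = 'degraded'
    return statuses

assert port_statuses([1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1]) == \
    {0: 'up', 1: 'down', 2: 'degraded'}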
class SASPortSensor(SensorThread, InternalMsgQ): """SAS Port Sensor which runs on its own thread periodically and is responsible for sensing changes is SAS ports/cable using available tool/utility""" SENSOR_NAME = "SASPortSensor" PRIORITY = 1 RESOURCE_TYPE = "node:interface:sas" # section in the configuration store SYSTEM_INFORMATION = "SYSTEM_INFORMATION" SITE_ID = "site_id" CLUSTER_ID = "cluster_id" NODE_ID = "node_id" RACK_ID = "rack_id" POLLING_INTERVAL = "polling_interval" CACHE_DIR_NAME = "server" RESOURCE_ID = "SASHBA-0" DEFAULT_POLLING_INTERVAL = '30' PROBE = "probe" # Dependency list DEPENDENCIES = { "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"], "rpms": [] } MIN_PHY_COUNT = 4 @staticmethod def name(): """@return: name of the module.""" return SASPortSensor.SENSOR_NAME def __init__(self, utility_instance=None): """init method""" super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) # Initialize the utility instance self._utility_instance = utility_instance self.phy_dir_to_linkrate_mapping = None # Flag to indicate suspension of module self._suspended = False self._count = 0 self.phy_link_count = 0 def initialize(self, conf_reader, msgQlist, product): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(SASPortSensor, self).initialize(conf_reader) super(SASPortSensor, self).initialize_msgQ(msgQlist) self._site_id = self._conf_reader._get_value_with_default( self.SYSTEM_INFORMATION, COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.SITE_ID), '001') self._cluster_id = self._conf_reader._get_value_with_default( self.SYSTEM_INFORMATION, COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.CLUSTER_ID), '001') self._rack_id = self._conf_reader._get_value_with_default( self.SYSTEM_INFORMATION, COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.RACK_ID), '001') self._node_id = self._conf_reader._get_value_with_default( self.SYSTEM_INFORMATION, COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.NODE_ID), '001') # Get the sas port implementor from configuration sas_port_utility = self._conf_reader._get_value_with_default( self.name().capitalize(), self.PROBE, "sysfs") self.polling_interval = int( self._conf_reader._get_value_with_default( self.SENSOR_NAME.upper(), self.POLLING_INTERVAL, self.DEFAULT_POLLING_INTERVAL)) # Creating the instance of ToolFactory class self.tool_factory = ToolFactory() cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME) self.SAS_PORT_SENSOR_DATA = os.path.join( cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}') alert_type = None try: # Get the instance of the utility using ToolFactory self._utility_instance = self._utility_instance or \ self.tool_factory.get_instance(sas_port_utility) self._utility_instance.initialize() phy_status = None link_value_phy_status_collection = () # Call to sas phy dirctory which will return a dictionary # which has phy_name to negotiated link rate mapping # Ex: {"phy-0:0": "<12.0, Unknown>"} self.phy_dir_to_linkrate_mapping = \ self._utility_instance.get_phy_negotiated_link_rate() # Iterate over populated dictionary and restructure it # Ex: if phy-0:0 is 12.0/6.0/3.0, considered as UP. 
# {"phy-0:0": ("link_rate", <Up/Down>)} for phy, value in self.phy_dir_to_linkrate_mapping.items(): if 'Gbit'.lower() in value.strip().lower(): phy_status = 'Up' # Increment global phy_link count for UP status self.phy_link_count += 1 else: phy_status = 'fault' link_value_phy_status_collection = (value, phy_status) self.phy_dir_to_linkrate_mapping[ phy] = link_value_phy_status_collection # Get the stored previous alert info self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA) self.check_and_send_alert(self.phy_link_count) except KeyError as key_error: logger.error("Unable to get the instance of {} \ Utility. Hence shutting down the sensor".format( sas_port_utility)) self.shutdown() except Exception as e: if e == errno.ENOENT: logger.error("Problem occured while reading from sas_phy \ directory. directory path doesn't directory. Hence \ shuting down the sensor") elif e == errno.EACCES: logger.error( "Problem occured while reading from sas_phy directory. \ Not enough permission to read from the directory. \ Hence shuting down the sensor") else: logger.error( "Problem occured while reading from sas_phy directory. \ {0}. Hence shuting down the sensor".format(e)) self.shutdown() return True def check_and_send_alert(self, new_phy_link_count): """Checks whether conditions are met and sends alert if required Alerts will be sent if - 1. All phys are down -> fault alert 2. 4 phys are up -> fault_resolved alert 3. Next group of 4 phys comes up -> informational alert Sensor data stored in Consul is a tuple (alert_type, phy_link_count) """ if self.sas_phy_stored_alert == None: # No info is stored for this node in Consul # Initialize alert_type to dummy fault_resolved self.sas_phy_stored_alert = ('fault_resolved', new_phy_link_count) # Save data to Consul store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA) elif self.sas_phy_stored_alert[0] == 'fault': # Previous alert sent for this node was fault, check if fault is resolved if new_phy_link_count >= self.MIN_PHY_COUNT: alert_type = 'fault_resolved' # Send alert self._generate_alert(alert_type) # Save data to Consul self.sas_phy_stored_alert = (alert_type, new_phy_link_count) store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA) elif self.sas_phy_stored_alert[0] in ['fault_resolved', 'insertion']: # Check if we need to send informational alert if new_phy_link_count > self.sas_phy_stored_alert[ 1] and new_phy_link_count % self.MIN_PHY_COUNT == 0: alert_type = 'insertion' # Send alert self._generate_alert(alert_type) # Save data to Consul self.sas_phy_stored_alert = (alert_type, new_phy_link_count) store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA) # Check to see if we need to send fault alert if new_phy_link_count == 0: alert_type = 'fault' # Send alert self._generate_alert(alert_type) # Save data to Consul self.sas_phy_stored_alert = (alert_type, new_phy_link_count) store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA) def run(self): """Run the sensor on its own thread""" alert_type = None status = None new_phy_up = 0 new_phy_down = 0 # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(self.polling_interval, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() try: phy_link_rate_dict = \ self._utility_instance.get_phy_negotiated_link_rate() if phy_link_rate_dict: for key, value in phy_link_rate_dict.items(): link_rate = value.strip() prev_linkrate_value = \ self.phy_dir_to_linkrate_mapping[key][0].strip() 
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare the local dict against the global dictionary
                    # for a change in the negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If the current link rate has no value like
                        # 12/6/3 Gbit and the phy was previously up,
                        # then it's a fault condition
                        if 'Gbit'.lower() not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment the count of phys that went down
                            # and were previously up
                            new_phy_down += 1
                            # Mark the respective phy_status as fault
                            status = 'fault'
                        # Check if 12/6/3 Gbit is there in the current link
                        # rate and the previous alert_type is fault. If so,
                        # the phy is Up again
                        elif 'Gbit'.lower() in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':
                            # Mark the respective phy_status as Up
                            status = 'Up'
                            # Increment the count of phys newly up
                            new_phy_up += 1

                    # Finally update the global dict with the current link
                    # rate and respective phy status
                    self.phy_dir_to_linkrate_mapping[key] = (link_rate,
                                                             status)

                # Get the current phy status, i.e. the number of Up phys
                new_phy_link_count = \
                    self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                # It is a tuple of (alert_type, phy_link_count)
                self.sas_phy_stored_alert = store.get(
                    self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert(new_phy_link_count)

                # Update the current active phy count for the next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            logger.exception(ae)

        # Fire every polling interval (30 seconds by default) to see if
        # there's a change in the phy status
        self._scheduler.enter(self.polling_interval,
                              self._priority, self.run, ())

    def _create_json_message(self, alert_type):
        """Creates a defined json message structure which can flow inside
        SSPL modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []
        for key, val in self.phy_dir_to_linkrate_mapping.items():
            # key will be like phy-0:0, so split it using ':'.
            # The resulting resource_id will be like SASHBA-0:phy-0
            phy_number = key.split(":")[1]
            specific_info["resource_id"] = \
                self.RESOURCE_ID + ':' + "phy-" + phy_number
            specific_info["negotiated_link_rate"] = \
                self.phy_dir_to_linkrate_mapping[key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID,
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns an alert id which is a combination of epoch_time and a
        salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type):
        """Queues the message to the NodeData Message Handler"""
        json_msg = self._create_json_message(alert_type)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up the scheduler queue and gracefully shut down the thread"""
        super(SASPortSensor, self).shutdown()
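# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the shipped sensor: initialize() above
# reduces the raw sysfs mapping {"phy-0:0": "12.0 Gbit"} to
# {"phy-0:0": ("12.0 Gbit", "Up")} while counting the Up phys. The helper
# name classify_phys and the sample data are hypothetical; only the
# classification rule ('Gbit' substring => Up) is taken from the code above.
# ---------------------------------------------------------------------------
def classify_phys(raw_mapping):
    """Return ({phy: (link_rate, status)}, up_count) as initialize() builds."""
    classified = {}
    up_count = 0
    for phy, link_rate in raw_mapping.items():
        if 'gbit' in link_rate.strip().lower():
            status = 'Up'
            up_count += 1
        else:
            status = 'fault'
        classified[phy] = (link_rate, status)
    return classified, up_count

# classify_phys({"phy-0:0": "12.0 Gbit", "phy-0:1": "Unknown"})
# -> ({"phy-0:0": ("12.0 Gbit", "Up"), "phy-0:1": ("Unknown", "fault")}, 1)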
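# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the shipped sensor: the decision taken by
# check_and_send_alert() can be read as a pure function from the previously
# stored (alert_type, phy_link_count) tuple plus the new count to the next
# alert (or None). decide_sas_alert is a hypothetical name for this example.
# ---------------------------------------------------------------------------
MIN_PHY_COUNT = 4

def decide_sas_alert(prev_alert, prev_count, new_count):
    """Return the alert to raise for the new phy link count, or None."""
    if prev_alert is None:
        # First run on this node: the sensor only seeds the store.
        return None
    if prev_alert == 'fault':
        # Enough phys came back up: the fault is resolved.
        if new_count >= MIN_PHY_COUNT:
            return 'fault_resolved'
    elif prev_alert in ('fault_resolved', 'insertion'):
        # Another full group of MIN_PHY_COUNT phys came up.
        if new_count > prev_count and new_count % MIN_PHY_COUNT == 0:
            return 'insertion'
        # Every phy went down.
        if new_count == 0:
            return 'fault'
    return None

# Example transitions:
#   decide_sas_alert('fault', 0, 4)           -> 'fault_resolved'
#   decide_sas_alert('fault_resolved', 4, 8)  -> 'insertion'
#   decide_sas_alert('insertion', 8, 0)       -> 'fault'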
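# For reference, a trimmed example of the internal JSON message that
# _create_json_message() above produces; all field values are hypothetical:
#
# {
#   "sensor_request_type": {
#     "node_data": {
#       "status": "update",
#       "host_id": "node-1.example.com",
#       "alert_type": "fault",
#       "severity": "critical",
#       "alert_id": "1627300000<uuid4-hex>",
#       "info": {
#         "site_id": "001", "cluster_id": "001", "rack_id": "001",
#         "node_id": "001", "resource_type": "node:interface:sas",
#         "resource_id": "SASHBA-0", "event_time": "1627300000"
#       },
#       "specific_info": [
#         {"resource_id": "SASHBA-0:phy-0",
#          "negotiated_link_rate": "12.0 Gbit"}
#       ]
#     }
#   }
# }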
class ServerMap(ResourceMap):
    """ServerMap class provides the resource map and related information
    like health, manifest, etc."""

    name = "server"

    def __init__(self):
        """Initialize server."""
        super().__init__()
        self.log = CustomLog(const.HEALTH_SVC_NAME)
        self.validate_server_type_support()
        self.sysfs = ToolFactory().get_instance('sysfs')
        self.sysfs.initialize()
        self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
        self.cpu_path = self.sysfs_base_path + const.CPU_PATH
        hw_resources = {
            'cpu': self.get_cpu_info,
            'platform_sensors': self.get_platform_sensors_info,
            'memory': self.get_mem_info,
            'fans': self.get_fans_info,
            'nw_ports': self.get_nw_ports_info,
            'sas_hba': self.get_sas_hba_info,
            'sas_ports': self.get_sas_ports_info,
            'disks': self.get_disks_info,
            'psus': self.get_psu_info
        }
        sw_resources = {
            'cortx_sw_services': self.get_cortx_service_info,
            'external_sw_services': self.get_external_service_info,
            'raid': self.get_raid_info
        }
        self.server_resources = {"hw": hw_resources, "sw": sw_resources}
        self._ipmi = IpmiFactory().get_implementor("ipmitool")
        self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']

    def validate_server_type_support(self):
        """Check for supported server type."""
        server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
        logger.debug(self.log.svc_log(f"Server Type:{server_type}"))
        if not server_type:
            msg = "ConfigError: server type is unknown."
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)
        if server_type.lower() not in \
                const.RESOURCE_MAP["server_type_supported"]:
            msg = f"Health provider is not supported for server type '{server_type}'"
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)

    def get_health_info(self, rpath):
        """
        Fetch health information for given rpath.

        rpath: Resource path to fetch its health
               Examples:
                    node>compute[0]
                    node>compute[0]>hw
                    node>compute[0]>hw>disks
        """
        logger.info(self.log.svc_log(f"Get Health data for rpath:{rpath}"))
        info = {}
        resource_found = False
        nodes = rpath.strip().split(">")
        leaf_node, _ = self.get_node_details(nodes[-1])

        # Fetch health information for all sub nodes
        if leaf_node == "compute":
            info = self.get_server_health_info()
            resource_found = True
        elif leaf_node in self.server_resources:
            for resource, method in self.server_resources[leaf_node].items():
                try:
                    info.update({resource: method()})
                    resource_found = True
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}: {err}"))
                    info = None
        else:
            for node in nodes:
                resource, _ = self.get_node_details(node)
                for res_type in self.server_resources:
                    method = self.server_resources[res_type].get(resource)
                    if not method:
                        logger.error(
                            self.log.svc_log(
                                f"No mapping function found for {res_type}"))
                        continue
                    try:
                        info = method()
                        resource_found = True
                    except Exception as err:
                        logger.error(
                            self.log.svc_log(
                                f"{err.__class__.__name__}: {err}"))
                        info = None
                if resource_found:
                    break

        if not resource_found:
            msg = f"Invalid rpath or health provider doesn't have support for '{rpath}'."
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)

        return info

    @staticmethod
    def _is_any_resource_unhealthy(fru, data):
        """Check for any unhealthy resource at child level."""
        for child in data[fru]:
            if isinstance(child, dict):
                if child.get("health") and \
                        child["health"]["status"].lower() != "ok":
                    return True
        return False

    def get_server_health_info(self):
        """Returns overall server information."""
        unhealthy_resource_found = False
        server_details = Platform().get_server_details()
        # Currently only one instance of server is considered
        server = []
        info = {}
        info["make"] = server_details["Board Mfg"]
        info["model"] = server_details["Product Name"]
        try:
            build_instance = BuildInfo()
            info["product_family"] = build_instance.get_attribute("NAME")
            info["version"] = build_instance.get_attribute("VERSION")
            info["build"] = build_instance.get_attribute("BUILD")
        except Exception as err:
            logger.error(
                self.log.svc_log(f"Unable to get build info due to {err}"))
        info["resource_usage"] = {}
        info["resource_usage"]["cpu_usage"] = self.get_cpu_overall_usage()
        info["resource_usage"]["disk_usage"] = self.get_disk_overall_usage()
        info["resource_usage"]["memory_usage"] = \
            self.get_memory_overall_usage()

        for res_type in self.server_resources:
            info.update({res_type: {}})
            for fru, method in self.server_resources[res_type].items():
                try:
                    info[res_type].update({fru: method()})
                    # Keep the flag sticky so one unhealthy FRU is not
                    # masked by a later healthy one
                    if self._is_any_resource_unhealthy(fru, info[res_type]):
                        unhealthy_resource_found = True
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}:{err}"))
                    info[res_type].update({fru: None})

        info["uid"] = socket.getfqdn()
        info["last_updated"] = int(time.time())
        info["health"] = {}
        info["health"]["status"] = \
            "OK" if not unhealthy_resource_found else "Degraded"
        health_desc = 'good' if info["health"]["status"] == 'OK' else 'bad'
        info["health"]["description"] = f"Server is in {health_desc} health."
info["health"]["recommendation"] = const.DEFAULT_RECOMMENDATION \ if info["health"]["status"] != "OK" else "NA" info["health"]["specifics"] = [] server.append(info) return server @staticmethod def get_cpu_usage(index=2, percpu=False): """Get CPU usage list.""" i = 0 cpu_usage = None while i < index: cpu_usage = psutil.cpu_percent(interval=None, percpu=percpu) time.sleep(1) i = i + 1 return cpu_usage def get_cpu_list(self, mode): """Returns the CPU list as per specified mode.""" cpu_info_path = Path(self.cpu_path + mode) # Read the text from /cpu/online file cpu_info = cpu_info_path.read_text() # Drop the \n character from the end of string cpu_info = cpu_info.rstrip('\n') # Convert the string to list of indexes cpu_list = self.sysfs.convert_cpu_info_list(cpu_info) return cpu_list def get_cpu_info(self, add_overall_usage=False): """Update and return CPU information in specific format.""" per_cpu_data = [] cpu_present = self.get_cpu_list("present") cpu_online = self.get_cpu_list("online") cpu_usage = self.get_cpu_usage(percpu=True) cpu_usage_dict = dict(zip(cpu_online, cpu_usage)) overall_cpu_usage = list(psutil.getloadavg()) cpu_count = len(cpu_present) overall_usage = { "current": self.get_cpu_usage(percpu=False), "1_min_avg": overall_cpu_usage[0], "5_min_avg": overall_cpu_usage[1], "15_min_avg": overall_cpu_usage[2] } for cpu_id in range(0, cpu_count): uid = f"CPU-{cpu_id}" cpu_dict = self.get_health_template(uid, is_fru=False) online_status = "Online" if cpu_id in cpu_online else "Offline" health_status = "OK" if online_status == "Online" else "NA" usage = "NA" if health_status == "NA" \ else cpu_usage_dict[cpu_id] specifics = [{"cpu_usage": usage, "state": online_status}] self.set_health_data(cpu_dict, status=health_status, specifics=specifics) per_cpu_data.append(cpu_dict) cpu_data = [{ "overall_usage": overall_usage, "cpu_count": cpu_count, "last_updated": int(time.time()), "cpus": per_cpu_data }] if not add_overall_usage: cpu_data = per_cpu_data logger.debug(self.log.svc_log(f"CPU Health Data:{cpu_data}")) return cpu_data def get_cpu_overall_usage(self): """Returns CPU overall usage.""" overall_usage = None cpu_data = self.get_cpu_info(add_overall_usage=True) if cpu_data[0].get("overall_usage"): overall_usage = cpu_data[0].get("overall_usage") else: logger.error(self.log.svc_log("Failed to get overall cpu usage")) return overall_usage def get_disk_info(self, add_overall_usage=False): """Update and return Disk information in specific format.""" per_disk_data = [] overall_usage = None disk_data = [{ "overall_usage": overall_usage, "last_updated": int(time.time()), "disks": per_disk_data }] if not add_overall_usage: disk_data = per_disk_data logger.debug(self.log.svc_log(f"Disk Health Data:{disk_data}")) return disk_data def format_ipmi_platform_sensor_reading(self, reading): """builds json resposne from ipmi tool response. reading arg sample: ('CPU1 Temp', '01', 'ok', '3.1', '36 degrees C') """ uid = '_'.join(reading[0].split()) sensor_id = reading[0] sensor_props = self._ipmi.get_sensor_props(sensor_id) lower_critical = sensor_props[1].get('Lower Critical', 'NA') upper_critical = sensor_props[1].get('Upper Critical', 'NA') lower_non_recoverable = sensor_props[1].get('Lower Non-Recoverable', 'NA') upper_non_recoverable = sensor_props[1].get('Upper Non-Recoverable', 'NA') status = 'OK' if reading[2] == 'ok' else 'NA' health_desc = 'good' if status == 'OK' else 'bad' description = f"{uid} sensor is in {health_desc} health." 
        recommendation = const.DEFAULT_RECOMMENDATION \
            if status != 'OK' else 'NA'
        specifics = [{
            "Sensor Reading": f"{reading[-1]}",
            "lower_critical_threshold": lower_critical,
            "upper_critical_threshold": upper_critical,
            "lower_non_recoverable": lower_non_recoverable,
            "upper_non_recoverable": upper_non_recoverable,
        }]
        resp = self.get_health_template(uid, is_fru=False)
        self.set_health_data(resp, status, description, recommendation,
                             specifics)
        return resp

    def get_platform_sensors_info(self):
        """Get the sensor information based on sensor_type and instance."""
        response = {sensor: [] for sensor in self.platform_sensor_list}
        for sensor in self.platform_sensor_list:
            sensor_reading = self._ipmi.get_sensor_list_by_type(sensor)
            if not sensor_reading:
                logger.debug(
                    self.log.svc_log(f"No sensor data received for :{sensor}"))
                continue
            for reading in sensor_reading:
                response[sensor].append(
                    self.format_ipmi_platform_sensor_reading(reading))
        logger.debug(
            self.log.svc_log(f"Platform Sensor Health Data:{response}"))
        return response

    def get_mem_info(self):
        """Collect & return system memory info in specific format."""
        default_mem_usage_threshold = int(
            Conf.get(SSPL_CONF,
                     "NODEDATAMSGHANDLER>host_memory_usage_threshold", 80))
        data = []
        status = "OK"
        description = "Host memory is in good health."
        self.mem_info = dict(psutil.virtual_memory()._asdict())
        curr_mem_usage = int(self.mem_info['percent'])
        if curr_mem_usage > default_mem_usage_threshold:
            status = "Overloaded"
            description = (
                f"Current host memory usage is {curr_mem_usage}%, "
                f"beyond the configured threshold of "
                f"{default_mem_usage_threshold}%.")

        memory_dict = self.prepare_mem_json(status, description)
        data.append(memory_dict)
        logger.debug(self.log.svc_log(f"Memory Health Data:{data}"))
        return data

    def prepare_mem_json(self, status, description):
        """Update and return memory information dict."""
        total_memory = {}
        for key, value in self.mem_info.items():
            if key == 'percent':
                total_memory['percent'] = str(self.mem_info['percent']) + '%'
            else:
                total_memory[key] = str(self.mem_info[key] >> 20) + 'MB'
        uid = "main_memory"
        specifics = [{
            "total": total_memory['total'],
            "available": total_memory['available'],
            "percent": total_memory['percent'],
            "used": total_memory['used'],
            "free": total_memory['free'],
            "active": total_memory['active'],
            "inactive": total_memory['inactive'],
            "buffers": total_memory['buffers'],
            "cached": total_memory['cached'],
            "shared": total_memory['shared'],
            "slab": total_memory['slab']
        }]
        memory_dict = self.get_health_template(uid, is_fru=False)
        self.set_health_data(memory_dict, status=status,
                             description=description, specifics=specifics)
        return memory_dict

    def get_memory_overall_usage(self):
        """Returns Memory overall usage."""
        overall_usage = None
        mem_info = self.get_mem_info()
        if mem_info[0].get("health"):
            overall_usage = mem_info[0]["health"]["specifics"]
        else:
            logger.error(
                self.log.svc_log("Failed to get memory overall usage"))
        return overall_usage

    def get_fans_info(self):
        """Get the Fan sensor information using ipmitool."""
        data = []
        sensor_reading = self._ipmi.get_sensor_list_by_type('Fan')
        if sensor_reading is None:
            msg = "Failed to get Fan sensor reading using ipmitool"
            logger.error(self.log.svc_log(msg))
            return
        for fan_reading in sensor_reading:
            sensor_id = fan_reading[0]
            fan_dict = self.get_health_template(sensor_id, is_fru=True)
            sensor_props = self._ipmi.get_sensor_props(sensor_id)
            status = 'OK' if fan_reading[2] == 'ok' else 'NA'
            lower_critical = sensor_props[1].get('Lower Critical', 'NA')
            upper_critical = sensor_props[1].get('Upper Critical', 'NA')
            specifics = [{
                "Sensor Reading": f"{fan_reading[-1]}",
                "lower_critical_threshold": lower_critical,
                "upper_critical_threshold": upper_critical
            }]

            self.set_health_data(fan_dict, status=status,
                                 specifics=specifics)
            data.append(fan_dict)
            logger.debug(self.log.svc_log(f"Fan Health Data:{fan_dict}"))
        return data

    def get_sas_hba_info(self):
        """Return SAS-HBA current health."""
        sas_hba_data = []
        sas_instance = SAS()
        try:
            hosts = sas_instance.get_host_list()  # e.g. ['host1']
        except SASError as err:
            hosts = []
            logger.error(self.log.svc_log(err))
        except Exception as err:
            hosts = []
            logger.exception(self.log.svc_log(err))

        for host in hosts:
            host_id = const.SAS_RESOURCE_ID + host.replace('host', '')
            host_data = self.get_health_template(host_id, False)
            try:
                ports = sas_instance.get_port_list(host)
                # e.g. ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3']
            except SASError as err:
                ports = []
                logger.error(self.log.svc_log(err))
            except Exception as err:
                ports = []
                logger.exception(self.log.svc_log(err))
            health = "OK"
            specifics = {'num_ports': len(ports), 'ports': []}
            for port in ports:
                try:
                    port_data = sas_instance.get_port_data(port)
                except SASError as err:
                    port_data = []
                    logger.error(self.log.svc_log(err))
                except Exception as err:
                    port_data = []
                    logger.exception(self.log.svc_log(err))
                specifics['ports'].append(port_data)
                if not port_data or port_data['state'] != 'running':
                    health = "NA"
            self.set_health_data(host_data, health, specifics=[specifics])
            sas_hba_data.append(host_data)
        return sas_hba_data

    def get_sas_ports_info(self):
        """Return SAS Ports current health."""
        sas_ports_data = []
        sas_instance = SAS()
        try:
            ports = sas_instance.get_port_list()
            # e.g. ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3']
        except SASError as err:
            ports = []
            logger.error(self.log.svc_log(err))
        except Exception as err:
            ports = []
            logger.exception(self.log.svc_log(err))

        for port in ports:
            port_id = 'sas_' + port
            port_data = self.get_health_template(port_id, False)
            try:
                phys = sas_instance.get_phy_list_for_port(port)
                # e.g. ['phy-1:0', 'phy-1:1', 'phy-1:2', 'phy-1:3']
            except SASError as err:
                phys = []
                logger.error(self.log.svc_log(err))
            except Exception as err:
                phys = []
                logger.exception(self.log.svc_log(err))

            specifics = {'num_phys': len(phys), 'phys': []}
            health = "OK"
            for phy in phys:
                try:
                    phy_data = sas_instance.get_phy_data(phy)
                except SASError as err:
                    phy_data = {}
                    logger.error(self.log.svc_log(err))
                except Exception as err:
                    phy_data = {}
                    logger.exception(self.log.svc_log(err))
                specifics['phys'].append(phy_data)
                if not phy_data or phy_data['state'] != 'enabled' or \
                        'Gbit' not in phy_data['negotiated_linkrate']:
                    health = "NA"
            self.set_health_data(port_data, health, specifics=[specifics])
            sas_ports_data.append(port_data)
        return sas_ports_data

    def get_nw_ports_info(self):
        """Return the Network ports information."""
        network_cable_data = []
        io_counters = psutil.net_io_counters(pernic=True)

        nw_instance = Network()
        for interface, addrs in psutil.net_if_addrs().items():
            nic_info = self.get_health_template(interface, False)
            specifics = {}
            for addr in addrs:
                if addr.family == socket.AF_INET:
                    specifics["ipV4"] = addr.address
            if interface in io_counters:
                io_info = io_counters[interface]
                specifics = {
                    "networkErrors": io_info.errin + io_info.errout,
                    "droppedPacketsIn": io_info.dropin,
                    "droppedPacketsOut": io_info.dropout,
                    "packetsIn": io_info.packets_recv,
                    "packetsOut": io_info.packets_sent,
                    "trafficIn": io_info.bytes_recv,
                    "trafficOut": io_info.bytes_sent
                }
            # Get the interface health status.
            nw_status, nw_cable_conn_status = \
                self.get_nw_status(nw_instance, interface)
            specifics["nwStatus"] = nw_status
            specifics["nwCableConnStatus"] = nw_cable_conn_status
            # Map and set the interface health status and description.
            map_status = {
                "CONNECTED": "OK",
                "DISCONNECTED": "Disabled/Failed",
                "UNKNOWN": "NA"
            }
            health_status = map_status[nw_cable_conn_status]
            desc = "Network Interface '%s' is %sin good health." % (
                interface, '' if health_status == "OK" else 'not ')
            self.set_health_data(nic_info, health_status, description=desc,
                                 specifics=[specifics])
            network_cable_data.append(nic_info)
        return network_cable_data

    def get_nw_status(self, nw_interface, interface):
        """Read & return the latest network status from sysfs files."""
        try:
            nw_status = nw_interface.get_operational_state(interface)
        except NetworkError as err:
            nw_status = "UNKNOWN"
            logger.error(self.log.svc_log(err))
        except Exception as err:
            nw_status = "UNKNOWN"
            logger.exception(self.log.svc_log(err))
        try:
            nw_cable_conn_status = nw_interface.get_link_state(interface)
        except NetworkError as err:
            nw_cable_conn_status = "UNKNOWN"
            logger.error(self.log.svc_log(err))
        except Exception as err:
            nw_cable_conn_status = "UNKNOWN"
            logger.exception(self.log.svc_log(err))
        return nw_status, nw_cable_conn_status

    def get_cortx_service_info(self):
        """Get cortx service info in required format."""
        service_info = []
        cortx_services = Service().get_cortx_service_list()
        for service in cortx_services:
            response = self.get_systemd_service_info(service)
            if response is not None:
                service_info.append(response)
        return service_info

    def get_external_service_info(self):
        """Get external service info in required format."""
        service_info = []
        external_services = Service().get_external_service_list()
        for service in external_services:
            response = self.get_systemd_service_info(service)
            if response is not None:
                service_info.append(response)
        return service_info

    def get_systemd_service_info(self, service_name):
        """Get info of specified service using dbus API."""
        try:
            unit = Service()._bus.get_object(
                const.SYSTEMD_BUS, Service()._manager.LoadUnit(service_name))
            properties_iface = Interface(unit,
                                         dbus_interface=PROPERTIES_IFACE)
        except DBusException as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to initialize {service_name} due to {err}"))
            return None
        path_array = properties_iface.Get(const.SERVICE_IFACE, 'ExecStart')
        try:
            command_line_path = str(path_array[0][0])
        except IndexError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to find {service_name} path due to {err}"))
            command_line_path = "NA"

        is_installed = command_line_path != "NA" or 'invalid' in \
            properties_iface.Get(const.UNIT_IFACE, 'UnitFileState')
        uid = str(properties_iface.Get(const.UNIT_IFACE, 'Id'))
        if not is_installed:
            health_status = "NA"
            health_description = f"Software enabling {uid} is not installed"
            recommendation = "NA"
            specifics = [{
                "service_name": uid,
                "description": "NA",
                "installed": str(is_installed).lower(),
                "pid": "NA",
                "state": "NA",
                "substate": "NA",
                "status": "NA",
                "license": "NA",
                "version": "NA",
                "command_line_path": "NA"
            }]
        else:
            service_license = "NA"
            version = "NA"
            service_description = str(
                properties_iface.Get(const.UNIT_IFACE, 'Description'))
            state = str(properties_iface.Get(const.UNIT_IFACE, 'ActiveState'))
            substate = str(properties_iface.Get(const.UNIT_IFACE, 'SubState'))
            service_status = 'enabled' if 'disabled' not in \
                properties_iface.Get(const.UNIT_IFACE, 'UnitFileState') \
                else 'disabled'
            pid = "NA" if state == "inactive" else str(
                properties_iface.Get(const.SERVICE_IFACE, 'ExecMainPID'))
            try:
                version = Service().get_service_info_from_rpm(uid, "VERSION")
            except ServiceError as err:
                logger.error(
                    self.log.svc_log(
                        f"Unable to get service version due to {err}"))
            try:
                service_license = Service().get_service_info_from_rpm(
                    uid, "LICENSE")
            except ServiceError as err:
                logger.error(
                    self.log.svc_log(
                        f"Unable to get service license due to {err}"))
            specifics = [{
                "service_name": uid,
                "description": service_description,
                "installed": str(is_installed).lower(),
                "pid": pid,
                "state": state,
                "substate": substate,
                "status": service_status,
                "license": service_license,
                "version": version,
                "command_line_path": command_line_path
            }]
            if service_status == 'enabled' and state == 'active' \
                    and substate == 'running':
                health_status = 'OK'
                health_description = f"{uid} is in good health"
                recommendation = "NA"
            else:
                health_status = state
                health_description = f"{uid} is not in good health"
                recommendation = const.DEFAULT_RECOMMENDATION
        service_info = self.get_health_template(uid, is_fru=False)
        self.set_health_data(service_info, health_status, health_description,
                             recommendation, specifics)
        return service_info

    def get_raid_info(self):
        """Return health info for all configured RAID arrays."""
        raids_data = []
        for raid in RAIDs.get_configured_raids():
            raid_data = self.get_health_template(raid.id, False)
            health, description = raid.get_health()
            devices = raid.get_devices()
            specifics = [{
                "location": raid.raid,
                "data_integrity_status": raid.get_data_integrity_status(),
                "devices": devices
            }]
            self.set_health_data(raid_data, health, specifics=specifics,
                                 description=description)
            raids_data.append(raid_data)
        return raids_data

    @staticmethod
    def get_disk_overall_usage():
        """Return overall disk usage of the root filesystem in GB."""
        units_factor_GB = 1000000000
        total, used, free, percent = psutil.disk_usage("/")
        overall_usage = {
            "totalSpace": f'{int(total) // units_factor_GB} GB',
            "usedSpace": f'{int(used) // units_factor_GB} GB',
            "freeSpace": f'{int(free) // units_factor_GB} GB',
            "diskUsedPercentage": percent,
        }
        return overall_usage

    def get_disks_info(self):
        """Update and return server drive information in specific format."""
        disks = []
        for disk in Disk.get_disks():
            uid = disk.path if disk.path else disk.id
            disk_health = self.get_health_template(uid, True)
            health_data = disk.get_health()
            health = "OK" if (health_data['SMART_health'] == "PASSED") \
                else "Fault"
            self.set_health_data(disk_health, health,
                                 specifics=[{"SMART": health_data}])
            disks.append(disk_health)
        logger.debug(self.log.svc_log(f"Disk Health Data:{disks}"))
        return disks

    def get_psu_info(self):
        """Update and return PSU information in specific format."""
        psus_health_data = []
        for psu in self.get_psus():
            data = self.get_health_template(f'{psu["Location"]}', True)
            health = "OK" if (psu["Status"] == "Present, OK") else "Fault"
            self.set_health_data(data, health, specifics=psu)
            psus_health_data.append(data)
        logger.debug(self.log.svc_log(f"PSU Health Data:{psus_health_data}"))
        return psus_health_data

    @staticmethod
    def get_psus():
        """Parse `dmidecode -t 39` output into a list of per-PSU dicts."""
        response, _, _ = SimpleProcess("dmidecode -t 39").run()
        matches = re.findall(
            "System Power Supply|Power Unit Group:.*|"
            "Location:.*|Name:.*|Serial Number:.*|"
            "Max Power Capacity:.*|Status: .*|"
            "Plugged:.*|Hot Replaceable:.*", response.decode())
        psus = []
        stack = []
        while matches:
            item = matches.pop()
            # Collect one PSU's attribute lines until its section header
            while item != "System Power Supply":
                stack.append(item)
                item = matches.pop()
            psu = {}
            while stack:
                key, value = stack.pop().strip().split(":")
                psu[key] = value.strip()
            psus.append(psu)
        return psus
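# ---------------------------------------------------------------------------
# Illustrative usage, assuming a fully configured SSPL environment (ipmitool,
# sysfs, config store); the rpath values mirror get_health_info's docstring:
#
#   server_map = ServerMap()
#   # Whole-server health, covering every hw and sw resource:
#   health = server_map.get_health_info("node>compute[0]")
#   # Only the hardware subtree:
#   hw_health = server_map.get_health_info("node>compute[0]>hw")
#   # A single resource:
#   disk_health = server_map.get_health_info("node>compute[0]>hw>disks")
# ---------------------------------------------------------------------------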
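# ---------------------------------------------------------------------------
# Illustrative sketch, not part of ServerMap: the regex/stack fold used by
# get_psus() above, lifted into a standalone function so it can be exercised
# against canned `dmidecode -t 39` text. The sample text is hypothetical;
# split(":", 1) is used here so a value containing ':' doesn't raise, which
# the in-tree split(":") would.
# ---------------------------------------------------------------------------
import re

SAMPLE_DMIDECODE = (
    "System Power Supply\n"
    "        Power Unit Group: 1\n"
    "        Location: PSU1\n"
    "        Name: PWS-920P-SQ\n"
    "        Max Power Capacity: 920 W\n"
    "        Status: Present, OK\n"
)

def parse_psus(dmidecode_text):
    """Fold the flat regex matches back into one dict per PSU."""
    matches = re.findall(
        "System Power Supply|Power Unit Group:.*|"
        "Location:.*|Name:.*|Serial Number:.*|"
        "Max Power Capacity:.*|Status: .*|"
        "Plugged:.*|Hot Replaceable:.*", dmidecode_text)
    psus = []
    stack = []
    while matches:
        item = matches.pop()
        while item != "System Power Supply":
            stack.append(item)
            item = matches.pop()
        psu = {}
        while stack:
            key, value = stack.pop().strip().split(":", 1)
            psu[key] = value.strip()
        psus.append(psu)
    return psus

# parse_psus(SAMPLE_DMIDECODE) ->
#   [{'Power Unit Group': '1', 'Location': 'PSU1', 'Name': 'PWS-920P-SQ',
#     'Max Power Capacity': '920 W', 'Status': 'Present, OK'}]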