def __init__(self): """Initialize server.""" super().__init__() self.log = CustomLog(const.HEALTH_SVC_NAME) self.validate_server_type_support() self.sysfs = ToolFactory().get_instance('sysfs') self.sysfs.initialize() self.sysfs_base_path = self.sysfs.get_sysfs_base_path() self.cpu_path = self.sysfs_base_path + const.CPU_PATH hw_resources = { 'cpu': self.get_cpu_info, 'platform_sensors': self.get_platform_sensors_info, 'memory': self.get_mem_info, 'fans': self.get_fans_info, 'nw_ports': self.get_nw_ports_info, 'sas_hba': self.get_sas_hba_info, 'sas_ports': self.get_sas_ports_info, 'disks': self.get_disks_info, 'psus': self.get_psu_info } sw_resources = { 'cortx_sw_services': self.get_cortx_service_info, 'external_sw_services': self.get_external_service_info, 'raid': self.get_raid_info } self.server_resources = {"hw": hw_resources, "sw": sw_resources} self._ipmi = IpmiFactory().get_implementor("ipmitool") self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']
def __init__(self): """Initialize server.""" super().__init__() self.log = CustomLog(const.HEALTH_SVC_NAME) server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY) Platform.validate_server_type_support(self.log, ResourceMapError, server_type) self.sysfs = ToolFactory().get_instance('sysfs') self.sysfs.initialize() self.sysfs_base_path = self.sysfs.get_sysfs_base_path() self.cpu_path = self.sysfs_base_path + const.CPU_PATH hw_resources = { 'cpu': self.get_cpu_info, 'platform_sensor': self.get_platform_sensors_info, 'memory': self.get_mem_info, 'fan': self.get_fans_info, 'nw_port': self.get_nw_ports_info, 'sas_hba': self.get_sas_hba_info, 'sas_port': self.get_sas_ports_info, 'disk': self.get_disks_info, 'psu': self.get_psu_info } sw_resources = { 'cortx_sw_services': self.get_cortx_service_info, 'external_sw_services': self.get_external_service_info, 'raid': self.get_raid_info } self.server_resources = {"hw": hw_resources, "sw": sw_resources} self._ipmi = IpmiFactory().get_implementor("ipmitool") self.platform_sensor_list = ['Temperature', 'Voltage', 'Current'] self.service = Service() self.resource_indexing_map = ServerResourceMap.resource_indexing_map\ ["health"]
def setUp(self): """Mock the config values and spwan required class objects.""" self.mocked_values = { "BMC_INTERFACE>default": 'system', "/var/cortx/sspl/data/server/ACTIVE_BMC_IF_SN01": 'system', "ip": '10.0.0.1', "user": '******', "secret": ('gAAAAABgi9l0ZR5tSwBoLvDS4m2c6ps5rFzdo1' '-o_mr43C8HYSw5mRRd63je_2251_QU-XlVhgEe_' 'k6lQesrrjFVrKkQ70Yfgg==') } Conf.get = Mock(side_effect=self.mocked_conf) store.get = Mock(side_effect=self.mocked_store) self.tool = IpmiFactory().get_implementor('ipmitool')
def _process_msg(self, jsonMsg): """Parses the incoming message and handles appropriately""" self._log_debug(f"_process_msg, jsonMsg: {jsonMsg}") if isinstance(jsonMsg, dict) is False: jsonMsg = json.loads(jsonMsg) # Parse out the uuid so that it can be sent back in Ack message uuid = None if jsonMsg.get("sspl_ll_msg_header").get("uuid") is not None: uuid = jsonMsg.get("sspl_ll_msg_header").get("uuid") self._log_debug(f"_processMsg, uuid: {uuid}") if jsonMsg.get("actuator_request_type").get("node_controller").get("node_request") is not None: node_request = jsonMsg.get("actuator_request_type").get("node_controller").get("node_request") self._log_debug(f"_processMsg, node_request: {node_request}") # Parse out the component field in the node_request component = node_request[0:4] # Handle generic command line requests if component == 'SSPL': # Query the Zope GlobalSiteManager for an object implementing the MOTR actuator if self._command_line_actuator is None: from actuators.Icommand_line import ICommandLine command_line_actuator_class = self._queryUtility(ICommandLine) # Instantiate CommandLine Actuator only if class is loaded if command_line_actuator_class: self._command_line_actuator = command_line_actuator_class(self._conf_reader) else: logger.warn("CommandLine Actuator not loaded") json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return # Perform the request and get the response command_line_response = self._command_line_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, command line response: {command_line_response}") json_msg = AckResponseMsg(node_request, command_line_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) # Handle LED effects using the HPI actuator elif component == "LED:": # HPI related operations are not supported in VM environment. 
if self._is_env_vm(): logger.warn("HPI operations are not supported in current environment") return # Query the Zope GlobalSiteManager for an object implementing the IHPI actuator if self._HPI_actuator is None: from actuators.Ihpi import IHPI # Load HPIActuator class HPI_actuator_class = self._queryUtility(IHPI) # Instantiate HPIActuator only if class is loaded if HPI_actuator_class: self._HPI_actuator = HPI_actuator_class(self._conf_reader) else: logger.warn("HPIActuator not loaded") if self._product.lower() in [x.lower() for x in enabled_products]: json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return self._log_debug(f"_process_msg, _HPI_actuator name: {self._HPI_actuator.name()}") # Perform the request using HPI and get the response hpi_response = self._HPI_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, hpi_response: {hpi_response}") json_msg = AckResponseMsg(node_request, hpi_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) # Set the Bezel LED color using the GEM interface elif component == "BEZE": # Query the Zope GlobalSiteManager for an object implementing the IGEM actuator if self._GEM_actuator is None: self._GEM_actuator = self._queryUtility(IGEM)(self._conf_reader) self._log_debug(f"_process_msg, _GEM_actuator name: {self._GEM_actuator.name()}") # Perform the request using GEM and get the response gem_response = self._GEM_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, gem_response: {gem_response}") json_msg = AckResponseMsg(node_request, gem_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) elif component == "PDU:": # Query the Zope GlobalSiteManager for an object implementing the IPDU actuator if self._PDU_actuator is None: from actuators.Ipdu import IPDU PDU_actuator_class = self._queryUtility(IPDU) # Instantiate RaritanPDU Actuator only if class is loaded if PDU_actuator_class: self._PDU_actuator = PDU_actuator_class(self._conf_reader) else: logger.warn("RaritanPDU Actuator not loaded") json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return # Perform the request on the PDU and get the response pdu_response = self._PDU_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, pdu_response: {pdu_response}") json_msg = AckResponseMsg(node_request, pdu_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) elif component == "RAID": # If the state is INITIALIZED, We can assume that actuator is # ready to perform operation. if actuator_state_manager.is_initialized("RAIDactuator"): self._log_debug(f"_process_msg, _RAID_actuator name: {self._RAID_actuator.name()}") self._execute_raid_request( node_request, self._RAID_actuator, jsonMsg, uuid) # If the state is INITIALIZING, need to send message elif actuator_state_manager.is_initializing("RAIDactuator"): # This state will not be reached. Kept here for consistency. logger.info("RAID actuator is initializing") busy_json_msg = AckResponseMsg( node_request, "BUSY", uuid, error_no=errno.EBUSY).getJson() self._write_internal_msgQ( "RabbitMQegressProcessor", busy_json_msg) elif actuator_state_manager.is_imported("RAIDactuator"): # This case will be for first request only. 
Subsequent # requests will go to INITIALIZED state case. logger.info("RAID actuator is imported and initializing") from actuators.Iraid import IRAIDactuator actuator_state_manager.set_state( "RAIDactuator", actuator_state_manager.INITIALIZING) # Query the Zope GlobalSiteManager for an object implementing the IRAIDactuator raid_actuator_class = self._queryUtility(IRAIDactuator) if raid_actuator_class: # NOTE: Instantiation part should not time consuming # otherwise NodeControllerMsgHandler will get block # and will not be able serve any subsequent requests. # This applies to instantiation of evey actuator. self._RAID_actuator = raid_actuator_class() logger.info(f"_process_msg, _RAID_actuator name: {self._RAID_actuator.name()}") self._execute_raid_request( node_request, self._RAID_actuator, jsonMsg, uuid) actuator_state_manager.set_state( "RAIDactuator", actuator_state_manager.INITIALIZED) else: logger.warn("RAID actuator is not instantiated") # If there is no entry for actuator in table, We can assume # that it is not loaded for some reason. else: logger.warn("RAID actuator is not loaded or not supported") elif component == "IPMI": # Query the Zope GlobalSiteManager for an object implementing the IPMI actuator if self._IPMI_actuator is None: from actuators.Iipmi import Iipmi IPMI_actuator_class = self._queryUtility(Iipmi) # Instantiate IPMI Actuator only if class is loaded if IPMI_actuator_class: self._IPMI_actuator = IPMI_actuator_class(self._conf_reader) else: logger.warn("IPMI Actuator not loaded") json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return # Perform the IPMI request on the node and get the response ipmi_response = self._IPMI_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, ipmi_response: {ipmi_response}") json_msg = AckResponseMsg(node_request, ipmi_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) elif component == "STOP": # HPI related operations are not supported in VM environment. 
if self._is_env_vm(): logger.warn("HPI operations are not supported in current environment") return # Query the Zope GlobalSiteManager for an object implementing the IHPI actuator if self._HPI_actuator is None: from actuators.Ihpi import IHPI # Load HPIActuator class HPI_actuator_class = self._queryUtility(IHPI) # Instantiate HPIActuator only if class is loaded if HPI_actuator_class: self._HPI_actuator = HPI_actuator_class(self._conf_reader) else: logger.warn("HPIActuator not loaded") if self._product.lower() in [x.lower() for x in enabled_products]: json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return self._log_debug(f"_process_msg, _HPI_actuator name: {self._HPI_actuator.name()}") # Parse out the drive to stop drive_request = node_request[12:].strip() self._log_debug(f"perform_request, drive to stop: {drive_request}") # Append POWER_OFF to notify HPI actuator of desired state jsonMsg["actuator_request_type"]["node_controller"]["node_request"] = \ f"DISK: set {drive_request} POWER_OFF" self._log_debug(f"_process_msg, jsonMsg: {jsonMsg}") # Perform the request using HPI and get the response hpi_response = self._HPI_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, hpi_response: {hpi_response}") # Simplify success message as external apps don't care about details if "Success" in hpi_response: hpi_response = "Successful" json_msg = AckResponseMsg(node_request, hpi_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) elif component == "STAR": # HPI related operations are not supported in VM environment. if self._is_env_vm(): logger.warn("HPI operations are not supported in current environment") return # Query the Zope GlobalSiteManager for an object implementing the IHPI actuator if self._HPI_actuator is None: from actuators.Ihpi import IHPI # Load HPIActuator class HPI_actuator_class = self._queryUtility(IHPI) # Instantiate HPIActuator only if class is loaded if HPI_actuator_class: self._HPI_actuator = HPI_actuator_class(self._conf_reader) else: logger.warn("HPIActuator not loaded") if self._product.lower() in [x.lower() for x in enabled_products]: json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return self._log_debug(f"_process_msg, _HPI_actuator name: {self._HPI_actuator.name()}") # Parse out the drive to start drive_request = node_request[13:].strip() self._log_debug(f"perform_request, drive to start: {drive_request}") # Append POWER_ON to notify HPI actuator of desired state jsonMsg["actuator_request_type"]["node_controller"]["node_request"] = \ f"DISK: set {drive_request} POWER_ON" self._log_debug(f"_process_msg, jsonMsg: {jsonMsg}") # Perform the request using HPI and get the response hpi_response = self._HPI_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, hpi_response: {hpi_response}") # Simplify success message as external apps don't care about details if "Success" in hpi_response: hpi_response = "Successful" json_msg = AckResponseMsg(node_request, hpi_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) elif component == "RESE": # HPI related operations are not supported in VM environment. 
if self._is_env_vm(): logger.warn("HPI operations are not supported in current environment") return # Query the Zope GlobalSiteManager for an object implementing the IHPI actuator if self._HPI_actuator is None: from actuators.Ihpi import IHPI # Load HPIActuator class HPI_actuator_class = self._queryUtility(IHPI) # Instantiate HPIActuator only if class is loaded if HPI_actuator_class: self._HPI_actuator = HPI_actuator_class(self._conf_reader) else: logger.warn("HPIActuator not loaded") if self._product.lower() in [x.lower() for x in enabled_products]: json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) return self._log_debug(f"_process_msg, _HPI_actuator name: {self._HPI_actuator.name()}") # Parse out the drive to power cycle drive_request = node_request[13:].strip() self._log_debug(f"perform_request, drive to power cycle: {drive_request}") # Append POWER_OFF and then POWER_ON to notify HPI actuator of desired state jsonMsg["actuator_request_type"]["node_controller"]["node_request"] = \ f"DISK: set {drive_request} POWER_OFF" self._log_debug(f"_process_msg, jsonMsg: {jsonMsg}") # Perform the request using HPI and get the response hpi_response = self._HPI_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, hpi_response: {hpi_response}") # Check for success and power the disk back on if "Success" in hpi_response: # Append POWER_ON to notify HPI actuator of desired state jsonMsg["actuator_request_type"]["node_controller"]["node_request"] = \ f"DISK: set {drive_request} POWER_ON" self._log_debug(f"_process_msg, jsonMsg: {jsonMsg}") # Perform the request using HPI and get the response hpi_response = self._HPI_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, hpi_response: {hpi_response}") # Simplify success message as external apps don't care about details if "Success" in hpi_response: hpi_response = "Successful" json_msg = AckResponseMsg(node_request, hpi_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) elif component == "HDPA": # If the state is INITIALIZED, We can assume that actuator is # ready to perform operation. if actuator_state_manager.is_initialized("Hdparm"): logger.info(f"_process_msg, Hdparm_actuator name: {self._hdparm_actuator.name()}") # Perform the hdparm request on the node and get the response hdparm_response = self._hdparm_actuator.perform_request(jsonMsg).strip() self._log_debug(f"_process_msg, hdparm_response: {hdparm_response}") json_msg = AckResponseMsg(node_request, hdparm_response, uuid).getJson() self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg) # If the state is INITIALIZING, need to send message elif actuator_state_manager.is_initializing("Hdparm"): # This state will not be reached. Kept here for consistency. logger.info("Hdparm actuator is initializing") busy_json_msg = AckResponseMsg( node_request, "BUSY", uuid, error_no=errno.EBUSY).getJson() self._write_internal_msgQ( "RabbitMQegressProcessor", busy_json_msg) elif actuator_state_manager.is_imported("Hdparm"): # This case will be for first request only. Subsequent # requests will go to INITIALIZED state case. logger.info("Hdparm actuator is imported and initializing") # Query the Zope GlobalSiteManager for an object # implementing the hdparm actuator. 
                from actuators.Ihdparm import IHdparm
                actuator_state_manager.set_state(
                    "Hdparm", actuator_state_manager.INITIALIZING)
                hdparm_actuator_class = self._queryUtility(IHdparm)
                if hdparm_actuator_class:
                    # NOTE: Instantiation should not be time consuming,
                    # otherwise NodeControllerMsgHandler will get blocked and will
                    # not be able to serve any subsequent requests. This applies
                    # to instantiation of every actuator.
                    self._hdparm_actuator = hdparm_actuator_class()
                    self._log_debug(f"_process_msg, _hdparm_actuator name: {self._hdparm_actuator.name()}")

                    # Perform the hdparm request on the node and get the response
                    hdparm_response = self._hdparm_actuator.perform_request(jsonMsg).strip()
                    self._log_debug(f"_process_msg, hdparm_response: {hdparm_response}")

                    json_msg = AckResponseMsg(node_request, hdparm_response, uuid).getJson()
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)

                    actuator_state_manager.set_state(
                        "Hdparm", actuator_state_manager.INITIALIZED)
                else:
                    logger.info("Hdparm actuator is not instantiated")
            # If there is no entry for actuator in table, We can assume
            # that it is not loaded for some reason.
            else:
                logger.info("Hdparm actuator is not loaded or not supported")

        elif component == "SMAR":
            # Parse out the drive request field in json msg
            node_request = jsonMsg.get("actuator_request_type").get("node_controller").get("node_request")
            drive_request = node_request[12:].strip()
            self._log_debug(f"perform_request, drive: {drive_request}")

            # If the drive field is an asterisk then send all the smart results for all drives available
            if drive_request == "*":
                # Send the event to SystemdWatchdog to schedule SMART test
                internal_json_msg = json.dumps(
                    {"sensor_request_type" : "disk_smart_test",
                     "serial_number" : "*",
                     "node_request" : self.host_id,
                     "uuid" : uuid
                     })

                self._write_internal_msgQ("SystemdWatchdog", internal_json_msg)
                return

            # Put together a message to get the serial number of the drive using hdparm tool
            if drive_request.startswith("/"):
                serial_number, error = self._retrieve_serial_number(drive_request)

                # Send error response back on ack channel
                if error != "":
                    json_msg = AckResponseMsg(node_request, error, uuid).getJson()
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
                    return
            else:
                if self._smartctl_actuator is None:
                    from actuators.Ismartctl import ISmartctl
                    smartctl_actuator_class = self._queryUtility(ISmartctl)
                    if smartctl_actuator_class:
                        self._smartctl_actuator = self._queryUtility(ISmartctl)()
                        self._log_debug("_process_msg, _smart_actuator name: %s" % self._smartctl_actuator.name())
                    else:
                        logger.error(" No module Smartctl is present to load")
                serial_compare = self._smartctl_actuator._check_serial_number(drive_request)
                if not serial_compare:
                    json_msg = AckResponseMsg(node_request, "Drive Not Found", uuid).getJson()
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
                    return
                else:
                    serial_number = drive_request

            # Send the event to SystemdWatchdog to schedule SMART test
            internal_json_msg = json.dumps(
                {"sensor_request_type" : "disk_smart_test",
                 "serial_number" : serial_number,
                 "node_request" : node_request,
                 "uuid" : uuid
                 })

            self._write_internal_msgQ("SystemdWatchdog", internal_json_msg)

        elif component == "DRVM":
            # Requesting the current status from drivemanager
            # Parse out the drive request field in json msg
            node_request = jsonMsg.get("actuator_request_type").get("node_controller").get("node_request")
            drive_request = node_request[15:].strip()
            self._log_debug(f"perform_request, drive: {drive_request}")

            # If the drive field is an asterisk then send all the drivemanager
            # results for all drives available
            if drive_request == "*":
                # Send a message to the disk message handler to lookup the drivemanager status and send it out
                internal_json_msg = json.dumps(
                    {"sensor_request_type" : "drvmngr_status",
                     "serial_number" : "*",
                     "node_request" : self.host_id,
                     "uuid" : uuid
                     })

                # Send the event to disk message handler to generate json message
                self._write_internal_msgQ(DiskMsgHandler.name(), internal_json_msg)
                return

            # Put together a message to get the serial number of the drive using hdparm tool
            if drive_request.startswith("/"):
                serial_number, error = self._retrieve_serial_number(drive_request)

                # Send error response back on ack channel
                if error != "":
                    json_msg = AckResponseMsg(node_request, error, uuid).getJson()
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
                    return
            else:
                serial_number = drive_request

            # Send a message to the disk message handler to lookup the smart status and send it out
            internal_json_msg = json.dumps(
                {"sensor_request_type" : "drvmngr_status",
                 "serial_number" : serial_number,
                 "node_request" : node_request,
                 "uuid" : uuid
                 })

            # Send the event to disk message handler to generate json message
            self._write_internal_msgQ(DiskMsgHandler.name(), internal_json_msg)

        elif component == "HPI_":
            # Requesting the current status from HPI data
            # Parse out the drive request field in json msg
            if self._is_env_vm():
                logger.warn("HPI operations are not supported in current environment")
                return
            if self.setup == 'cortx':
                logger.warn("HPIMonitor not loaded")
                json_msg = AckResponseMsg(node_request, NodeControllerMsgHandler.UNSUPPORTED_REQUEST, uuid).getJson()
                self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
                return
            node_request = jsonMsg.get("actuator_request_type").get("node_controller").get("node_request")
            drive_request = node_request[11:].strip()
            self._log_debug(f"perform_request, drive: {drive_request}")

            # If the drive field is an asterisk then send all the hpi results for all drives available
            if drive_request == "*":
                # Send a message to the disk message handler to lookup the hpi status and send it out
                internal_json_msg = json.dumps(
                    {"sensor_request_type" : "hpi_status",
                     "serial_number" : "*",
                     "node_request" : self.host_id,
                     "uuid" : uuid
                     })

                # Send the event to disk message handler to generate json message
                self._write_internal_msgQ(DiskMsgHandler.name(), internal_json_msg)
                return

            # Put together a message to get the serial number of the drive using hdparm tool
            if drive_request.startswith("/"):
                serial_number, error = self._retrieve_serial_number(drive_request)

                # Send error response back on ack channel
                if error != "":
                    json_msg = AckResponseMsg(node_request, error, uuid).getJson()
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
                    return
            else:
                serial_number = drive_request

            # Send a message to the disk message handler to lookup the smart status and send it out
            internal_json_msg = json.dumps(
                {"sensor_request_type" : "hpi_status",
                 "serial_number" : serial_number,
                 "node_request" : node_request,
                 "uuid" : uuid
                 })

            # Send the event to disk message handler to generate json message
            self._write_internal_msgQ(DiskMsgHandler.name(), internal_json_msg)

        elif component == "SIMU":
            # Requesting to simulate an event
            # Parse out the simulated request field
            node_request = jsonMsg.get("actuator_request_type").get("node_controller").get("node_request")
            sim_request = node_request[9:].strip().split(" ")
            self._log_debug(f"perform_request, sim_request: {str(sim_request)}")

            # Put together a message to get the serial number of the drive using hdparm tool
            if sim_request[1].startswith("/"):
                serial_number, error = self._retrieve_serial_number(sim_request[1])

                # Send error response back on ack channel
                if error != "":
                    json_msg = AckResponseMsg(node_request, error, uuid).getJson()
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
                    return
            else:
                serial_number = sim_request[1]

            # SMART simulation requests are sent to SystemdWatchdog
            if sim_request[0] == "SMART_FAILURE":
                logger.info(f"NodeControllerMsgHandler, simulating SMART_FAILURE on drive: {serial_number}")

                internal_json_msg = json.dumps(
                    {"sensor_request_type" : "simulate_failure",
                     "serial_number" : serial_number,
                     "node_request" : sim_request[0],
                     "uuid" : uuid
                     })

                # Send the event to SystemdWatchdog to handle it from here
                self._write_internal_msgQ("SystemdWatchdog", internal_json_msg)
            else:
                # Send a message to the disk message handler to handle simulation request
                internal_json_msg = json.dumps(
                    {"sensor_request_type" : "sim_event",
                     "serial_number" : serial_number,
                     "node_request" : sim_request[0],
                     "uuid" : uuid
                     })

                # Send the event to disk message handler to generate json message
                self._write_internal_msgQ(DiskMsgHandler.name(), internal_json_msg)

        elif component == "NDHW":
            # NDHW stands for Node HW.
            try:
                # Load and instantiate the actuator for the first request
                if self._NodeHW_actuator is None:
                    from actuators.impl.generic.node_hw import NodeHWactuator
                    from framework.utils.ipmi_client import IpmiFactory
                    self.ipmi_client_name = self._conf_reader._get_value_with_default(
                        self.NODE_HW_ACTUATOR, self.IPMI_IMPLEMENTOR, "ipmitool")
                    ipmi_factory = IpmiFactory()
                    ipmi_client = \
                        ipmi_factory.get_implementor(self.ipmi_client_name)
                    # Instantiate NodeHWactuator only if class is loaded
                    if ipmi_client is not None:
                        self._NodeHW_actuator = NodeHWactuator(ipmi_client, self._conf_reader)
                        self._NodeHW_actuator.initialize()
                    else:
                        logger.error(f"IPMI client: '{self.ipmi_client_name}' doesn't exist")
                        return
                node_request = jsonMsg.get("actuator_request_type")

                # Perform the NodeHW request on the node and get the response
                # TODO: Send message to Ack as well as Sensor in their respective channel.
                node_hw_response = self._NodeHW_actuator.perform_request(node_request)
                self._log_debug(f"_process_msg, node_hw_response: {node_hw_response}")

                json_msg = NodeHwAckResponseMsg(node_request, node_hw_response, uuid).getJson()
                self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
            except ImportError as e:
                logger.error(f"Modules could not be loaded: {e}")
                return
            except Exception as e:
                logger.error(f"NodeControllerMsgHandler, _process_msg, Exception in request handling: {e}")
                return

        else:
            response = f"NodeControllerMsgHandler, _process_msg, unknown node controller msg: {node_request}"
            self._log_debug(response)

            json_msg = AckResponseMsg(node_request, response, uuid).getJson()
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
class TestIpmiTool(unittest.TestCase):
    """Test IpmiTool over different platform and interface settings."""

    ERR_STR = "\noutput: %s\nerror: %s\nreturn code: %s"

    def setUp(self):
        """Mock the config values and spawn required class objects."""
        self.mocked_values = {
            "BMC_INTERFACE>default": 'system',
            "/var/cortx/sspl/data/server/ACTIVE_BMC_IF_SN01": 'system',
            "ip": '10.0.0.1',
            "user": '******',
            "secret": ('gAAAAABgi9l0ZR5tSwBoLvDS4m2c6ps5rFzdo1'
                       '-o_mr43C8HYSw5mRRd63je_2251_QU-XlVhgEe_'
                       'k6lQesrrjFVrKkQ70Yfgg==')
        }
        Conf.get = Mock(side_effect=self.mocked_conf)
        store.get = Mock(side_effect=self.mocked_store)
        self.tool = IpmiFactory().get_implementor('ipmitool')

    def mocked_conf(self, *args, **kwargs):
        key = args[1]
        if key.find('bmc') != -1:
            key = key.split('>')[-1]
        return self.mocked_values.get(key, '')

    def mocked_store(self, *args, **kwargs):
        key = args[0]
        return self.mocked_values.get(key, '')

    def test_ipmi_on_vm_over_kcs(self):
        out, err, retcode = self.tool._run_ipmitool_subcommand('sel info')
        err_str = self.ERR_STR % (out, err, retcode)
        self.assertEqual(retcode, 1, msg=err_str)
        self.assertTrue(self.tool.VM_ERROR in err, msg=err_str)

    # TODO: Needs to be implemented
    # def test_ipmi_over_lan(self):
    #     pass

    def test_ipmisimtool(self):
        # Start ipmisimtool
        sspl_test = os.path.join(BASE_DIR, 'sspl_test')
        shutil.copy(f"{sspl_test}/ipmi_simulator/ipmisimtool", "/usr/bin")
        with open(f"{DATA_PATH}/server/activate_ipmisimtool", 'a'):
            os.utime(f"{DATA_PATH}/server/activate_ipmisimtool")

        out, err, retcode = \
            self.tool._run_ipmitool_subcommand("sdr type 'Fan'")
        err_str = self.ERR_STR % (out, err, retcode)
        self.assertEqual(retcode, 0, msg=err_str)
        self.assertEqual(err, '', msg=err_str)
        self.assertTrue('Fan' in out, msg=err_str)

        # Stop ipmisimtool
        os.remove('/usr/bin/ipmisimtool')
        os.remove(f"{DATA_PATH}/server/activate_ipmisimtool")

    def tearDown(self):
        pass
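# A self-contained sketch (not taken from the test suite above) of the mocking
# pattern setUp uses: Mock(side_effect=...) routes every lookup through a local
# dictionary, so config reads can be faked without touching real config files.
# The mocked_values content and fake_conf_get helper here are illustrative only.
from unittest.mock import Mock

mocked_values = {"BMC_INTERFACE>default": "system", "ip": "10.0.0.1"}

def fake_conf_get(index, key, default=""):
    """Return a canned value for a config key, mimicking a Conf.get call."""
    return mocked_values.get(key, default)

conf_get = Mock(side_effect=fake_conf_get)
assert conf_get("GLOBAL", "ip") == "10.0.0.1"
assert conf_get("GLOBAL", "missing_key", "fallback") == "fallback"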
def initialize(self): """Performs basic Node HW actuator initialization""" self.sensor_id_map = self._executor.get_fru_list_by_type( ['fan', 'power supply', 'drive slot / bay'], sensor_id_map={}) self.ipmi_client = IpmiFactory().get_implementor('ipmitool')
class NodeHWactuator(Actuator, Debug):
    """Handles request messages for Node server requests."""

    ACTUATOR_NAME = "NodeHWactuator"
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    NODE_REQUEST_MAP = {
        "disk": "Drive Slot / Bay",
        "fan": "Fan",
        "psu": "Power Supply"
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return NodeHWactuator.ACTUATOR_NAME

    def __init__(self, executor, conf_reader):
        super(NodeHWactuator, self).__init__()
        self.host_id = socket.getfqdn()
        self.sensor_id_map = None
        self._executor = executor
        self.fru_specific_info = {}
        self._resource_id = ""
        self._sensor_type = ""

    def initialize(self):
        """Performs basic Node HW actuator initialization"""
        self.sensor_id_map = self._executor.get_fru_list_by_type(
            ['fan', 'power supply', 'drive slot / bay'],
            sensor_id_map={})
        self.ipmi_client = IpmiFactory().get_implementor('ipmitool')

    def _get_fru_instances(self, fru, fru_instance):
        """Get the fru information based on fru_type and instance"""
        response = None
        try:
            if self.sensor_id_map:
                fru_dict = self.sensor_id_map[fru.lower()]
                for sensor_id in fru_dict.values():
                    if sensor_id == '':
                        continue
                    sensor_common_info, sensor_specific_info = \
                        self._executor.get_sensor_props(sensor_id)
                    self.fru_specific_info[sensor_id] = sensor_specific_info

            if self.fru_specific_info is not None:
                resource_info = self._parse_fru_info(fru)

            if fru_instance == "*":
                response = self._create_node_fru_json_message(resource_info, fru_instance)
            else:
                for resource in resource_info:
                    if resource['resource_id'] == fru_instance:
                        response = self._create_node_fru_json_message(resource, fru_instance)
                        break
                else:
                    raise Exception("Resource Id Not Found %s" % (fru_instance))
        except KeyError as e:
            logger.error('NodeHWactuator, _get_fru_instances, \
                Unable to process the FRU type: %s' % e)
            return
        except Exception as e:
            logger.exception('NodeHWactuator, _get_fru_instances, \
                Error occurred during request parsing %s' % e)
            return
        return response

    def _parse_fru_info(self, fru):
        """Parses FRU information."""
        specific_info = None
        specifics = []
        for sensor_id, fru_info in self.fru_specific_info.items():
            specific_info = dict()
            for fru_key, fru_value in fru_info.items():
                specific_info[fru_key] = fru_value
            specific_info["resource_id"] = sensor_id
            specifics.append(specific_info)
        if (fru == "Power Supply") or (fru == "Fan") or (fru == "Drive Slot / Bay"):
            if not specifics:
                manufacturer = self._executor.get_manufacturer_name()
                msg = "'%s' sensors not seen in %s node server" % (
                    fru, manufacturer)
                specifics.append({"ERROR": msg})
                logger.critical(msg)
            else:
                for each in specifics:
                    if each.get('States Asserted'):
                        each['States Asserted'] = ' '.join(
                            x.strip() for x in each['States Asserted'].split())
        self.fru_specific_info = {}
        return specifics

    def perform_request(self, json_msg):
        """Performs the Node server request.

        @return: The response string from performing the request
        """
        response = ""
        node_request = json_msg.get("node_controller")
        node_request_instance = node_request.get("node_request").split(":")[:3]
        if node_request_instance == ['NDHW', 'node', 'hw']:
            response = self._process_fru_request(node_request)
        elif node_request_instance == ['NDHW', 'node', 'sensor']:
            response = self._process_sensor_request(node_request)
        return response

    def _process_fru_request(self, node_request):
        """Get the fru information based on node_request.

        @return: The response string from performing the request
        """
        response = ""
        self.fru_node_request = node_request.get("node_request").split(":")[3]
        fru = self.NODE_REQUEST_MAP.get(self.fru_node_request)
        fru_instance = node_request.get("resource")

        if fru_instance.isdigit() and isinstance(int(fru_instance), int):
            fru_dict = self.sensor_id_map.get(fru.lower())
            sensor_id = fru_dict[int(fru_instance)]
            common, specific = self._executor.get_sensor_props(sensor_id)
            response = self._create_node_fru_json_message(specific, sensor_id)
            response['instance_id'] = fru_instance
            response['info']['resource_id'] = sensor_id
            # Converting Fru ID from "HDD 0 Status (0xf0)" to "Drive Slot / Bay #0xf0"
            response['specific_info']['fru_id'] = fru + " #" + common['Sensor ID'].split('(')[1][:-1]
        else:
            response = self._get_fru_instances(fru, fru_instance)

        return response

    def _create_node_fru_json_message(self, specifics, resource_id):
        """Creates JSON response to be sent out to Node Controller
        Message Handler for further validation."""
        resource_type = "node:hw:{0}".format(self.fru_node_request)
        fru = self.ipmi_client.is_fru(self.fru_node_request)
        epoch_time = str(calendar.timegm(time.gmtime()))
        response = {
            "alert_type": "GET",
            "severity": "informational",
            "host_id": self.host_id,
            "instance_id": resource_id,
            "info": {
                "resource_id": resource_id,
                "fru": fru,
                "resource_type": resource_type,
                "event_time": epoch_time
            },
            "specific_info": specifics
        }
        return response

    def _process_sensor_request(self, node_request):
        response = dict()
        # todo : validate on which node request commands are executing.
        # "node_request": "NDHW:node:sensor:Temperature"
        # "resource": "* or PS1 Temperature"
        self._sensor_type = node_request.get('node_request').split(":")[3]
        self._resource_id = node_request.get('resource')

        if self._sensor_type.lower() in list(map(lambda sensor_item: sensor_item.value.lower(), SensorTypes)):
            # fetch generic node info
            self._build_generic_info(response)
            # fetch specific info
            self._build_sensor_info(response, self._sensor_type, self._resource_id)
        else:
            logger.error("Error: Unsupported sensor type {}".format(self._sensor_type))
        return response

    def _get_sensor_properties(self, sensor_name):
        """Get all the properties of a sensor.

        Returns a tuple (common, specific) where common is a dict of common
        sensor properties and their values for this sensor, and specific is a
        dict of the properties specific to this sensor.

        e.g. ipmitool sensor get 'PS1 Temperature'
            Locating sensor record...
            Sensor ID               : PS1 Temperature (0x5c)
            Entity ID               : 10.1
            Sensor Type (Threshold) : Temperature
            Sensor Reading          : 16 (+/- 0) degrees C
            Status                  : ok
            Lower Non-Recoverable   : na
            Lower Critical          : na
            Lower Non-Critical      : na
            Upper Non-Critical      : 55.000
            Upper Critical          : 60.000
            Upper Non-Recoverable   : na
            Positive Hysteresis     : 2.000
            Negative Hysteresis     : 2.000
            Assertion Events        :
            Assertions Enabled      : unc+ ucr+
            Deassertions Enabled    : unc+ ucr+
        """
        try:
            cmd = f"sensor get '{sensor_name}'"
            sensor_get_response, err, return_code = \
                self._executor._run_ipmitool_subcommand(cmd)
            if return_code == 0:
                return self._response_to_dict(sensor_get_response)
            else:
                msg = (f"sensor get '{sensor_name}' :"
                       f" command failed with error {err}")
                logger.warn(msg)
                return self._errorstr_to_dict(sensor_get_response)
        except Exception as err:
            logger.error("Exception occurred in _get_sensor_properties for cmd - sensor get '{0}': {1}".format(sensor_name, err))

    def _errorstr_to_dict(self, data):
        error_resp = {'sensor_error': None}
        try:
            for line in data.split("\n"):
                if "Sensor Reading" in line:
                    error_str = "-".join(line.split(":")[1:])
                    error_resp['sensor_reading'] = error_str
                    break
        except Exception as err:
            logger.error("Exception occurred while parsing sensor error string: {0}".format(err))
        return error_resp

    def _build_generic_info(self, response):
        """Build json with generic information.

        :param response:
        :return:
        """
        response['host_id'] = self.host_id
        response['instance_id'] = self._resource_id
        response['alert_type'] = AlertTypes.GET.value
        response['severity'] = SeverityTypes.INFORMATIONAL.value
        response['info'] = {
            "resource_type": "node:sensor:" + self._sensor_type.lower(),
            "fru": "false",
            "resource_id": self._resource_id,
            "event_time": str(calendar.timegm(time.gmtime())),
        }

    def _build_sensor_info(self, response, sensor_type, sensor_name):
        """Build json with sensor common and specific information.

        :param response:
        :param sensor_type:
        :param sensor_name:
        :return:
        """
        many_sensors = (sensor_name == "*")
        search_args = None if many_sensors else sensor_name
        sdr_type_response, err, return_code = \
            self._executor._run_ipmitool_subcommand(
                f"sdr type '{sensor_type}'", grep_args=search_args)
        if return_code != 0:
            msg = f"sdr type '{sensor_type}' : command failed with error {err}"
            logger.error(msg)
            error_resp = {'sensor_status': err}
            response['specific_info'] = error_resp
        else:
            if many_sensors:
                # for all sensors specific info response will be list
                response['specific_info'] = self._response_to_dict(
                    sdr_type_response, split_char='|',
                    dict_keys=['resource_id', 'sensor_number', 'sensor_status',
                               'entity_id_instance', 'sensor_reading'],
                    many_sensors=True)
            else:
                # for specific sensor specific info response will be dict
                response['specific_info'] = self._response_to_dict(
                    sdr_type_response, split_char='|',
                    dict_keys=['resource_id', 'sensor_number', 'sensor_status',
                               'entity_id_instance', 'sensor_reading'])
                response['specific_info'].update(self._get_sensor_properties(sensor_name))

    def _response_to_dict(self, data, split_char=':', dict_keys=None,
                          many_sensors=False):
        """Take response data, split it on the given split char, and
        convert it into a readable dict.

        :param data: String with multiple lines
        :param split_char: char to split with
        :param dict_keys: List of keys to be used in properties dict
        :param many_sensors: Many lines for sensor name = '*'
        :return:
        """
        many_sensors_data = []
        properties = {}
        try:
            # from properties list split out key and values.
for line in data.split("\n"): if split_char in line: if dict_keys is not None: inner_dict = dict() result = line.split(split_char) # validate result size and dict key size are same if len(result) == len(dict_keys): for i in range(len(result)): inner_dict[dict_keys[i]] = result[i].strip() # This line will break loop by reading single line of result # If user trying to fetch only specific sensor data if not many_sensors: properties = inner_dict break many_sensors_data.append(inner_dict) else: if len(line.split(split_char)) >= 2: properties[line.split(split_char)[0].strip()] = line.split(split_char)[1].strip() except KeyError as e: msg = "Error in parsing response: {}".format(e) logger.error(msg) if many_sensors: return many_sensors_data return properties
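# A standalone sketch of the parsing idea _response_to_dict implements: split
# each line on a delimiter and zip the pieces with a list of keys, or fall
# back to "key : value" pairs when no key list is given. The sample text below
# is hand-written for illustration, not captured from ipmitool.
def parse_delimited(text, split_char=':', dict_keys=None):
    """Parse 'key : value' lines, or map delimited columns onto dict_keys."""
    rows, properties = [], {}
    for line in text.splitlines():
        if split_char not in line:
            continue
        parts = [p.strip() for p in line.split(split_char)]
        if dict_keys is not None and len(parts) == len(dict_keys):
            rows.append(dict(zip(dict_keys, parts)))
        elif dict_keys is None and len(parts) >= 2:
            properties[parts[0]] = parts[1]
    return rows if dict_keys is not None else properties

sample = "Status : ok\nLower Critical : na"
print(parse_delimited(sample))  # {'Status': 'ok', 'Lower Critical': 'na'}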
def __init__(self): """Initialize instance.""" self._ipmi = IpmiFactory().get_implementor("ipmitool")
class Platform: """provides information about server.""" def __init__(self): """Initialize instance.""" self._ipmi = IpmiFactory().get_implementor("ipmitool") @staticmethod def get_os(): """Returns os name({ID}{VERSION_ID}) from /etc/os-release.""" os_release = "" with open("/etc/os-release") as f: os_release = f.read() if os_release: os_id = re.findall('^ID=(.*)\n', os_release, flags=re.MULTILINE) os_version_id = re.findall('^VERSION_ID=(.*)\n', os_release, flags=re.MULTILINE) if os_id and os_version_id: os_info = [ os_str.strip('"') for os_str in os_id + os_version_id ] return "".join(os_info) return os_release def get_manufacturer_name(self): """Returns node server manufacturer name.""" manufacturer = "" cmd = "bmc info" out, _, retcode = self._ipmi._run_ipmitool_subcommand(cmd) if retcode == 0: search_res = re.search(r"Manufacturer Name[\s]+:[\s]+([\w]+)(.*)", out) if search_res: manufacturer = search_res.groups()[0] return manufacturer def get_server_details(self): """ Returns a dictionary of server information. Grep 'FRU device description on ID 0' information using ipmitool command. """ specifics = { "Board Mfg": "", "Board Product": "", "Board Part Number": "", "Product Name": "", "Product Part Number": "", "Manufacturer": self.get_manufacturer_name(), "OS": self.get_os() } cmd = "fru print" prefix = "FRU Device Description : Builtin FRU Device (ID 0)" search_res = "" out, _, retcode = self._ipmi._run_ipmitool_subcommand(cmd) if retcode == 0: # Get only 'FRU Device Description : Builtin FRU Device (ID 0)' information search_res = re.search( r"((.*%s[\S\n\s]+ID 1\)).*)|(.*[\S\n\s]+)" % prefix, out) if search_res: search_res = search_res.group() for key in specifics.keys(): if key in search_res: device_desc = re.search(r"%s[\s]+:[\s]+([\w-]+)(.*)" % key, out) if device_desc: value = device_desc.groups()[0] specifics.update({key: value}) return specifics
class ServerMap(ResourceMap):
    """ServerMap class provides resource map and related information
    like health, manifest, etc.
    """

    name = "server"

    def __init__(self):
        """Initialize server."""
        super().__init__()
        self.log = CustomLog(const.HEALTH_SVC_NAME)
        self.validate_server_type_support()
        self.sysfs = ToolFactory().get_instance('sysfs')
        self.sysfs.initialize()
        self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
        self.cpu_path = self.sysfs_base_path + const.CPU_PATH
        hw_resources = {
            'cpu': self.get_cpu_info,
            'platform_sensors': self.get_platform_sensors_info,
            'memory': self.get_mem_info,
            'fans': self.get_fans_info,
            'nw_ports': self.get_nw_ports_info,
            'sas_hba': self.get_sas_hba_info,
            'sas_ports': self.get_sas_ports_info,
            'disks': self.get_disks_info,
            'psus': self.get_psu_info
        }
        sw_resources = {
            'cortx_sw_services': self.get_cortx_service_info,
            'external_sw_services': self.get_external_service_info,
            'raid': self.get_raid_info
        }
        self.server_resources = {"hw": hw_resources, "sw": sw_resources}
        self._ipmi = IpmiFactory().get_implementor("ipmitool")
        self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']

    def validate_server_type_support(self):
        """Check for supported server type."""
        server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
        logger.debug(self.log.svc_log(f"Server Type:{server_type}"))
        if not server_type:
            msg = "ConfigError: server type is unknown."
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)
        if server_type.lower() not in const.RESOURCE_MAP["server_type_supported"]:
            msg = f"Health provider is not supported for server type '{server_type}'"
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)

    def get_health_info(self, rpath):
        """Fetch health information for given rpath.

        rpath: Resource path to fetch its health
               Examples:
                    node>compute[0]
                    node>compute[0]>hw
                    node>compute[0]>hw>disks
        """
        logger.info(self.log.svc_log(f"Get Health data for rpath:{rpath}"))
        info = {}
        resource_found = False
        nodes = rpath.strip().split(">")
        leaf_node, _ = self.get_node_details(nodes[-1])

        # Fetch health information for all sub nodes
        if leaf_node == "compute":
            info = self.get_server_health_info()
            resource_found = True
        elif leaf_node in self.server_resources:
            for resource, method in self.server_resources[leaf_node].items():
                try:
                    info.update({resource: method()})
                    resource_found = True
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}: {err}"))
                    info = None
        else:
            for node in nodes:
                resource, _ = self.get_node_details(node)
                for res_type in self.server_resources:
                    method = self.server_resources[res_type].get(resource)
                    if not method:
                        logger.error(
                            self.log.svc_log(
                                f"No mapping function found for {res_type}"))
                        continue
                    try:
                        info = method()
                        resource_found = True
                    except Exception as err:
                        logger.error(
                            self.log.svc_log(
                                f"{err.__class__.__name__}: {err}"))
                        info = None
                if resource_found:
                    break

        if not resource_found:
            msg = f"Invalid rpath or health provider doesn't have support for '{rpath}'."
logger.error(self.log.svc_log(f"{msg}")) raise ResourceMapError(errno.EINVAL, msg) return info @staticmethod def _is_any_resource_unhealthy(fru, data): """Check for any unhealthy resource at child level.""" for child in data[fru]: if isinstance(child, dict): if child.get("health") and \ child["health"]["status"].lower() != "ok": return True return False def get_server_health_info(self): """Returns overall server information""" unhealthy_resource_found = False server_details = Platform().get_server_details() # Currently only one instance of server is considered server = [] info = {} info["make"] = server_details["Board Mfg"] info["model"] = server_details["Product Name"] try: build_instance = BuildInfo() info["product_family"] = build_instance.get_attribute("NAME") info["version"] = build_instance.get_attribute("VERSION") info["build"] = build_instance.get_attribute("BUILD") except Exception as err: logger.error( self.log.svc_log(f"Unable to get build info due to {err}")) info["resource_usage"] = {} info["resource_usage"]["cpu_usage"] = self.get_cpu_overall_usage() info["resource_usage"]["disk_usage"] = self.get_disk_overall_usage() info["resource_usage"]["memory_usage"] = self.get_memory_overall_usage( ) for res_type in self.server_resources: info.update({res_type: {}}) for fru, method in self.server_resources[res_type].items(): try: info[res_type].update({fru: method()}) unhealthy_resource_found = self._is_any_resource_unhealthy( fru, info[res_type]) except Exception as err: logger.error( self.log.svc_log(f"{err.__class__.__name__}:{err}")) info[res_type].update({fru: None}) info["uid"] = socket.getfqdn() info["last_updated"] = int(time.time()) info["health"] = {} info["health"][ "status"] = "OK" if not unhealthy_resource_found else "Degraded" health_desc = 'good' if info["health"]["status"] == 'OK' else 'bad' info["health"]["description"] = f"Server is in {health_desc} health." 
info["health"]["recommendation"] = const.DEFAULT_RECOMMENDATION \ if info["health"]["status"] != "OK" else "NA" info["health"]["specifics"] = [] server.append(info) return server @staticmethod def get_cpu_usage(index=2, percpu=False): """Get CPU usage list.""" i = 0 cpu_usage = None while i < index: cpu_usage = psutil.cpu_percent(interval=None, percpu=percpu) time.sleep(1) i = i + 1 return cpu_usage def get_cpu_list(self, mode): """Returns the CPU list as per specified mode.""" cpu_info_path = Path(self.cpu_path + mode) # Read the text from /cpu/online file cpu_info = cpu_info_path.read_text() # Drop the \n character from the end of string cpu_info = cpu_info.rstrip('\n') # Convert the string to list of indexes cpu_list = self.sysfs.convert_cpu_info_list(cpu_info) return cpu_list def get_cpu_info(self, add_overall_usage=False): """Update and return CPU information in specific format.""" per_cpu_data = [] cpu_present = self.get_cpu_list("present") cpu_online = self.get_cpu_list("online") cpu_usage = self.get_cpu_usage(percpu=True) cpu_usage_dict = dict(zip(cpu_online, cpu_usage)) overall_cpu_usage = list(psutil.getloadavg()) cpu_count = len(cpu_present) overall_usage = { "current": self.get_cpu_usage(percpu=False), "1_min_avg": overall_cpu_usage[0], "5_min_avg": overall_cpu_usage[1], "15_min_avg": overall_cpu_usage[2] } for cpu_id in range(0, cpu_count): uid = f"CPU-{cpu_id}" cpu_dict = self.get_health_template(uid, is_fru=False) online_status = "Online" if cpu_id in cpu_online else "Offline" health_status = "OK" if online_status == "Online" else "NA" usage = "NA" if health_status == "NA" \ else cpu_usage_dict[cpu_id] specifics = [{"cpu_usage": usage, "state": online_status}] self.set_health_data(cpu_dict, status=health_status, specifics=specifics) per_cpu_data.append(cpu_dict) cpu_data = [{ "overall_usage": overall_usage, "cpu_count": cpu_count, "last_updated": int(time.time()), "cpus": per_cpu_data }] if not add_overall_usage: cpu_data = per_cpu_data logger.debug(self.log.svc_log(f"CPU Health Data:{cpu_data}")) return cpu_data def get_cpu_overall_usage(self): """Returns CPU overall usage.""" overall_usage = None cpu_data = self.get_cpu_info(add_overall_usage=True) if cpu_data[0].get("overall_usage"): overall_usage = cpu_data[0].get("overall_usage") else: logger.error(self.log.svc_log("Failed to get overall cpu usage")) return overall_usage def get_disk_info(self, add_overall_usage=False): """Update and return Disk information in specific format.""" per_disk_data = [] overall_usage = None disk_data = [{ "overall_usage": overall_usage, "last_updated": int(time.time()), "disks": per_disk_data }] if not add_overall_usage: disk_data = per_disk_data logger.debug(self.log.svc_log(f"Disk Health Data:{disk_data}")) return disk_data def format_ipmi_platform_sensor_reading(self, reading): """builds json resposne from ipmi tool response. reading arg sample: ('CPU1 Temp', '01', 'ok', '3.1', '36 degrees C') """ uid = '_'.join(reading[0].split()) sensor_id = reading[0] sensor_props = self._ipmi.get_sensor_props(sensor_id) lower_critical = sensor_props[1].get('Lower Critical', 'NA') upper_critical = sensor_props[1].get('Upper Critical', 'NA') lower_non_recoverable = sensor_props[1].get('Lower Non-Recoverable', 'NA') upper_non_recoverable = sensor_props[1].get('Upper Non-Recoverable', 'NA') status = 'OK' if reading[2] == 'ok' else 'NA' health_desc = 'good' if status == 'OK' else 'bad' description = f"{uid} sensor is in {health_desc} health." 
        recommendation = const.DEFAULT_RECOMMENDATION if status != 'OK' else 'NA'
        specifics = [{
            "Sensor Reading": f"{reading[-1]}",
            "lower_critical_threshold": lower_critical,
            "upper_critical_threshold": upper_critical,
            "lower_non_recoverable": lower_non_recoverable,
            "upper_non_recoverable": upper_non_recoverable,
        }]
        resp = self.get_health_template(uid, is_fru=False)
        self.set_health_data(resp, status, description, recommendation,
                             specifics)
        return resp

    def get_platform_sensors_info(self):
        """Get the sensor information based on sensor_type and instance."""
        response = {sensor: [] for sensor in self.platform_sensor_list}
        for sensor in self.platform_sensor_list:
            sensor_reading = self._ipmi.get_sensor_list_by_type(sensor)
            if not sensor_reading:
                logger.debug(
                    self.log.svc_log(f"No sensor data received for :{sensor}"))
                continue
            for reading in sensor_reading:
                response[sensor].append(
                    self.format_ipmi_platform_sensor_reading(reading))
        logger.debug(
            self.log.svc_log(f"Platform Sensor Health Data:{response}"))
        return response

    def get_mem_info(self):
        """Collect & return system memory info in specific format."""
        default_mem_usage_threshold = int(
            Conf.get(SSPL_CONF,
                     "NODEDATAMSGHANDLER>host_memory_usage_threshold", 80))
        data = []
        status = "OK"
        description = "Host memory is in good health."
        self.mem_info = dict(psutil.virtual_memory()._asdict())
        curr_mem_usage_threshold = int(self.mem_info['percent'])
        if curr_mem_usage_threshold > int(default_mem_usage_threshold):
            status = "Overloaded"
            description = (
                f"Current host memory usage is {curr_mem_usage_threshold}, "
                f"beyond configured threshold of {default_mem_usage_threshold}."
            )
        memory_dict = self.prepare_mem_json(status, description)
        data.append(memory_dict)
        logger.debug(self.log.svc_log(f"Memory Health Data:{data}"))
        return data

    def prepare_mem_json(self, status, description):
        """Update and return memory information dict."""
        total_memory = {}
        for key, value in self.mem_info.items():
            if key == 'percent':
                total_memory['percent'] = str(self.mem_info['percent']) + '%'
            else:
                total_memory[key] = str(self.mem_info[key] >> 20) + 'MB'
        uid = "main_memory"
        specifics = [{
            "total": total_memory['total'],
            "available": total_memory['available'],
            "percent": total_memory['percent'],
            "used": total_memory['used'],
            "free": total_memory['free'],
            "active": total_memory['active'],
            "inactive": total_memory['inactive'],
            "buffers": total_memory['buffers'],
            "cached": total_memory['cached'],
            "shared": total_memory['shared'],
            "slab": total_memory['slab']
        }]
        memory_dict = self.get_health_template(uid, is_fru=False)
        self.set_health_data(memory_dict,
                             status=status,
                             description=description,
                             specifics=specifics)
        return memory_dict

    def get_memory_overall_usage(self):
        """Returns Memory overall usage."""
        overall_usage = None
        mem_info = self.get_mem_info()
        if mem_info[0].get("health"):
            overall_usage = mem_info[0]["health"]["specifics"]
        else:
            logger.error(
                self.log.svc_log("Failed to get memory overall usage"))
        return overall_usage

    def get_fans_info(self):
        """Get the Fan sensor information using ipmitool."""
        data = []
        sensor_reading = self._ipmi.get_sensor_list_by_type('Fan')
        if sensor_reading is None:
            msg = "Failed to get Fan sensor reading using ipmitool"
            logger.error(self.log.svc_log(msg))
            return
        for fan_reading in sensor_reading:
            sensor_id = fan_reading[0]
            fan_dict = self.get_health_template(sensor_id, is_fru=True)
            sensor_props = self._ipmi.get_sensor_props(sensor_id)
            status = 'OK' if fan_reading[2] == 'ok' else 'NA'
            lower_critical = sensor_props[1].get('Lower Critical', 'NA')
            upper_critical = sensor_props[1].get('Upper Critical', 'NA')
            specifics = [{
                "Sensor Reading": f"{fan_reading[-1]}",
                "lower_critical_threshold": lower_critical,
                "upper_critical_threshold": upper_critical
            }]
            self.set_health_data(fan_dict, status=status, specifics=specifics)
            data.append(fan_dict)
            logger.debug(self.log.svc_log(f"Fan Health Data:{fan_dict}"))
        return data

    def get_sas_hba_info(self):
        """Return SAS-HBA current health."""
        sas_hba_data = []
        sas_instance = SAS()
        try:
            hosts = sas_instance.get_host_list()  # ['host1']
        except SASError as err:
            hosts = []
            logger.error(self.log.svc_log(err))
        except Exception as err:
            hosts = []
            logger.exception(self.log.svc_log(err))

        for host in hosts:
            host_id = const.SAS_RESOURCE_ID + host.replace('host', '')
            host_data = self.get_health_template(host_id, False)
            try:
                ports = sas_instance.get_port_list(host)
                # ports = ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3']
            except SASError as err:
                ports = []
                logger.error(self.log.svc_log(err))
            except Exception as err:
                ports = []
                logger.exception(self.log.svc_log(err))

            health = "OK"
            specifics = {'num_ports': len(ports), 'ports': []}
            for port in ports:
                try:
                    port_data = sas_instance.get_port_data(port)
                except SASError as err:
                    port_data = []
                    logger.error(self.log.svc_log(err))
                except Exception as err:
                    port_data = []
                    logger.exception(self.log.svc_log(err))
                specifics['ports'].append(port_data)
                if not port_data or port_data['state'] != 'running':
                    health = "NA"
            self.set_health_data(host_data, health, specifics=[specifics])
            sas_hba_data.append(host_data)
        return sas_hba_data

    def get_sas_ports_info(self):
        """Return SAS Ports current health."""
        sas_ports_data = []
        sas_instance = SAS()
        try:
            ports = sas_instance.get_port_list()
            # eg: ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3']
        except SASError as err:
            ports = []
            logger.error(self.log.svc_log(err))
        except Exception as err:
            ports = []
            logger.exception(self.log.svc_log(err))

        for port in ports:
            port_id = 'sas_' + port
            port_data = self.get_health_template(port_id, False)
            try:
                phys = sas_instance.get_phy_list_for_port(port)
                # eg: ['phy-1:0', 'phy-1:1', 'phy-1:2', 'phy-1:3']
            except SASError as err:
                phys = []
                logger.error(self.log.svc_log(err))
            except Exception as err:
                phys = []
                logger.exception(self.log.svc_log(err))

            specifics = {'num_phys': len(phys), 'phys': []}
            health = "OK"
            for phy in phys:
                try:
                    phy_data = sas_instance.get_phy_data(phy)
                except SASError as err:
                    phy_data = {}
                    logger.error(self.log.svc_log(err))
                except Exception as err:
                    phy_data = {}
                    logger.exception(self.log.svc_log(err))
                specifics['phys'].append(phy_data)
                if not phy_data or phy_data['state'] != 'enabled' or \
                        'Gbit' not in phy_data['negotiated_linkrate']:
                    health = "NA"
            self.set_health_data(port_data, health, specifics=[specifics])
            sas_ports_data.append(port_data)
        return sas_ports_data

    def get_nw_ports_info(self):
        """Return the Network ports information."""
        network_cable_data = []
        io_counters = psutil.net_io_counters(pernic=True)

        nw_instance = Network()
        for interface, addrs in psutil.net_if_addrs().items():
            nic_info = self.get_health_template(interface, False)
            specifics = {}
            for addr in addrs:
                if addr.family == socket.AF_INET:
                    specifics["ipV4"] = addr.address
            if interface in io_counters:
                io_info = io_counters[interface]
                specifics = {
                    "networkErrors": io_info.errin + io_info.errout,
                    "droppedPacketsIn": io_info.dropin,
                    "droppedPacketsOut": io_info.dropout,
                    "packetsIn": io_info.packets_recv,
                    "packetsOut": io_info.packets_sent,
                    "trafficIn": io_info.bytes_recv,
                    "trafficOut": io_info.bytes_sent
                }
            # Get the interface health status.
nw_status, nw_cable_conn_status = \ self.get_nw_status(nw_instance, interface) specifics["nwStatus"] = nw_status specifics["nwCableConnStatus"] = nw_cable_conn_status # Map and set the interface health status and description. map_status = { "CONNECTED": "OK", "DISCONNECTED": "Disabled/Failed", "UNKNOWN": "NA" } health_status = map_status[nw_cable_conn_status] desc = "Network Interface '%s' is %sin good health." % ( interface, '' if health_status == "OK" else 'not ') self.set_health_data(nic_info, health_status, description=desc, specifics=[specifics]) network_cable_data.append(nic_info) return network_cable_data def get_nw_status(self, nw_interface, interface): """Read & Return the latest network status from sysfs files.""" try: nw_status = nw_interface.get_operational_state(interface) except NetworkError as err: nw_status = "UNKNOWN" logger.error(self.log.svc_log(err)) except Exception as err: nw_status = "UNKNOWN" logger.exception(self.log.svc_log(err)) try: nw_cable_conn_status = nw_interface.get_link_state(interface) except NetworkError as err: nw_cable_conn_status = "UNKNOWN" logger.exception(self.log.svc_log(err)) except Exception as err: nw_cable_conn_status = "UNKNOWN" logger.exception(self.log.svc_log(err)) return nw_status, nw_cable_conn_status def get_cortx_service_info(self): """Get cortx service info in required format.""" service_info = [] cortx_services = Service().get_cortx_service_list() for service in cortx_services: response = self.get_systemd_service_info(service) if response is not None: service_info.append(response) return service_info def get_external_service_info(self): """Get external service info in required format.""" service_info = [] external_services = Service().get_external_service_list() for service in external_services: response = self.get_systemd_service_info(service) if response is not None: service_info.append(response) return service_info def get_systemd_service_info(self, service_name): """Get info of specified service using dbus API.""" try: unit = Service()._bus.get_object( const.SYSTEMD_BUS, Service()._manager.LoadUnit(service_name)) properties_iface = Interface(unit, dbus_interface=PROPERTIES_IFACE) except DBusException as err: logger.error( self.log.svc_log( f"Unable to initialize {service_name} due to {err}")) return None path_array = properties_iface.Get(const.SERVICE_IFACE, 'ExecStart') try: command_line_path = str(path_array[0][0]) except IndexError as err: logger.error( self.log.svc_log( f"Unable to find {service_name} path due to {err}")) command_line_path = "NA" is_installed = True if command_line_path != "NA" or 'invalid' in properties_iface.Get( const.UNIT_IFACE, 'UnitFileState') else False uid = str(properties_iface.Get(const.UNIT_IFACE, 'Id')) if not is_installed: health_status = "NA" health_description = f"Software enabling {uid} is not installed" recommendation = "NA" specifics = [{ "service_name": uid, "description": "NA", "installed": str(is_installed).lower(), "pid": "NA", "state": "NA", "substate": "NA", "status": "NA", "license": "NA", "version": "NA", "command_line_path": "NA" }] else: service_license = "NA" version = "NA" service_description = str( properties_iface.Get(const.UNIT_IFACE, 'Description')) state = str(properties_iface.Get(const.UNIT_IFACE, 'ActiveState')) substate = str(properties_iface.Get(const.UNIT_IFACE, 'SubState')) service_status = 'enabled' if 'disabled' not in properties_iface.Get( const.UNIT_IFACE, 'UnitFileState') else 'disabled' pid = "NA" if state == "inactive" else str( 
                properties_iface.Get(const.SERVICE_IFACE, 'ExecMainPID'))
            try:
                version = Service().get_service_info_from_rpm(uid, "VERSION")
            except ServiceError as err:
                logger.error(self.log.svc_log(
                    f"Unable to get service version due to {err}"))
            try:
                service_license = Service().get_service_info_from_rpm(
                    uid, "LICENSE")
            except ServiceError as err:
                logger.error(self.log.svc_log(
                    f"Unable to get service license due to {err}"))
            specifics = [{
                "service_name": uid,
                "description": service_description,
                "installed": str(is_installed).lower(),
                "pid": pid,
                "state": state,
                "substate": substate,
                "status": service_status,
                "license": service_license,
                "version": version,
                "command_line_path": command_line_path
            }]
            if service_status == 'enabled' and state == 'active' \
                    and substate == 'running':
                health_status = 'OK'
                health_description = f"{uid} is in good health"
                recommendation = "NA"
            else:
                health_status = state
                health_description = f"{uid} is not in good health"
                recommendation = const.DEFAULT_RECOMMENDATION
        service_info = self.get_health_template(uid, is_fru=False)
        self.set_health_data(service_info, health_status, health_description,
                             recommendation, specifics)
        return service_info

    def get_raid_info(self):
        """Return health data for each configured RAID array."""
        raids_data = []
        for raid in RAIDs.get_configured_raids():
            raid_data = self.get_health_template(raid.id, False)
            health, description = raid.get_health()
            devices = raid.get_devices()
            specifics = [{
                "location": raid.raid,
                "data_integrity_status": raid.get_data_integrity_status(),
                "devices": devices
            }]
            self.set_health_data(raid_data, health, specifics=specifics,
                                 description=description)
            raids_data.append(raid_data)
        return raids_data

    @staticmethod
    def get_disk_overall_usage():
        """Return overall root filesystem usage in GB."""
        units_factor_GB = 1000000000
        overall_usage = {
            "totalSpace": f'{int(psutil.disk_usage("/")[0])//int(units_factor_GB)} GB',
            "usedSpace": f'{int(psutil.disk_usage("/")[1])//int(units_factor_GB)} GB',
            "freeSpace": f'{int(psutil.disk_usage("/")[2])//int(units_factor_GB)} GB',
            "diskUsedPercentage": psutil.disk_usage("/")[3],
        }
        return overall_usage

    def get_disks_info(self):
        """Update and return server drive information in a specific format."""
        disks = []
        for disk in Disk.get_disks():
            uid = disk.path if disk.path else disk.id
            disk_health = self.get_health_template(uid, True)
            health_data = disk.get_health()
            health = "OK" if (health_data['SMART_health'] == "PASSED") else "Fault"
            self.set_health_data(disk_health, health,
                                 specifics=[{"SMART": health_data}])
            disks.append(disk_health)
        logger.debug(self.log.svc_log(f"Disk Health Data:{disks}"))
        return disks

    def get_psu_info(self):
        """Update and return PSU information in a specific format."""
        psus_health_data = []
        for psu in self.get_psus():
            data = self.get_health_template(f'{psu["Location"]}', True)
            health = "OK" if (psu["Status"] == "Present, OK") else "Fault"
            self.set_health_data(data, health, specifics=psu)
            psus_health_data.append(data)
        logger.debug(self.log.svc_log(f"PSU Health Data:{psus_health_data}"))
        return psus_health_data

    @staticmethod
    def get_psus():
        """Parse `dmidecode -t 39` output into a list of PSU attribute dicts."""
        response, _, _ = SimpleProcess("dmidecode -t 39").run()
        matches = re.findall(
            "System Power Supply|Power Unit Group:.*|"
            "Location:.*|Name:.*|Serial Number:.*|"
            "Max Power Capacity:.*|Status: .*|"
            "Plugged:.*|Hot Replaceable:.*", response.decode())
        psus = []
        stack = []
        while matches:
            item = matches.pop()
            # Collect attribute lines until the 'System Power Supply'
            # section header is reached, then build one PSU dict from them.
            while item != "System Power Supply":
                stack.append(item)
                item = matches.pop()
            psu = {}
            while stack:
                key, value = stack.pop().strip().split(":")
                psu[key] = value.strip()
            psus.append(psu)
        return psus
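
# --- Illustrative sketch (not part of the original source): shows how the
# stack-based parsing in get_psus() above turns `dmidecode -t 39` text into
# per-PSU dictionaries. The function name and sample output are hypothetical;
# real dmidecode output varies by platform.
def _example_parse_dmidecode_psus():
    import re  # already imported at module level; repeated for self-containment
    sample = (
        "System Power Supply\n"
        "        Power Unit Group: 1\n"
        "        Location: PSU1\n"
        "        Name: PS Front\n"
        "        Serial Number: 0001\n"
        "        Max Power Capacity: 750 W\n"
        "        Status: Present, OK\n"
        "        Plugged: Yes\n"
        "        Hot Replaceable: Yes\n")
    matches = re.findall(
        "System Power Supply|Power Unit Group:.*|"
        "Location:.*|Name:.*|Serial Number:.*|"
        "Max Power Capacity:.*|Status: .*|"
        "Plugged:.*|Hot Replaceable:.*", sample)
    psus = []
    stack = []
    while matches:
        item = matches.pop()
        # Pop attribute lines until the section header is reached,
        # so each 'System Power Supply' block becomes one dict.
        while item != "System Power Supply":
            stack.append(item)
            item = matches.pop()
        psu = {}
        while stack:
            key, value = stack.pop().strip().split(":")
            psu[key] = value.strip()
        psus.append(psu)
    # e.g. [{'Power Unit Group': '1', 'Location': 'PSU1',
    #        'Status': 'Present, OK', ...}]
    return psus
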
class Platform:
    """Provides information about the server."""

    def __init__(self):
        """Initialize instance."""
        self._ipmi = IpmiFactory().get_implementor("ipmitool")

    @staticmethod
    def get_os_info():
        """Return OS information from /etc/os-release."""
        os_release = ""
        os_info = {}
        with open("/etc/os-release") as f:
            os_release = f.read()
        if os_release:
            os_lst = os_release.split("\n")
            for line in os_lst:
                data = line.split('=')
                if len(data) > 1 and data[1].strip() != "":
                    key = data[0].strip().lower().replace(" ", "_")
                    value = data[1].strip().replace("\"", "")
                    os_info.update({key: value})
        return os_info

    def get_bmc_info(self):
        """Return node server BMC info."""
        bmc_info = {}
        cmd = "bmc info"
        out, _, retcode = self._ipmi._run_ipmitool_subcommand(cmd)
        if retcode == 0:
            out_lst = out.split("\n")
            for line in out_lst:
                data = line.split(':')
                if len(data) > 1 and data[1].strip() != "":
                    key = data[0].strip().lower().replace(" ", "_")
                    value = data[1].strip()
                    bmc_info.update({key: value})
        return bmc_info

    def get_server_details(self):
        """
        Return a dictionary of server information.

        Grep the 'FRU Device Description : Builtin FRU Device (ID 0)'
        information from the ipmitool 'fru print' output.
        """
        os_info = self.get_os_info()
        specifics = {
            "Board Mfg": "",
            "Board Product": "",
            "Board Part Number": "",
            "Product Name": "",
            "Product Part Number": "",
            "Manufacturer": self._ipmi.get_manufacturer_name(),
            "OS": os_info.get('id', '') + os_info.get('version_id', '')
        }
        cmd = "fru print"
        prefix = "FRU Device Description : Builtin FRU Device (ID 0)"
        search_res = ""
        out, _, retcode = self._ipmi._run_ipmitool_subcommand(cmd)
        if retcode == 0:
            # Keep only the 'Builtin FRU Device (ID 0)' block of the output.
            search_res = re.search(
                r"((.*%s[\S\n\s]+ID 1\)).*)|(.*[\S\n\s]+)" % prefix, out)
            search_res = search_res.group() if search_res else ""
        for key in specifics.keys():
            if key in search_res:
                device_desc = re.search(
                    r"%s[\s]+:[\s]+([\w-]+)(.*)" % key, out)
                if device_desc:
                    value = device_desc.groups()[0]
                    specifics.update({key: value})
        return specifics

    @staticmethod
    def validate_server_type_support(log, Error, server_type):
        """Check for supported server type."""
        logger.debug(log.svc_log(f"Server Type:{server_type}"))
        if not server_type:
            msg = "ConfigError: server type is unknown."
            logger.error(log.svc_log(msg))
            raise Error(errno.EINVAL, msg)
        if server_type.lower() not in const.RESOURCE_MAP["server_type_supported"]:
            msg = (f"{log.service} provider is not supported "
                   f"for server type '{server_type}'")
            logger.error(log.svc_log(msg))
            raise Error(errno.EINVAL, msg)

    @staticmethod
    def get_effective_monitored_services():
        """Get platform type based monitored services."""
        # Align node type with how it is given in the sspl.conf
        # SERVICEMONITOR section.
        node_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY).lower()
        vm_types = ["virtual", "vm"]
        node_type = "vm" if node_type in vm_types else "hw"
        monitored_services = Conf.get(
            SSPL_CONF, f'{SERVICEMONITOR}>{MONITORED_SERVICES}', [])
        excluded_services = Conf.get(
            SSPL_CONF, f'{SERVICEMONITOR}>{EXCLUDED_SERVICES}>{node_type}', [])
        effective_monitored_services = list(
            set(monitored_services) - set(excluded_services))
        logger.debug("Monitored services list, %s" % monitored_services)
        logger.debug("Excluded monitored services list, "
                     "%s for environment %s" % (excluded_services, node_type))
        logger.debug("Effective monitored services list, "
                     "%s" % effective_monitored_services)
        return effective_monitored_services
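
# --- Illustrative sketch (not part of the original source): the set arithmetic
# performed by Platform.get_effective_monitored_services() above. Service names
# and the excluded-services mapping below are hypothetical examples; real
# values come from the SERVICEMONITOR section of sspl.conf.
def _example_effective_monitored_services():
    monitored_services = ["sspl-ll.service", "kafka.service", "hare-hax.service"]
    excluded_services = {
        "vm": ["hare-hax.service"],   # assumed: skipped on virtual machines
        "hw": []
    }
    node_type = "virtual"
    # Normalize the node type the same way the method above does.
    node_type = "vm" if node_type.lower() in ["virtual", "vm"] else "hw"
    effective = list(
        set(monitored_services) - set(excluded_services[node_type]))
    # For the sample data above, 'hare-hax.service' is dropped and the other
    # services remain monitored (ordering is not guaranteed by set()).
    return effective
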