示例#1
0
    def get_cpu_info(self, add_overall_usage=False):
        """Update and return CPU information in specific format.

        Returns a list of per-CPU health dicts or, when
        ``add_overall_usage`` is True, a single-element list that wraps
        the per-CPU data together with overall usage and load averages.
        """
        present_cpus = self.get_cpu_list("present")
        online_cpus = self.get_cpu_list("online")
        usage_by_cpu = dict(zip(online_cpus, self.get_cpu_usage(percpu=True)))
        load_1m, load_5m, load_15m = psutil.getloadavg()
        cpu_count = len(present_cpus)
        overall_usage = {
            "current": self.get_cpu_usage(percpu=False),
            "1_min_avg": load_1m,
            "5_min_avg": load_5m,
            "15_min_avg": load_15m
        }

        per_cpu_data = []
        # NOTE(review): assumes present CPUs are numbered 0..cpu_count-1;
        # confirm this holds when the "present" list is non-contiguous.
        for cpu_id in range(cpu_count):
            cpu_dict = self.get_health_template(f"CPU-{cpu_id}", is_fru=False)
            if cpu_id in online_cpus:
                state, health, usage = "Online", "OK", usage_by_cpu[cpu_id]
            else:
                state, health, usage = "Offline", "NA", "NA"
            self.set_health_data(
                cpu_dict,
                status=health,
                specifics=[{"cpu_usage": usage, "state": state}])
            per_cpu_data.append(cpu_dict)

        if add_overall_usage:
            cpu_data = [{
                "overall_usage": overall_usage,
                "cpu_count": cpu_count,
                "last_updated": int(time.time()),
                "cpus": per_cpu_data
            }]
        else:
            cpu_data = per_cpu_data

        logger.debug(self.log.svc_log(f"CPU Health Data:{cpu_data}"))
        return cpu_data
    def check_and_conclude_initialization(self):
        """Conclude this sensor thread's initialization phase.

        Must be called with ``self.lock`` already held by the caller.
        Transitions ``self.status`` from WAITING to RUNNING/FAILED based
        on dependency and init outcomes, then notifies waiting dependers.
        """
        # Fixed log messages: they previously printed the truncated name
        # "conclude_initializatio()" instead of this method's real name.
        logger.debug("Begin {}.check_and_conclude_initialization()".format(
            self.__class__.__name__))

        # Check that self.lock is held by the caller: if we can acquire it
        # here, the caller did not hold it.
        if self.lock.acquire(blocking=False):
            self.lock.release()
            logger.error("SensorThread.check_and_conclude_initialization() called"\
                " without acquiring lock. Returning immediately")
            return

        if self.status != SensorThreadState.WAITING:
            return

        # It is possible that self.event() is not called at all, so derive
        # the dependency status from the dependee counters directly.
        if self.num_failed_dependees:
            self.deps_status = DependencyState.DEPS_FAILED
        elif not self.remaining_dependees:
            self.deps_status = DependencyState.DEPS_SUCCESS

        definitely_failed = \
                self.deps_status == DependencyState.DEPS_FAILED or \
                self.init_status == InitState.INIT_FAILED or \
                self.has_timed_out
        definitely_succeeded = \
                self.deps_status == DependencyState.DEPS_SUCCESS and \
                self.init_status == InitState.INIT_SUCCESS

        if definitely_failed:
            self.status = SensorThreadState.FAILED
        elif definitely_succeeded:
            self.status = SensorThreadState.RUNNING
        # else it remains as waiting

        if self.status != SensorThreadState.WAITING:
            # Propagate the conclusion to every depender blocked on us.
            for d in self.waiting_dependers:
                d.event(self, self.status == SensorThreadState.RUNNING)

        logger.debug("End {}.check_and_conclude_initialization() with state {}"
                     .format(self.__class__.__name__, self.status))
示例#3
0
    def connect_to_prop_changed_signal(self, service):
        """
           Bind the service to a signal('PropertiesChanged').

           Fetch the service unit from systemd and its state, substate,
           pid etc. Bind the service to the signal which will be triggered
           whenever the service changes its state/substate. Also raise
           an alert if service is in failed/inactive state.

           Returns None on success, or the DBusException on failure.
        """
        try:
            unit, _, state, substate, pid = self.get_service_status(
                service=service)

            self.update_status_local_cache(service, state, substate, pid)

            # Proxy interface used only to register the signal callback.
            Iunit2 = Interface(
                unit, dbus_interface='org.freedesktop.systemd1.Manager')

            # `unit` is bound as a default argument so each callback keeps
            # a reference to its own unit object (avoids late binding).
            Iunit2.connect_to_signal(
                'PropertiesChanged',
                lambda a, b, c, p=unit: self.on_prop_changed(a, b, c, p),
                dbus_interface=PROPERTIES_IFACE)

            logger.debug(f"{service}({pid}) state is {state}:{substate}")

            # Transitional states: remember when we first saw the service
            # not active so a timeout alert can be raised later.
            if state in ["activating", "reloading", "deactivating"]:
                self.not_active_services[service] = \
                                    [self.current_time(), "N/A", "N/A"]
            elif state != "active":
                # Service is already failed/inactive at startup; alert now.
                self.failed_services.append(service)
                self.raise_alert(service, "N/A", state, "N/A", substate, "N/A",
                                 pid, 0)
                logger.error(
                    f"{service} is not active initially. state = {state}:{substate}"
                )

            return None
        except DBusException as err:
            # Caller inspects the return value instead of handling a raise.
            return err
示例#4
0
 def get_fanmodules_info(self):
     """Update and return fan modules health data.

     Returns a list of health-template dicts, one per fan module; empty
     list when the enclosure returns no fan-module data.
     """
     response = []
     fanmodules_data = self.get_realstor_encl_data("fan-modules")
     if fanmodules_data:
         for fan_module in fanmodules_data:
             uid = fan_module.get('durable-id', 'NA')
             health = fan_module.get('health')
             fan_module_resp = self.get_health_template(uid, is_fru=True)
             # Use .get() with a default so a module entry without a
             # 'fan' list cannot raise KeyError.
             specifics = [
                 self.get_fan_specfics(fan)
                 for fan in fan_module.get('fan', [])
             ]
             self.set_health_data(fan_module_resp,
                                  health,
                                  specifics=specifics)
             response.append(fan_module_resp)
         logger.debug(
             self.log.svc_log(f"Fan modules health Data:{response}"))
     else:
         logger.error(
             self.log.svc_log("No response received from fan modules"))
     return response
示例#5
0
    def run(self):
        """Run the sensor on its own thread.

        Replays IEM log entries newer than the last processed timestamp,
        then switches to reading new messages. On I/O or unexpected
        errors the run is rescheduled after 10 seconds.
        """
        # Check for debug mode being activated
        self._read_my_msgQ_noWait()
        try:
            # Keep the log-file handle open on self; _read_iem() continues
            # consuming new lines from it later.
            with self._iem_log_file_lock:
                self._iem_logs = open(self._log_file_path)
            self._create_file(self._timestamp_file_path)

            with open(self._timestamp_file_path, "r") as timestamp_file:
                last_processed_log_timestamp = timestamp_file.read().strip()

            # Read and send unprocessed messages. Each line is assumed to
            # start with a sortable timestamp followed by a space.
            with self._iem_log_file_lock:
                for iem_log in self._iem_logs:
                    log = iem_log.rstrip()
                    log_timestamp = log[:log.index(" ")]
                    if not last_processed_log_timestamp or log_timestamp > last_processed_log_timestamp:
                        self._process_iem(log)

            # Reset debug mode if persistence is not enabled
            self._disable_debug_if_persist_false()

            # Read new messages
            self._read_iem()

        except IOError as io_error:
            # ENOENT (log file not created yet) is expected and logged at
            # debug level; every other I/O failure is an error. The
            # previous EACCES branch was byte-identical to the catch-all
            # else branch, so the two are merged here.
            if io_error.errno == errno.ENOENT:
                logger.debug(f"IEMSensor, self.run, {io_error.args} {io_error.filename}")
            else:
                logger.error(f"IEMSensor, self.run, {io_error.args} {io_error.filename}")
            self._scheduler.enter(10, self._priority, self.run, ())
        except Exception as exception:
            logger.error(f"IEMSensor, self.run, {exception.args}")
            self._scheduler.enter(10, self._priority, self.run, ())
    def _process_msg(self, jsonMsg):
        """Parse the incoming actuator request and handle appropriately.

        Accepts either a dict or a JSON string. For storage-enclosure
        requests, lazily instantiates the RealStor actuator, performs the
        request and publishes the response on the egress queue.
        """
        self._log_debug(f"RealStorActuatorMsgHandler, _process_msg, jsonMsg: {jsonMsg}")

        if not isinstance(jsonMsg, dict):
            jsonMsg = json.loads(jsonMsg)

        # Parse out the uuid so that it can be sent back in Ack message
        uuid = None
        if jsonMsg.get("sspl_ll_msg_header").get("uuid") is not None:
            uuid = jsonMsg.get("sspl_ll_msg_header").get("uuid")
            self._log_debug(f"_processMsg, uuid: {uuid}")

        logger.debug(f"RealStorActuatorMsgHandler: _process_msg: jsonMsg: {jsonMsg}")
        if jsonMsg.get("actuator_request_type").get("storage_enclosure").get("enclosure_request") is not None:
            enclosure_request = jsonMsg.get("actuator_request_type").get("storage_enclosure").get("enclosure_request")
            self._log_debug(f"_processMsg, enclosure_request: {enclosure_request}")
            logger.debug(f"RealStorActuatorMsgHandler: _process_msg: INSIDE: jsonMsg: {jsonMsg}")

            # Split "<request>:<fru>" to validate the request format; the
            # parts themselves are not used further (perform_request
            # receives the full message).
            (request, fru) = enclosure_request.split(":", 1)
            request = request.strip()
            fru = fru.strip()

            # Lazy import/instantiation so the handler loads even when the
            # actuator implementation is unavailable.
            if self._real_stor_actuator is None:
                try:
                    from actuators.impl.generic.realstor_encl import RealStorActuator
                    self._real_stor_actuator = RealStorActuator()
                except ImportError:
                    # logger.warn is a deprecated alias of logger.warning.
                    logger.warning("RealStor Actuator not loaded")
                    return

            # Perform the request and get the response
            real_stor_response = self._real_stor_actuator.perform_request(jsonMsg)
            self._log_debug(f"_process_msg, RealStor response: {real_stor_response}")

            json_msg = RealStorActuatorMsg(real_stor_response, uuid).getJson()
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
示例#7
0
    def on_prop_changed(self, interface, changed_properties,
                        invalidated_properties, unit):
        """Handler to process the service state change signal."""
        _, service, state, substate, pid = self.get_service_status(unit=unit)

        # Previously cached status for this service.
        cached = self.service_status[service]
        prev_state = cached["state"]
        prev_substate = cached["substate"]
        prev_pid = cached["pid"]

        logger.debug(f"Event for {service}, properties changed from "\
                     f"{prev_state}:{prev_substate} to {state}:{substate}")

        # Only substate/pid changed; no state transition to act on.
        if prev_state == state:
            return

        logger.info(f"{service} changed state from " + \
                    f"{prev_state}:{prev_substate} to {state}:{substate}")

        self.update_status_local_cache(service, state, substate, pid)

        self.action_per_transition(service, prev_state, state, prev_substate,
                                   substate, prev_pid, pid)
示例#8
0
 def get_disks_info(self):
     """Update and return server drive information in specific format.

     Builds one health-template dict per drive (SMART data attached as
     specifics) and returns the list sorted by serial number.
     """
     disks = []
     for disk in Disk.get_disks():
         # Prefer the device path as uid; fall back to the disk id.
         uid = disk.path if disk.path else disk.id
         disk_health = self.get_health_template(uid, True)
         health_data = disk.get_health()
         # Anything other than an explicit SMART "PASSED" (including a
         # missing key) is treated as a fault.
         health = "OK" if (health_data.get('SMART_health')
                           == "PASSED") else "Fault"
         serial_number = disk.id.split("-")[-1] if disk.id else "NA"
         health_data.update({"serial_number": serial_number})
         self.set_health_data(disk_health,
                              health,
                              specifics=[{
                                  "SMART": health_data
                              }])
         disks.append(disk_health)
     # Sort disk list by serial_number. (Dead `sort_key_path = None`
     # initializer removed — it was unconditionally overwritten here.)
     sort_key_path = self.resource_indexing_map["hw"]["disk"]
     disks = MonUtils.sort_by_specific_kv(disks, sort_key_path, self.log)
     logger.debug(self.log.svc_log(f"Disk Health Data:{disks}"))
     return disks
    def _get_nwalert(self, interfaces):
        """
        Get network interfaces with fault/OK state for each interface.
        Parameters:
                    interfaces(list) : List of availabel network interfaces

        Returns: Dictionary of network interfaces having key as interface name and value as fault state.

        Return type: dict
        """
        nw_alerts = {}
        try:
            for interface in interfaces:
                interface_name = interface.get("ifId")
                nw_status = interface.get("nwStatus")
                logger.debug("{0}:{1}".format(interface_name, nw_status))
                # fault detected (Down/UNKNOWN, Up/UNKNOWN to Down, Up/Down to UNKNOWN)
                if nw_status == 'DOWN' or nw_status == 'UNKNOWN':
                    if self.prev_nw_status.get(interface_name) != nw_status:
                        if self.prev_nw_status.get(interface_name) and self.prev_nw_status.get(interface_name) == 'UP':
                            logger.warning(f"Network connection fault is detected for interface:'{interface_name}'")
                            nw_alerts[interface_name] = self.FAULT
                        self.prev_nw_status[interface_name] = nw_status
                # fault resolved (Down to Up)
                elif nw_status == 'UP':
                    if self.prev_nw_status.get(interface_name) != nw_status:
                        if self.prev_nw_status.get(interface_name):
                            logger.info(f"Network connection fault is resolved for interface:'{interface_name}'")
                            nw_alerts[interface_name] = self.FAULT_RESOLVED
                        self.prev_nw_status[interface_name] = nw_status
                else:
                    logger.warning(f"Network connection state is:'{nw_status}', for interface:'{interface_name}'")
        except Exception as e:
            logger.error(f"Exception occurs while checking for network alert condition:'{e}'")
        logger.debug("nw_alerts existed for:{}".format(nw_alerts))
        return nw_alerts
示例#10
0
 def get_psu_info(self):
     """Update and return PSUs information in specific format.

     Returns a list of health-template dicts, one per power supply.
     """
     data = []
     psus = self.get_realstor_encl_data("power-supplies")
     for psu in psus:
         uid = psu.get("durable-id")
         status = psu.get("health", "NA")
         description = psu.get("description")
         recommendation = psu.get("health-recommendation")
         specifics = [{
             "location": psu.get("location", "NA"),
             "dc12v": psu.get("dc12v", "NA"),
             "dc5v": psu.get("dc5v", "NA"),
             "dc33v": psu.get("dc33v", "NA"),
             "dc12i": psu.get("dc12i", "NA"),
             "dc5i": psu.get("dc5i", "NA"),
             "dctemp": psu.get("dctemp", "NA")
         }]
         psu_dict = self.get_health_template(uid, is_fru=True)
         self.set_health_data(psu_dict, status, description, recommendation,
                              specifics)
         data.append(psu_dict)
     # Log once after the loop; previously this ran on every iteration,
     # re-logging the growing cumulative list (quadratic log volume).
     logger.debug(self.log.svc_log(f"PSU Health Data:{data}"))
     return data
 def _update_raid_device_file(self, device):
     """Write 'check' to the RAID device's sync_action file.

     Retries up to MAX_RETRIES times on command failure. Returns
     "success" when the state change is accepted, "failed" otherwise;
     re-raises on unexpected exceptions.
     """
     try:
         status = "failed"
         raid_check = 0
         raid_dir = RaidDataConfig.DIR.value
         sync_action_file = RaidDataConfig.SYNC_ACTION_FILE.value
         while raid_check <= RaidDataConfig.MAX_RETRIES.value:
             # NOTE: command is built from config values, not user input;
             # the `tee` pipeline requires shell execution.
             CHECK_COMMAND = "echo 'check' |sudo tee " + raid_dir + device + sync_action_file + " > /dev/null"
             logger.debug('Executing CHECK_COMMAND:{}'.format(CHECK_COMMAND))
             response, error = self._run_command(CHECK_COMMAND)
             if error:
                 # logger.warn is a deprecated alias of logger.warning.
                 logger.warning("Failed in executing command:{}."
                                .format(error))
                 raid_check += 1
                 time.sleep(1)
             else:
                 logger.debug("RAID device state is changed to 'check' with response : {}".format(response))
                 status = "success"
                 break
         return status
     except Exception as ae:
         logger.error("Failed to update RAID File. ERROR:{}"
                      .format(str(ae)))
         raise
示例#12
0
 def get_controllers_info(self):
     """Update and return controller information in specific format.

     Returns a list of health-template dicts, one per controller.
     """
     data = []
     controllers = self.get_realstor_encl_data("controllers")
     for controller in controllers:
         uid = controller.get("durable-id")
         status = controller.get("health", "NA")
         description = controller.get("description")
         recommendation = controller.get("health-recommendation")
         specifics = [{
             "serial-number": controller.get("serial-number", "NA"),
             "disks": controller.get("disks", "NA"),
             "virtual-disks": controller.get("virtual-disks", "NA"),
             "model": controller.get("model", "NA"),
             "part-number": controller.get("part-number", "NA"),
             "fw": controller.get("sc-fw", "NA"),
             "location": controller.get("position", "NA")
         }]
         controller_dict = self.get_health_template(uid, is_fru=True)
         self.set_health_data(controller_dict, status, description,
                              recommendation, specifics)
         data.append(controller_dict)
     # Log once after the loop (was inside the loop, re-logging the
     # growing list each iteration); also fixed "Contollers" typo.
     logger.debug(self.log.svc_log(f"Controllers Health Data:{data}"))
     return data
    def _run_command(self, command):
        """Run the command and get the response and error returned"""
        logger.debug(f"_run_command: {command}")
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        response, error = process.communicate()
        if response:
            logger.debug(f"_run_command, response: {str(response)}")
        if error:
            logger.debug(f"_run_command: error: {str(error)}")

        return response.decode().rstrip('\n'), error.decode().rstrip('\n')
 def _check_mismatch_count(self, device):
     """Compare the RAID mismatch count against the expected clean value.

     Reads <raid_dir>/<device>/<mismatch_cnt file>. On a clean count,
     appends the value to the output file and, if a fault was previously
     persisted for this device, raises a fault-resolved alert and
     restores the configured scan frequency. Returns "success" on a
     clean count, "failed" on a mismatch, None if the read command
     errored without producing the expected response; re-raises on
     unexpected exceptions.
     """
     try:
         status = None
         mismatch_cnt_file = RaidDataConfig.MISMATCH_COUNT_FILE.value
         MISMATCH_COUNT_COMMAND = 'cat ' + self.raid_dir + device +\
                                  mismatch_cnt_file
         logger.debug('Executing MISMATCH_CNT_COMMAND:{}'.format(
             MISMATCH_COUNT_COMMAND))
         response, error = self._run_command(MISMATCH_COUNT_COMMAND)
         if error:
             logger.error("Error in cmd{} in raid health monitor".format(
                 MISMATCH_COUNT_COMMAND))
         if response == RaidDataConfig.MISMATCH_COUNT_RESPONSE.value:
             logger.debug("No mismatch count is found")
             status = "success"
             with open(self.output_file, 'a') as raid_file:
                 raid_file.write(
                     RaidDataConfig.MISMATCH_COUNT_RESPONSE.value)
             # A clean count with a previously persisted fault means the
             # fault is now resolved for this device.
             fault_status_file = self.DEFAULT_RAID_DATA_PATH + device + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value
             if os.path.exists(fault_status_file):
                 with open(fault_status_file, 'r') as fs:
                     data = fs.read().rstrip()
                 # NOTE(review): substring test — if FAULT is a substring
                 # of FAULT_RESOLVED this also matches resolved entries;
                 # confirm the persisted format rules that out.
                 if self.FAULT in data:
                     # Persisted format appears to be "<device>:<state>".
                     faulty_device = data.split(":")[0].rstrip()
                     if device == faulty_device:
                         self.alert_type = self.FAULT_RESOLVED
                         self._alert_msg = "RAID disks present in %s RAID array are synchronized." % device
                         self._send_json_msg(self.alert_type, device,
                                             self._alert_msg)
                         self._update_fault_state_file(
                             device, self.FAULT_RESOLVED, fault_status_file)
                         # Fault cleared: fall back to the configured scan
                         # frequency, bounded below by MIN_SCAN_FREQUENCY.
                         self._scan_frequency = Conf.get(
                             SSPL_CONF,
                             f"{self.RAIDIntegritySensor}>{self.SCAN_FREQUENCY}",
                             self.DEFAULT_SCAN_FREQUENCY)
                         self._scan_frequency = max(self._scan_frequency,
                                                    self.MIN_SCAN_FREQUENCY)
         else:
             status = "failed"
             logger.debug(
                 "Mismatch found in {} file in raid_integrity_data!".format(
                     mismatch_cnt_file))
         return status
     except Exception as ae:
         logger.error(
             "Failed in checking mismatch_cnt in RAID file. ERROR:{}".
             format(str(ae)))
         raise
示例#15
0
 def run(self):
     """Run the sensor on its own thread.

     Drains accumulated messages from the persistent store queue and
     republishes them, then reschedules itself after 30 seconds. Also
     honours the global shutdown message from sspl_ll_d.
     """
     logger.debug("Consul accumulated messages processing started")
     if not self._is_my_msgQ_empty():
         # Check for shut down message from sspl_ll_d and set a flag to shutdown
         #  once our message queue is empty
         self._jsonMsg, _ = self._read_my_msgQ()
         if self._jsonMsg.get("message").get(
                 "actuator_response_type") is not None and \
                 self._jsonMsg.get("message").get(
                     "actuator_response_type").get(
                     "thread_controller") is not None and \
                 self._jsonMsg.get("message").get(
                     "actuator_response_type").get("thread_controller").get(
                     "thread_response") == \
                 "SSPL-LL is shutting down":
             # Fixed log message: the adjacent string literals previously
             # concatenated to "receivedglobal" (missing space).
             logger.info("EgressAccumulatedMsgsProcessor, run, received "
                         "global shutdown message from sspl_ll_d")
             self.shutdown()
     try:
         # TODO : Fix accumulated message processor when message bus changes are available to
         # error out in case of failure (EOS-17626)
         if not self.store_queue.is_empty():
             logger.debug(
                 "Found accumulated messages, trying to send again")
             while not self.store_queue.is_empty():
                 message = self.store_queue.get()
                 if isinstance(message, bytes):
                     message = message.decode()
                 dict_msg = json.loads(message)
                 if "actuator_response_type" in dict_msg["message"]:
                     event_time = dict_msg["message"] \
                         ["actuator_response_type"]["info"]["event_time"]
                     time_diff = int(time.time()) - int(event_time)
                     # Drop stale actuator responses older than MSG_TIMEOUT.
                     if time_diff > self.MSG_TIMEOUT:
                         continue
                 if "sensor_response_type" in dict_msg["message"]:
                     logger.info(f"Publishing Accumulated Alert: {message}")
                 self._producer.send([message])
     except MessageBusError as e:
         logger.error("EgressAccumulatedMsgsProcessor, run, %r" % e)
     except Exception as e:
         logger.error(e)
     finally:
         logger.debug("Consul accumulated processing ended")
         self._scheduler.enter(30, self._priority, self.run, ())
示例#16
0
 def run(self):
     """Run the sensor on its own thread.

     RabbitMQ variant of the accumulated-message processor: drains the
     persistent store queue and republishes messages over the RabbitMQ
     connection, then reschedules itself after 30 seconds.
     """
     logger.debug("Consul accumulated messages processing started")
     if not self._is_my_msgQ_empty():
         # Check for shut down message from sspl_ll_d and set a flag to shutdown
         #  once our message queue is empty
         self._jsonMsg, _ = self._read_my_msgQ()
         if self._jsonMsg.get("message").get("actuator_response_type") is not None and \
             self._jsonMsg.get("message").get("actuator_response_type").get("thread_controller") is not None and \
             self._jsonMsg.get("message").get("actuator_response_type").get("thread_controller").get("thread_response") == \
                 "SSPL-LL is shutting down":
             # Fixed log message: the adjacent string literals previously
             # concatenated to "receivedglobal" (missing space).
             logger.info("RabbitMQEgressAccumulatedMsgsProcessor, run, received "
                             "global shutdown message from sspl_ll_d")
             self.shutdown()
     try:
         if not self.store_queue.is_empty():
             logger.debug(
                 "Found accumulated messages, trying to send again")
             self._connection._establish_connection()
             msg_props = pika.BasicProperties()
             msg_props.content_type = "text/plain"
             while not self.store_queue.is_empty():
                 message = self.store_queue.get()
                 dict_msg = json.loads(message)
                 if "actuator_response_type" in dict_msg["message"]:
                     event_time = dict_msg["message"][
                         "actuator_response_type"]["info"]["event_time"]
                     time_diff = int(time.time()) - int(event_time)
                     # Drop stale actuator responses older than MSG_TIMEOUT.
                     if time_diff > self.MSG_TIMEOUT:
                         continue
                 self._connection.publish(exchange=self._exchange_name,
                                          routing_key=self._routing_key,
                                          properties=msg_props,
                                          body=message)
                 if "sensor_response_type" in dict_msg["message"]:
                     logger.info(f"Publishing Accumulated Alert: {message}")
             self._connection.cleanup()
     except connection_exceptions as e:
         logger.error(connection_error_msg.format(e))
     except Exception as e:
         logger.error(e)
     finally:
         logger.debug("Consul accumulated processing ended")
         self._scheduler.enter(30, self._priority, self.run, ())
示例#17
0
    def get_effective_monitored_services():
        """Get platform type based monitored services.

        Returns the configured monitored-services list minus the services
        excluded for the current platform (vm/hw).
        """
        # Align node type as it is given in sspl.conf SERVICEMONITOR section
        configured_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY).lower()
        node_type = "vm" if configured_type in ["virtual", "vm"] else "hw"

        monitored = Conf.get(
            SSPL_CONF, f'{SERVICEMONITOR}>{MONITORED_SERVICES}', [])
        excluded = Conf.get(
            SSPL_CONF, f'{SERVICEMONITOR}>{EXCLUDED_SERVICES}>{node_type}', [])
        effective = list(set(monitored) - set(excluded))

        logger.debug("Monitored services list, %s" % monitored)
        logger.debug("Excluded monitored services list, " \
            "%s for environment %s" % (excluded, node_type))
        logger.debug("Effective monitored services list, " \
            "%s" % effective)

        return effective
    def _raid_health_monitor(self):
        """Run one RAID integrity pass over all RAID devices.

        For each device: trigger a 'check' sync action, wait for the
        array to return to 'idle', then inspect the mismatch count and
        raise (or avoid duplicating) fault alerts accordingly.
        """
        def _raise_fault_alert(device, fault_status_file):
            # Common fault path: send the alert, persist the fault state
            # and tighten the scan frequency. Extracted because this code
            # was duplicated verbatim in both branches below.
            self.alert_type = self.FAULT
            self._alert_msg = "RAID disks present in %s RAID array"\
                ", needs synchronization. If fault persists for "\
                "more than 2 days, Please contact Seagate support."%device
            self._send_json_msg(self.alert_type, device, self._alert_msg)
            self._update_fault_state_file(device, self.FAULT,
                                          fault_status_file)
            self._scan_frequency = self.MIN_SCAN_FREQUENCY

        try:
            devices = self._get_devices()
            if len(devices) == 0:
                return
            logger.debug("Fetched devices:{}".format(devices))

            for device in devices:
                # Update the state as 'check' for RAID device file
                result = self._update_raid_device_file(device)
                if result == "failed":
                    self._retry_execution(self._update_raid_device_file,
                                          device)
                logger.info("RAID device state is changed to 'check'")

                # Check RAID device array state is 'idle' or not
                result = self._check_raid_state(device)
                if result == "failed":
                    # logger.warn is a deprecated alias of logger.warning.
                    logger.warning(
                        "'Idle' state not found for RAID device:{}".format(
                            device))
                    # Retry to check RAID state
                    self._retry_execution(self._check_raid_state, device)
                logger.info(
                    "'idle' state is found in Raid device:{}.".format(device))

                # Check Mismatch count in RAID device files.
                result = self._check_mismatch_count(device)
                if result == "failed":
                    # Persist RAID device fault state and send alert
                    fault_status_file = self.DEFAULT_RAID_DATA_PATH + device + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value
                    if os.path.exists(fault_status_file):
                        with open(fault_status_file, 'r') as fs:
                            data = fs.read().rstrip()
                        # Alert again only if the last recorded state was
                        # fault-resolved (avoids duplicate fault alerts).
                        if self.FAULT_RESOLVED in data:
                            _raise_fault_alert(device, fault_status_file)
                    else:
                        _raise_fault_alert(device, fault_status_file)

                    # Retry to check mismatch_cnt
                    self._retry_execution(self._check_mismatch_count, device)
                logger.debug(
                    "No mismatch count is found in Raid device:{}".format(
                        device))

        except Exception as ae:
            # Chain the original exception so the root cause is preserved.
            raise Exception(f"Failed in monitoring RAID health, {ae}") from ae
示例#19
0
    def action_per_transition(self, service, prev_state, state, prev_substate,
                              substate, prev_pid, pid):
        """Take action according to the state change of the service.

        Maintains the not_active_services / failed_services bookkeeping
        for one (prev_state -> state) transition of `service`.
        """
        # alert_info_index : index pointing to alert_info table from
        #               ServiceMonitor:raise_alerts() representing alert
        #               description, type, impact etc. to be sent.
        # NOTE(review): within this block only index 3 (unhandled
        # transition) is consumed; indices 0 (fault) and 2 (resolved) are
        # presumably used by raise_alerts() — verify in caller.
        alert_info_index = -1

        logger.debug(f"ServiceMonitor:action_per_transition for {service} : " + \
            f"({prev_state}:{prev_substate}) -> ({state}:{substate})")

        if prev_state in ["active", "reloading"]:
            if state == "active":
                # reloading -> active
                # NOTE(review): pop() without a default raises KeyError if
                # the service was never tracked as not-active — confirm
                # callers guarantee membership on this path.
                self.not_active_services.pop(service)
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
            elif state != "failed":
                # active -> deactivating/inactive/reloading/activating
                # or
                # reloading -> deactivating/inactive/activating
                self.not_active_services[service] = \
                    [self.current_time(), prev_state, prev_substate]
            elif state == "failed":
                # active/reloading -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
        elif prev_state == "deactivating":
            if state in ["inactive", "activating"]:
                # deactivating -> inactive/activating
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                        [self.current_time(), prev_state, prev_substate]
            elif state == "failed":
                # deactivating -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
            elif state == "active":
                # deactivating -> active
                if service in self.not_active_services:
                    self.not_active_services.pop(service)
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
            else:
                # Unexpected target state for this transition.
                alert_info_index = 3
        elif prev_state in ["inactive", "failed"]:
            if state == "activating":
                # inactive/failed -> activating
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                        [self.current_time(), prev_state, prev_substate]
            elif state == "active":
                # inactive/failed -> active
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
                if service in self.not_active_services:
                    self.not_active_services.pop(service)
            elif state == "failed":
                # inactive -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
            else:
                # Unexpected target state for this transition.
                alert_info_index = 3
        elif prev_state == "activating":
            # Leaving "activating" in any direction clears the pending entry.
            if service in self.not_active_services:
                self.not_active_services.pop(service)
            if state in ["inactive", "deactivating"]:
                # activating -> inactive/deactivating
                self.failed_services.append(service)
                alert_info_index = 0
            elif state == "active":
                # activating -> active
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
                else:
                    # its a restart.
                    pass
            elif state == "failed":
                # activating -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
            else:
                # Unexpected target state for this transition.
                alert_info_index = 3

        if alert_info_index == 3:
            logger.warning(f"{service} service state transition from "\
                           f"{prev_state} to {state} is not handled.")
        if alert_info_index != -1:
            self.raise_alert(service, prev_state, state, prev_substate,
                             substate, prev_pid, pid, alert_info_index)
示例#20
0
    def run(self):
        """Monitor systemd services for state changes via D-Bus signals.

        Registers every service in ``services_to_monitor`` for the
        'PropertiesChanged' D-Bus signal (raising an alert for any service
        that is not active initially or whose Unit is not found), then
        drives a GLib main-loop context: every ``thread_sleep`` seconds
        pending signal events are dispatched, and every
        ``polling_frequency`` seconds still-unregistered services are
        retried and not-active services are re-checked.

        Raises:
            ThreadException: wraps any GLib, DBus or unexpected error so
                the sensor-thread framework handles failures uniformly.
        """
        logger.info(f"Monitoring Services : {self.services_to_monitor}")
        try:
            # Register all the services to signal of 'PropertiesChanged' and
            # raise an alert if some service is not active initially or if
            # Unit is not found for the service
            services_to_monitor_copy = self.services_to_monitor.copy()
            for service in services_to_monitor_copy:
                err = self.connect_to_prop_changed_signal(service)
                if err:
                    self.raise_alert(service, "N/A", "N/A", "N/A", "N/A",
                                     "N/A", "N/A", 0)
                    logger.error(
                        f"{service} is not active initially. \n Error {err}")
                else:
                    self.services_to_monitor.remove(service)

            logger.debug(f"failed_services : {self.failed_services}")
            logger.debug(f"services_to_monitor : {self.services_to_monitor}")

            # Retrieve the main loop which will be called in the run method
            self._loop = GLib.MainLoop()

            # Initialize the gobject threads and get its context.
            # NOTE(review): GLib.threads_init() is a no-op on modern
            # PyGObject; kept for compatibility with older versions.
            GLib.threads_init()
            context = self._loop.get_context()

            time_to_check_lists = self.current_time() + self.polling_frequency

            # WHILE LOOP FUNCTION : every second we check for
            # properties change event if any generated (using context
            # iteration) and after a delay of polling frequency we
            # check for inactive processes.
            while self.is_running():
                # At interval of 'thread_sleep' check for events occurred for
                # registered services and process them (call on_pro_changed())
                context.iteration(False)
                time.sleep(self.thread_sleep)

                # At interval of 'polling_frequency' process unregistered
                # services and services with not-active (intermediate) state.
                if time_to_check_lists <= self.current_time():
                    time_to_check_lists = self.current_time() + \
                                            self.polling_frequency

                    # Try to bind the enabled services on the node to the
                    # signal whose Unit was earlier not found. On successfully
                    # registering for service state change signal, remove from
                    # local list as monitoring enabled through SystemD
                    # and to avoid re-registration.
                    services_to_monitor_copy = self.services_to_monitor.copy()
                    for service in services_to_monitor_copy:
                        if not self.connect_to_prop_changed_signal(service):
                            self.services_to_monitor.remove(service)

                    # Check for services in intermediate state (not active)
                    self.check_notactive_services()


            logger.info("ServiceMonitor gracefully breaking out " +\
                                "of dbus Loop, not restarting.")
        except GLib.Error as err:
            # Fixed typo in error text ("Ungrecefully") for consistency
            # with the sibling handlers below.
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of GLib.MainLoop() with error: %s" %
                err)
        except DBusException as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of dbus loop with error: %s" % err)
        except Exception as err:
            raise ThreadException(self.SENSOR_NAME,
                "Ungracefully breaking out of ServiceMonitor:run() "\
                "with error: %s" % err)
示例#21
0
    def _process_msg(self, jsonMsg):
        """Parse the incoming message and hand off to the appropriate logger.

        Two message families are handled:
          * ``actuator_request_type``/``service_controller`` -- service
            start/stop/restart/status/enable/disable requests, routed to
            the Service actuator (which is instantiated lazily on the
            first request).
          * ``sensor_request_type``/``service_status_alert`` -- alerts
            produced by the ServiceMonitor, forwarded to the
            EgressProcessor internal queue.
        """
        logger.debug(f"_process_msg, jsonMsg: {jsonMsg}")

        # Accept both dict and JSON-encoded string payloads.
        if isinstance(jsonMsg, dict) is False:
            jsonMsg = json.loads(jsonMsg)

        # Parse out the uuid so that it can be sent back in Ack message
        uuid = None
        if jsonMsg.get("sspl_ll_msg_header") is not None and \
           jsonMsg.get("sspl_ll_msg_header").get("uuid") is not None:
            uuid = jsonMsg.get("sspl_ll_msg_header").get("uuid")
            logger.debug(f"_processMsg, uuid: {uuid}")

        # Handle service start, stop, restart, status requests
        if "actuator_request_type" in jsonMsg and \
           "service_controller" in jsonMsg["actuator_request_type"]:

            logger.debug("_processMsg, msg_type: service_controller")

            service_name = jsonMsg.get("actuator_request_type") \
                .get("service_controller").get("service_name")
            service_request = jsonMsg.get("actuator_request_type") \
                .get("service_controller").get("service_request")
            # NOTE(review): 'request' is never used below — candidate for
            # removal.
            request = f"{service_request}:{service_name}"

            # Reject requests for services that are not monitored/managed.
            if service_name not in self.monitored_services:
                logger.error(f"{service_name} - service not monitored")
                msg = ("Check if supplied service name is valid, %s is not "
                       "monitored or managed." % service_name)
                self.send_error_response(service_request, service_name, msg,
                                         errno.EINVAL)
                return
            # Any request other than enable/disable requires the service to
            # be enabled first.
            elif service_request not in ["disable", "enable"]:
                status = self._dbus_service.is_enabled(service_name)
                if status == "disabled":
                    logger.error(f"{service_name} - service is disabled")
                    msg = ("%s is disabled, enable request needed before "
                           "current - %s request can be processed." %
                           (service_name, service_request))
                    self.send_error_response(service_request, service_name,
                                             msg, errno.EPERM)
                    return
            # If the state is INITIALIZED, We can assume that actuator is
            # ready to perform operation.
            if actuator_state_manager.is_initialized("Service"):
                logger.debug(f"_process_msg, service_actuator name: \
                                        {self._service_actuator.name()}")
                self._execute_request(self._service_actuator, jsonMsg, uuid)

            # If the state is INITIALIZING, need to send message
            elif actuator_state_manager.is_initializing("Service"):
                # This state will not be reached. Kept here for consistency.
                logger.info("Service actuator is initializing")
                self.send_error_response(service_request, service_name, \
                        "BUSY - Service actuator is initializing.", errno.EBUSY)

            elif actuator_state_manager.is_imported("Service"):
                # This case will be for first request only. Subsequent
                # requests will go to INITIALIZED state case.
                logger.info("Service actuator is imported and initializing")
                from actuators.IService import IService
                actuator_state_manager.set_state(
                    "Service", actuator_state_manager.INITIALIZING)
                service_actuator_class = self._query_utility(IService)
                if service_actuator_class:
                    # NOTE: Instantiation should not be time consuming,
                    # otherwise ServiceMsgHandler will get blocked and will
                    # not be able to serve any subsequent requests. This
                    # applies to instantiation of every actuator.
                    self._service_actuator = service_actuator_class()
                    logger.info(f"_process_msg, service_actuator name: \
                                            {self._service_actuator.name()}")
                    self._execute_request(self._service_actuator, jsonMsg,
                                          uuid)
                    actuator_state_manager.set_state(
                        "Service", actuator_state_manager.INITIALIZED)
                else:
                    logger.info("Service actuator is not instantiated")

            # If there is no entry for actuator in table, We can assume
            # that it is not loaded for some reason.
            else:
                logger.warn("Service actuator is not loaded or not supported")

        # Handle events generated by the service monitor
        elif "sensor_request_type" in jsonMsg and \
            "service_status_alert" in jsonMsg["sensor_request_type"]:
            logger.debug(f"Received alert from ServiceMonitor : {jsonMsg}")
            jsonMsg1 = ServiceMonitorMsg(
                jsonMsg["sensor_request_type"]).getJson()
            self._write_internal_msgQ("EgressProcessor", jsonMsg1)
示例#22
0
 def get_disk_groups_info(self):
     """Update and return disk-group information in specific format."""
     dg_data = []
     # Map each pool serial number to the logical volumes it contains so
     # every disk group can be joined with its volumes below.
     pool_to_volumes = {}
     for lv in self.get_realstor_encl_data("volumes") or []:
         pool_sr_no = lv.get("container-serial", "NA")
         pool_to_volumes.setdefault(pool_sr_no, []).append(
             {"volume_uid": lv.get("volume-name", "NA")})

     for dg in self.get_realstor_encl_data("disk-groups") or []:
         uid = dg.get("name", "NA")
         # 'volumes' is None when no logical volume maps to this group.
         specifics = [{
             "class": dg.get("storage-type", "NA"),
             "disks": dg.get("diskcount", "NA"),
             "size": dg.get("size", "NA"),
             "free": dg.get("freespace", "NA"),
             "status": dg.get("status", "NA"),
             "current_job": dg.get("current-job", "NA"),
             "current_job_completion": dg.get("current-job-completion",
                                              "NA"),
             "tier": dg.get("storage-tier", "NA"),
             "pool": dg.get("pool", "NA"),
             "blocksize": dg.get("blocksize", "NA"),
             "chunksize": dg.get("chunksize", "NA"),
             "volumes": pool_to_volumes.get(
                 dg.get("pool-serial-number", "NA")),
         }]
         dg_entry = self.get_health_template(uid, is_fru=False)
         self.set_health_data(
             dg_entry,
             dg.get("health", "NA"),
             recommendation=dg.get("health-recommendation", "NA"),
             specifics=specifics)
         dg_data.append(dg_entry)
     logger.debug(self.log.svc_log(f"disk-group Health Data:{dg_data}"))
     return dg_data
    def get_system_status(self):
        """Retrieve realstor system state info using cli api /show/system.

        Rate-limited to the configured polling frequency. On success the
        enclosure snapshot is cached in ``self.memcache_system`` and the
        fault caches (``self.latest_faults`` / ``self.memcache_faults``)
        are refreshed, the latter persisted via ``store``. Returns None
        in all cases; failures are logged and the method exits early.
        """

        # poll system would get invoked through multiple realstor sensors
        # with less frequency compared to configured polling frequency
        # adding check to comply with polling frequency
        elapsed = time.time() - self.poll_system_ts

        if elapsed < self.pollfreq:
            logger.warn("/show/system request came in {0} seconds,"
                        "while configured polling frequency is {1} seconds,"
                        "ignoring".format(elapsed, self.pollfreq))
            return

        system = None

        # make ws request
        url = self.build_url(self.URI_CLIAPI_SHOWSYSTEM)

        response = self.ws_request(url, self.ws.HTTP_GET)

        if not response:
            logger.warn("System status unavailable as ws request failed")
            return

        if response.status_code != self.ws.HTTP_OK:
            logger.info("{0}:: http request {1} polling system status failed"
                " with http err {2}".format(self.LDR_R1_ENCL, url, \
                response.status_code))
            return

        self.poll_system_ts = time.time()

        # BUGFIX: initialize before parsing so a malformed-json response
        # cannot leave 'jresponse' unbound (previously the 'if jresponse:'
        # check below raised UnboundLocalError after a ValueError).
        jresponse = None
        try:
            jresponse = json.loads(response.content)
        except ValueError as badjson:
            logger.error("%s returned mal-formed json:\n%s" % (url, badjson))

        if jresponse:
            api_resp = self.get_api_status(jresponse['status'])

            if ((api_resp == -1)
                    and (response.status_code == self.ws.HTTP_OK)):
                logger.warn("/show/system api response unavailable, "
                            "marking success as http code is 200")
                api_resp = 0

            if api_resp == 0:
                system = jresponse['system'][0]
                self.memcache_system = system

            if system:
                # Check if fault exists (direct membership test; the old
                # system.keys() TODO is resolved).
                if self.FAULT_KEY not in system:
                    logger.debug("{0} Healthy, no faults seen".format(
                        self.LDR_R1_ENCL))
                    self.latest_faults = {}
                    return

                # Extract system faults
                self.latest_faults = system[self.FAULT_KEY]

                # If no in-memory fault cache built yet!
                if not self.memcache_faults:
                    # build from persistent cache if available
                    logger.info(
                        "No cached faults, building from  persistent cache {0}"\
                        .format(self.faults_persistent_cache))

                    self.memcache_faults = store.get(
                        self.faults_persistent_cache)

                    # still if none, build from latest faults & persist
                    if not self.memcache_faults:
                        logger.info("No persistent faults cache, building "
                                    "cache from latest faults")

                        self.memcache_faults = self.latest_faults

                        # On SSPL boot, run through existing faults as no
                        # cache to verify with for new faults
                        self.existing_faults = True

                        store.put(self.memcache_faults,
                                  self.faults_persistent_cache)
                else:
                    # Reset flag as existing faults processed by now
                    # and cached faults are built already
                    self.existing_faults = False
            else:
                logger.error("poll system failed with err %d" % api_resp)
    def _transmit_msg_on_exchange(self):
        """Transmit json message onto RabbitMQ exchange.

        Routing rules, in order:
          1. A ThreadController "SSPL-LL is shutting down" response sets
             the shutdown-request flag.
          2. Ack / thread_controller actuator responses are signed and
             published directly.
          3. IEM_routing messages are published only when an IEM route
             address is configured.
          4. Everything else is signed and published; on failure (or when
             earlier messages are already queued) it is pushed to the
             persistent store queue to preserve ordering.
        """
        self._log_debug("_transmit_msg_on_exchange, jsonMsg: %s" %
                        self._jsonMsg)

        try:
            # Check for shut down message from sspl_ll_d and set a flag to shutdown
            #  once our message queue is empty
            if self._jsonMsg.get("message").get(
                    "actuator_response_type") is not None and \
                    self._jsonMsg.get("message").get(
                        "actuator_response_type").get(
                        "thread_controller") is not None and \
                    self._jsonMsg.get("message").get(
                        "actuator_response_type").get("thread_controller").get(
                        "thread_response") == \
                    "SSPL-LL is shutting down":
                logger.info(
                    "RabbitMQegressProcessor, _transmit_msg_on_exchange, received"
                    "global shutdown message from sspl_ll_d")
                self._request_shutdown = True

            # Publish json message to the correct channel
            # NOTE: We need to route ThreadController messages to ACK channel.
            # We can't modify schema as it will affect other modules too. As a
            # temporary solution we have added a extra check to see if actuator_response_type
            # is "thread_controller".
            # TODO: Find a proper way to solve this issue. Avoid changing
            # core egress processor code
            if self._jsonMsg.get("message").get(
                    "actuator_response_type") is not None and \
                    (self._jsonMsg.get("message").get(
                        "actuator_response_type").get("ack") is not None or
                     self._jsonMsg.get("message").get(
                         "actuator_response_type").get(
                         "thread_controller") is not None):
                self._add_signature()
                self._producer.send([json.dumps(self._jsonMsg)])
                logger.debug(
                    "_transmit_msg_on_exchange, Successfully Sent: %s" %
                    self._jsonMsg)

            # Routing requests for IEM msgs sent from the LoggingMsgHandler
            elif self._jsonMsg.get("message").get("IEM_routing") is not None:
                log_msg = self._jsonMsg.get("message").get("IEM_routing").get(
                    "log_msg")
                if self._iem_route_addr != "":
                    self._producer.send([json.dumps(self._jsonMsg)])
                else:
                    logger.warn(
                        "RabbitMQegressProcessor, Attempted to route IEM without a valid 'iem_route_addr' set."
                    )
                logger.debug(
                    "_transmit_msg_on_exchange, Successfully Sent: %s" %
                    log_msg)
            else:
                self._add_signature()
                jsonMsg = json.dumps(self._jsonMsg)
                try:
                    # Only publish directly when nothing is queued, so
                    # previously-failed messages keep their order.
                    if self.store_queue.is_empty():
                        self._producer.send([jsonMsg])
                        logger.info(f"Published Alert: {jsonMsg}")
                    else:
                        logger.info("'Accumulated msg queue' is not Empty." +
                                    " Adding the msg to the end of the queue")
                        self.store_queue.put(jsonMsg)
                except MessageBusError as e:
                    logger.error(
                        f"RabbitMQegressProcessor, _transmit_msg_on_exchange, error {e} in producing message,\
                                    adding message to consul {self._jsonMsg}")
                    self.store_queue.put(jsonMsg)
                except Exception as err:
                    logger.error(
                        f'RabbitMQegressProcessor, _transmit_msg_on_exchange, Unknown error {err} while publishing the message, adding to persistent store {self._jsonMsg}'
                    )
                    self.store_queue.put(jsonMsg)

            # If event is added by sensors, set it
            if self._event:
                self._event.set()

        except Exception as ex:
            logger.error(
                f'RabbitMQegressProcessor, _transmit_msg_on_exchange, problem while publishing the message:{ex}, adding message to consul: {self._jsonMsg}'
            )
示例#25
0
    def perform_request(self, jsonMsg):
        """Performs the service request.

        Extracts the service name and requested operation from either the
        'service_controller' or 'service_watchdog_controller' section of
        *jsonMsg* and executes it via systemd D-Bus: restart/start/stop/
        status/enable/disable.

        Returns:
            tuple: (service_name, result_or_error_message, is_err_response)
            where result is a dict with pid/state/substate/status/uptime/
            command_line_path on success, or an error string on failure.
        """
        self._check_debug(jsonMsg)

        # Parse out the service name and request to perform on it
        if jsonMsg.get("actuator_request_type").get("service_controller") \
                                                                is not None:
            self._service_name = jsonMsg.get("actuator_request_type").\
                                get("service_controller").get("service_name")
            self._service_request = jsonMsg.get("actuator_request_type").\
                        get("service_controller").get("service_request")
        else:
            self._service_name = jsonMsg.get("actuator_request_type").\
                        get("service_watchdog_controller").get("service_name")
            self._service_request = jsonMsg.get("actuator_request_type").\
                        get("service_watchdog_controller").get("service_request")

        logger.debug("perform_request, service_name: %s, service_request: %s" % \
                        (self._service_name, self._service_request))

        try:
            # Load the systemd unit for the service
            systemd_unit = self._manager.LoadUnit(self._service_name)

            # Get a proxy to systemd for accessing properties of units
            self._proxy = self._bus.get_object("org.freedesktop.systemd1", \
                                                            str(systemd_unit))

            # The returned result of the desired action
            result = {}
            is_err_response = False
            if self._service_request in ['restart', 'start']:
                # Before restart/start the service, check service state.
                # If it is not active or activating then only process
                # restart/start request.
                service_state = self._service.get_state(self._service_name)
                state = service_state.state
                if state not in ['active', 'activating']:
                    if self._service_request == "restart":
                        self._service.restart(self._service_name)
                    elif self._service_request == "start":
                        self._service.start(self._service_name)
                    # Ensure we get an "active" state and not "activating";
                    # poll once per second, up to ~20 seconds.
                    service_state = self._service.get_state(self._service_name)
                    state = service_state.state
                    max_wait = 0
                    while state != "active":
                        logger.debug(
                            "%s status is activating, needs 'active' "
                            "state after %s request has been processed, retrying"
                            % (self._service_name, self._service_request))
                        time.sleep(1)
                        max_wait += 1
                        if max_wait > 20:
                            logger.debug("maximum wait - %s seconds, for "
                                         "service restart reached." % max_wait)
                            break
                        service_state = self._service.get_state(
                            self._service_name)
                        state = service_state.state

                else:
                    is_err_response = True
                    err_msg = (
                        "Can not process %s request, for %s, as service "
                        "is already in %s state." %
                        (self._service_request, self._service_name, state))
                    logger.error(err_msg)
                    return (self._service_name, err_msg, is_err_response)

            elif self._service_request == "stop":
                self._service.stop(self._service_name)

            elif self._service_request == "status":
                # Return the status below
                # NOTE(review): this assignment is overwritten by the final
                # get_state() call after the sleep below.
                service_status = self._service.get_state(self._service_name)

            # TODO: Use cortx.utils Service class methods for
            # enable/disable services.
            elif self._service_request == "enable":
                service_list = []
                service_list.append(self._service_name)

                # EnableUnitFiles() function takes second argument as boolean.
                # 'True' will enable a service for runtime only(creates symlink
                #  in /run/.. directory) 'False' will enable a service
                #  persistently (creates symlink in /etc/.. directory)
                _, dbus_result = self._manager.EnableUnitFiles(
                    service_list, False, True)
                res = parse_enable_disable_dbus_result(dbus_result)
                result.update(res)
                logger.debug("perform_request, result for enable request: "
                             "result: %s" % (result))

            elif self._service_request == "disable":
                service_list = []
                service_list.append(self._service_name)

                # DisableUnitFiles() function takes second argument as boolean.
                # 'True' will disable a service for runtime only(removes symlink
                # from /run/.. directory) 'False' will disable a service
                # persistently(removes symlink from /etc/.. directory)
                dbus_result = self._manager.DisableUnitFiles(
                    service_list, False)
                res = parse_enable_disable_dbus_result(dbus_result)
                result.update(res)
                logger.debug(
                    "perform_request, result for disable request: %s" % result)
            else:
                logger.error("perform_request, Unknown service request - %s "
                             "for service - %s" %
                             (self._service_request, self._service_name))
                is_err_response = True
                return (self._service_name, "Unknown service request",
                        is_err_response)

        except debus_exceptions.DBusException as error:
            is_err_response = True
            logger.exception("DBus Exception: %r" % error)
            return (self._service_name, str(error), is_err_response)

        except Exception as ae:
            logger.exception("SystemD Exception: %r" % ae)
            is_err_response = True
            return (self._service_name, str(ae), is_err_response)

        # Give the unit some time to finish starting/stopping to get final status
        time.sleep(5)

        # Get the current status of the process and return it back:
        service_status = self._service.get_state(self._service_name)
        pid = service_status.pid
        state = service_status.state
        substate = service_status.substate
        status = self._service.is_enabled(self._service_name)
        uptime = get_service_uptime(self._service_name)
        # Parse dbus output to fetch command line path with args.
        # NOTE(review): assumes command_line[0][1] is the iterable of
        # exec-path fields — raises IndexError if systemd reports no
        # ExecStart entry; confirm against the dbus property layout.
        command_line = service_status.command_line_path
        command_line_path_with_args = []
        for field in list(command_line[0][1]):
            command_line_path_with_args.append(str(field))
        result["pid"] = pid
        result["state"] = state
        result["substate"] = substate
        result["status"] = status
        result["uptime"] = uptime
        result["command_line_path"] = command_line_path_with_args

        logger.debug("perform_request, state: %s, substate: %s" %
                     (str(state), str(substate)))
        return (self._service_name, result, is_err_response)
示例#26
0
    def _run_ipmitool_subcommand(self, subcommand, grep_args=None):
        """Executes ipmitool sub-commands, and optionally greps the output.

        Args:
            subcommand: ipmitool sub-command string (e.g. "sel info").
            grep_args: optional regex; when supplied and the command
                succeeds, only output lines matching it are kept.

        Returns:
            tuple: (out, error, retcode) from the executed command; on a
            non-zero retcode with empty stderr, stdout is moved into
            ``error`` so callers always find the failure text there.
        """
        self.ACTIVE_IPMI_TOOL = self.IPMITOOL
        host_conf_cmd = ""

        # Set ipmitool to ipmisimtool if activated.
        if os.path.exists(f"{DATA_PATH}/server/activate_ipmisimtool"):
            cmd = self.IPMISIMTOOL + " sel info"
            _, _, retcode = SimpleProcess(cmd).run()
            if retcode in [0, 2]:
                self.ACTIVE_IPMI_TOOL = self.IPMISIMTOOL
                logger.debug("IPMI simulator is activated.")

        # Fetch channel info from config file and cache.
        _channel_interface = Conf.get(
            SSPL_CONF, "%s>%s" % (BMC_INTERFACE, BMC_CHANNEL_IF))

        _active_interface = store.get(BMCInterface.ACTIVE_BMC_IF.value, None)
        if isinstance(_active_interface, bytes):
            _active_interface = _active_interface.decode()
        # Set host_conf_cmd based on channel info: the real tool over a
        # LAN interface needs BMC host/credentials on the command line.
        if (self.ACTIVE_IPMI_TOOL != self.IPMISIMTOOL
                and _active_interface in BMCInterface.LAN_IF.value):
            bmc_ip = Conf.get(GLOBAL_CONF, BMC_IP_KEY, '')
            bmc_user = Conf.get(GLOBAL_CONF, BMC_USER_KEY, 'ADMIN')
            bmc_secret = Conf.get(GLOBAL_CONF, BMC_SECRET_KEY, 'ADMIN')

            decryption_key = encryptor.gen_key(MACHINE_ID,
                                               ServiceTypes.SERVER_NODE.value)
            bmc_pass = encryptor.decrypt(decryption_key, bmc_secret, self.NAME)

            host_conf_cmd = BMCInterface.LAN_CMD.value.format(
                _active_interface, bmc_ip, bmc_user, bmc_pass)

        # generate the final cmd and execute on shell.
        command = " ".join([self.ACTIVE_IPMI_TOOL, host_conf_cmd, subcommand])
        command = shlex.split(command)

        out, error, retcode = SimpleProcess(command).run()

        # Decode bytes encoded strings.
        if not isinstance(out, str):
            out = out.decode(self.IPMI_ENCODING)
        if not isinstance(error, str):
            error = error.decode(self.IPMI_ENCODING)

        # Grep the output as per grep_args provided.
        if grep_args is not None and retcode == 0:
            out = '\n'.join(line for line in out.split('\n')
                            if re.search(grep_args, line) is not None)

        # Assign error_msg to err from output
        if retcode and not error:
            out, error = error, out
        # Remove '\n' from error, for matching errors to error stings.
        if error:
            error = error.replace('\n', '')

        return out, error, retcode
示例#27
0
 def get_platform_sensors_info(self):
     """Return health data for the enclosure's platform sensors.

     Delegates collection to build_encl_platform_sensors_data() for the
     fixed set of sensor categories and logs the result at debug level.
     """
     # Sensor categories polled from the enclosure platform.
     categories = ['temperature', 'current', 'voltage']
     health_data = self.build_encl_platform_sensors_data(categories)
     logger.debug(
         self.log.svc_log(f"Platform Sensors Health Data:{health_data}"))
     return health_data
示例#28
0
 def get_drives_info(self):
     """Update and return drives information in specific format.

     Builds one manifest dict per physical drive reported by the
     enclosure, then returns the list sorted by the configured disk
     indexing key.

     Returns:
         list[dict]: per-drive manifest entries (uid, type, serial
         number, etc., plus a "specifics" sub-list of raw attributes).
     """
     data = []
     drives = ENCL.get_realstor_encl_data("drives")
     for drive in drives:
         slot = drive.get("slot", -1)
         # Entries without a valid slot do not represent an installed
         # drive; skip them.
         if slot == -1:
             continue
         drive_dict = {
             "uid": drive.get("durable-id", "NA"),
             "type": drive.get("type", "NA"),
             "description": drive.get("description", "NA"),
             "product": drive.get("object-name", "NA"),
             "manufacturer": drive.get("vendor", "NA"),
             "serial_number": drive.get("serial-number", "NA"),
             "version": drive.get("hardware-version", "NA"),
             "part_number": drive.get("part-number", "NA"),
             "last_updated": int(time.time()),
             "specifics": [{
                 # Default to "NA" before slicing: the original called
                 # drive.get("serial-number")[:8] with no default, which
                 # raises TypeError on a record missing the key.
                 "drive-serial-number": drive.get("serial-number",
                                                  "NA")[:8],
                 "model": drive.get("model", "NA"),
                 "slot": slot,
                 "architecture": drive.get("architecture", "NA"),
                 "interface": drive.get("interface", "NA"),
                 "usage": drive.get("usage", "NA"),
                 "current_job_completion":
                     drive.get("current-job-completion", "NA"),
                 "speed": drive.get("speed", "NA"),
                 "size": drive.get("size", "NA"),
                 "enclosure_wwn": drive.get("enclosure-wwn", "NA"),
                 "status": drive.get("status", "NA"),
                 "ssd_life_left": drive.get("ssd-life-left", "NA"),
                 "led_status": drive.get("led-status", "NA"),
                 "temperature": drive.get("temperature", "NA"),
                 "location": drive.get("location", "NA")
             }]
         }
         data.append(drive_dict)
     # Log once after collection; the original logged the growing list
     # on every loop iteration (quadratic log volume).
     logger.debug(self.log.svc_log(f"Drive Manifest Data:{data}"))
     sort_key_path = self.resource_indexing_map["hw"]["disk"]
     data = MonUtils.sort_by_specific_kv(data, sort_key_path, self.log)
     return data
示例#29
0
 def get_drives_info(self):
     """Update and return drives information in specific format.

     For every installed drive reported by the enclosure, fills a
     health template (uid, status, description, recommendation) plus a
     "specifics" entry of raw drive attributes, and returns the list.
     """
     # Attributes copied verbatim from the drive record into
     # "specifics"; every one defaults to "NA" when absent.
     attr_names = (
         "serial-number", "model", "size", "temperature", "disk-group",
         "storage-pool-name", "location", "enclosure-id", "drawer-id",
         "slot", "port", "scsi-id", "blocksize", "blocks", "vendor",
         "revision", "architecture", "interface", "type", "blink",
         "locator-led", "enclosure-wwn", "virtual-disk-serial",
         "led-status", "power-on-hours")
     drive_data = []
     for drive in self.get_realstor_encl_data("drives"):
         # A record without a valid slot is not an installed drive.
         if drive.get("slot", -1) == -1:
             continue
         specifics = [{name: drive.get(name, "NA")
                       for name in attr_names}]
         health_dict = self.get_health_template(
             drive.get("durable-id"), is_fru=True)
         self.set_health_data(
             health_dict,
             drive.get("health", "NA"),
             drive.get("description", "NA"),
             drive.get("health-recommendation", "NA"),
             specifics)
         drive_data.append(health_dict)
     logger.debug(self.log.svc_log(f"disk Health data:{drive_data}"))
     return drive_data
示例#30
0
 def get_versions_info(self):
     """Update and return versions information in specific format.

     Builds one manifest dict per firmware/version record reported by
     the enclosure, including a "specifics" sub-list of the raw version
     fields.

     Returns:
         list[dict]: controller firmware manifest entries.
     """
     data = []
     versions = ENCL.get_realstor_encl_data("versions")
     for version in versions:
         version_dict = {
             "uid": version.get("object-name", "NA"),
             "type": version.get("sc-cpu-type", "NA"),
             "description": version.get("description", "NA"),
             "product": version.get("object-name", "NA"),
             "manufacturer": version.get("vendor", "NA"),
             "serial_number": version.get("serial-number", "NA"),
             "version": version.get("bundle-version", "NA"),
             "part_number": version.get("part-number", "NA"),
             "last_updated": int(time.time()),
             "specifics": [{
                 "sc_cpu_type": version.get("sc-cpu-type", "NA"),
                 "bundle_version": version.get("bundle-version", "NA"),
                 "bundle_base_version":
                     version.get("bundle-base-version", "NA"),
                 "build_date": version.get("build-date", "NA"),
                 "sc_fw": version.get("sc-fw", "NA"),
                 "sc_baselevel": version.get("sc-baselevel", "NA"),
                 "sc_memory": version.get("sc-memory", "NA"),
                 "sc_fu_version": version.get("sc-fu-version", "NA"),
                 "sc_loader": version.get("sc-loader", "NA"),
                 "capi_version": version.get("capi-version", "NA"),
                 "mc_fw": version.get("mc-fw", "NA"),
                 "mc_loader": version.get("mc-loader", "NA"),
                 "mc_base_fw": version.get("mc-base-fw", "NA"),
                 "fw_default_platform_brand":
                     version.get("fw-default-platform-brand", "NA"),
                 "ec_fw": version.get("ec-fw", "NA"),
                 "pld_rev": version.get("pld-rev", "NA"),
                 "prm_version": version.get("prm-version", "NA"),
                 "hw_rev": version.get("hw-rev", "NA"),
                 "him_rev": version.get("him-rev", "NA"),
                 "him_model": version.get("him-model", "NA"),
                 "backplane_type": version.get("backplane-type", "NA"),
                 # NOTE(review): these two source keys mix '-' and '_'
                 # ("host-channel_revision"); looks like a typo in the
                 # upstream key name — confirm against the enclosure
                 # API before "fixing", as changing it alters lookups.
                 "host_channel_revision":
                     version.get("host-channel_revision", "NA"),
                 "disk_channel_revision":
                     version.get("disk-channel_revision", "NA"),
                 "mrc_version": version.get("mrc-version", "NA"),
                 "ctk_version": version.get("ctk-version", "NA")
             }]
         }
         data.append(version_dict)
     # Log once after the loop; the original logged the growing list on
     # every iteration (quadratic log volume).
     logger.debug(
         self.log.svc_log(f"Controller firmware Manifest Data:{data}"))
     return data