def __init__(self):
        super(RealStorPSUSensor, self).__init__(
            self.SENSOR_NAME, self.PRIORITY)

        self._faulty_psu_file_path = None

        self.rssencl = singleton_realstorencl

        # psus persistent cache
        self.psu_prcache = None

        # Holds PSUs with faults. Used for future reference.
        self._previously_faulty_psus = {}

        self.pollfreq_psusensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORPSUSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                        0))

        if self.pollfreq_psusensor == 0:
            self.pollfreq_psusensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()
    def __init__(self):
        super(RealStorEnclosureSensor, self).__init__(self.SENSOR_NAME,
                                                      self.PRIORITY)

        self.rssencl = singleton_realstorencl

        # Flag to indicate suspension of module
        self._suspended = False
        self.os_utils = OSUtils()
Example #3
    def __init__(self):
        super(RAIDsensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
        # Current RAID status information
        self._RAID_status = None

        # Location of hpi data directory populated by dcs-collector
        self._start_delay = 10

        # Flag to indicate suspension of module
        self._suspended = False
        self.os_utils = OSUtils()
Example #4
    def __init__(self, utility_instance=None):
        """init method"""
        super(CPUFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        # CPU info
        self.stored_cpu_info = None
        self.prev_cpu_info = None
        self.current_cpu_info = None
        self.os_utils = OSUtils()
Example #5
    @classmethod
    def get_alert(cls, service, alert):
        if service.state in ["active", "failed"]:
            description = alert.description.format(
                service.name, service.state, service.threshold_waiting_time)
        else:
            description = alert.description.format(service.name, service.state,
                                                   service.nonactive_threshold)
        return {
            "sensor_request_type": {
                "service_status_alert": {
                    "host_id": OSUtils.get_fqdn(),
                    "severity":
                    SeverityReader().map_severity(alert.alert_type),
                    "alert_id": MonUtils.get_alert_id(str(int(time.time()))),
                    "alert_type": alert.alert_type,
                    "info": {
                        "resource_type": cls.RESOURCE_TYPE,
                        "resource_id": service.name,
                        "event_time": str(int(time.time())),
                        "description": description,
                        "impact": alert.impact.format(service.name),
                        "recommendation": alert.recommendation,
                    },
                    "specific_info": {
                        "service_name": service.name,
                        "previous_state": service.previous_state,
                        "state": service.state,
                        "previous_substate": service.previous_substate,
                        "substate": service.substate,
                        "previous_pid": service.previous_pid,
                        "pid": service.pid,
                    }
                }
            }
        }
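
The get_alert payload above is only assembled here; the service and alert arguments are supplied by the caller. The sketch below exercises the same structure with hypothetical stand-in objects (the SimpleNamespace fields and the ServiceMonitor class name are assumptions for illustration, not part of the module):

# Hypothetical stand-ins mirroring the attributes get_alert() reads.
from types import SimpleNamespace

service = SimpleNamespace(
    name="sspl-ll.service", state="failed", previous_state="active",
    threshold_waiting_time=30, nonactive_threshold=60,
    previous_substate="running", substate="dead",
    previous_pid=1234, pid=0)

alert = SimpleNamespace(
    alert_type="fault",
    description="{} changed state to {} (threshold {}s)",
    impact="{} is unavailable",
    recommendation="Restart the service")

# payload = ServiceMonitor.get_alert(service, alert)  # enclosing class name assumed
# print(payload["sensor_request_type"]["service_status_alert"]["info"]["description"])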
Example #6
    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0
        self.sas_ports_status = {}
        self.port_phy_list_dict = {}
        self.sas_phy_stored_alert = None
        self.os_utils = OSUtils()
Example #7
    def __init__(self):
        super(NodeData, self).__init__()

        self.os_utils = OSUtils()
        self._epoch_time = str(int(time.time()))
        # Total number of CPUs
        self.cpus = psutil.cpu_count()
        self.host_id = self.os_utils.get_fqdn()

        # Calculate the load averages on separate blocking threads
        self.load_1min_average = []
        self.load_5min_average = []
        self.load_15min_average = []
        self.prev_bmcip = None
        load_1min_avg = threading.Thread(target=self._load_1min_avg).start()
        load_5min_avg = threading.Thread(target=self._load_5min_avg).start()
        load_15min_avg = threading.Thread(target=self._load_15min_avg).start()

        self.conf_reader = ConfigReader()

        nw_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self._utility_instance = None

        try:
            # Creating the instance of ToolFactory class
            self.tool_factory = ToolFactory()
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(nw_fault_utility)
            if self._utility_instance:
                # Initialize the path as /sys/class/net/
                self.nw_interface_path = self._utility_instance.get_sys_dir_path(
                    'net')
        except KeyError as key_error:
            logger.error(
                f'NodeData, Unable to get the instance of {nw_fault_utility} '
                f'Utility: {key_error}')
        except Exception as err:
            logger.error(
                f'NodeData, Problem occurred while getting the instance of '
                f'{nw_fault_utility}: {err}')
Example #8
    def __init__(self):
        super(RealStorDiskSensor, self).__init__(self.SENSOR_NAME,
                                                    self.PRIORITY)
        self.last_alert = None

        self.rssencl = singleton_realstorencl

        # disks persistent cache
        self.disks_prcache = f"{self.rssencl.frus}disks/"

        self.pollfreq_disksensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORDISKSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                        0))

        if self.pollfreq_disksensor == 0:
            self.pollfreq_disksensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = None
        self._event_wait_results = set()
        self.os_utils = OSUtils()
    def __init__(self):
        super(RealStorLogicalVolumeSensor, self).__init__(
            self.SENSOR_NAME, self.PRIORITY)

        self._faulty_disk_group_file_path = None
        self._faulty_logical_volume_file_path = None

        self.rssencl = singleton_realstorencl

        # logical volumes persistent cache
        self._logical_volume_prcache = None
        # disk groups persistent cache
        self._disk_group_prcache = None

        # Holds Disk Groups with faults. Used for future reference.
        self._previously_faulty_disk_groups = {}
        # Holds Logical Volumes with faults. Used for future reference.
        self._previously_faulty_logical_volumes = {}

        self.pollfreq_DG_logical_volume_sensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORLOGICALVOLUMESENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                            10))

        if self.pollfreq_DG_logical_volume_sensor == 0:
            self.pollfreq_DG_logical_volume_sensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()
        cvg_info = Conf.get(GLOBAL_CONF, CVG_INFO_KEY)
        self.cvg_info_dict = {}
        if cvg_info:
            self.cvg_info_dict = {cvg['name']: idx for idx, cvg in \
                enumerate(cvg_info) if 'name' in cvg}
Example #10
    def __init__(self):
        super(RealStorFanSensor, self).__init__(self.SENSOR_NAME,
                                                self.PRIORITY)
        self.rssencl = singleton_realstorencl

        self._faulty_fan_file_path = None
        self._faulty_fan_modules_list = {}
        self._fan_modules_list = {}

        # fan modules psus persistent cache
        self._fanmodule_prcache = None

        self.pollfreq_fansensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORFANSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                        0))

        if self.pollfreq_fansensor == 0:
            self.pollfreq_fansensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()
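
Each of the constructors above resolves its polling interval the same way: read the sensor-specific POLLING_FREQUENCY_OVERRIDE key from the SSPL configuration and, if it is missing or 0, fall back to the enclosure-wide default in rssencl.pollfreq. A stand-alone sketch of that fallback, with conf_get standing in for Conf.get() (an assumption for illustration):

def resolve_poll_frequency(conf_get, section_key, default_pollfreq):
    """Return the per-sensor override if configured, else the enclosure default."""
    override = int(conf_get(f"{section_key}>POLLING_FREQUENCY_OVERRIDE", 0))
    return override if override > 0 else default_pollfreq

# No override configured -> conf_get returns the supplied default 0, so fall back to 10.
assert resolve_poll_frequency(lambda key, default: default, "REALSTORFANSENSOR", 10) == 10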
Example #11
class NodeData(Debug):
    """Obtains data about the node and makes it available"""

    SENSOR_NAME = "NodeData"

    # conf attribute initialization
    PROBE = 'probe'

    @staticmethod
    def name():
        """@return: name of the module."""
        return NodeData.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return ("Server CPU, network, disk space, process and local mount "
                "data can not be monitored.")

    def __init__(self):
        super(NodeData, self).__init__()

        self.os_utils = OSUtils()
        self._epoch_time = str(int(time.time()))
        # Total number of CPUs
        self.cpus = psutil.cpu_count()
        self.host_id = self.os_utils.get_fqdn()

        # Calculate the load averages on separate blocking threads
        self.load_1min_average = []
        self.load_5min_average = []
        self.load_15min_average = []
        self.prev_bmcip = None
        load_1min_avg = threading.Thread(target=self._load_1min_avg).start()
        load_5min_avg = threading.Thread(target=self._load_5min_avg).start()
        load_15min_avg = threading.Thread(target=self._load_15min_avg).start()

        self.conf_reader = ConfigReader()

        nw_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self._utility_instance = None

        try:
            # Creating the instance of ToolFactory class
            self.tool_factory = ToolFactory()
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(nw_fault_utility)
            if self._utility_instance:
                # Initialize the path as /sys/class/net/
                self.nw_interface_path = self._utility_instance.get_sys_dir_path(
                    'net')
        except KeyError as key_error:
            logger.error(
                f'NodeData, Unable to get the instance of {nw_fault_utility} '
                f'Utility: {key_error}')
        except Exception as err:
            logger.error(
                f'NodeData, Problem occurred while getting the instance of '
                f'{nw_fault_utility}: {err}')

    def read_data(self, subset, debug, units="MB"):
        """Updates data based on a subset"""
        self._set_debug(debug)
        self._log_debug("read_data, subset: %s, units: %s" % (subset, units))

        try:
            # Determine the units factor value
            self.units_factor = 1
            if units == "GB":
                self.units_factor = 1000000000
            elif units == "MB":
                self.units_factor = 1000000
            elif units == "KB":
                self.units_factor = 1000

            self.host_id = self.os_utils.get_fqdn()
            # get_fqdn() first checks socket.gethostname() for the host name; if that
            # is not usable, it falls back to socket.gethostbyaddr(socket.gethostname())[0]
            # and returns a meaningful host name.

            self.local_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')

            # Branch off and gather data based upon value sent into subset
            if subset == "host_update":
                self._get_host_update_data()

            elif subset == "local_mount_data":
                self._get_local_mount_data()

            elif subset == "cpu_data":
                self._get_cpu_data()

            elif subset == "if_data":
                self._get_if_data()

            elif subset == "disk_space_alert":
                self._get_disk_space_alert_data()

        except Exception as e:
            raise Exception(f"Failed to read data, {e}")

        return True

    def _get_host_update_data(self):
        """Retrieves node information for the host_update json message"""
        logged_in_users = []
        uname_keys = ("sysname", "nodename", "version", "release", "machine")
        self.up_time = int(psutil.boot_time())
        self.boot_time = self._epoch_time
        self.uname = dict(zip(uname_keys, os.uname()))
        self.total_memory = dict(psutil.virtual_memory()._asdict())
        self.process_count = len(psutil.pids())
        for users in psutil.users():
            logged_in_users.append(dict(users._asdict()))
        self.logged_in_users = logged_in_users
        # Calculate the current number of running processes at this moment
        total_running_proc = 0
        for proc in psutil.process_iter():
            try:
                pinfo = proc.as_dict(attrs=['status'])
                if pinfo['status'] not in (psutil.STATUS_ZOMBIE,
                                           psutil.STATUS_DEAD,
                                           psutil.STATUS_STOPPED,
                                           psutil.STATUS_IDLE,
                                           psutil.STATUS_SLEEPING):
                    total_running_proc += 1
            except psutil.NoSuchProcess:
                logger.warn(
                    f"(psutil) Process '{proc.name()}' exited unexpectedly.")
        self.running_process_count = total_running_proc

    def _get_local_mount_data(self):
        """Retrieves node information for the local_mount_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(
            self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(
            self.units_factor)
        self.total_swap = int(psutil.swap_memory()[0]) // int(
            self.units_factor)
        self.free_swap = int(psutil.swap_memory()[2]) // int(self.units_factor)
        self.free_inodes = int(100 - math.ceil((float(os.statvfs("/").f_files - os.statvfs("/").f_ffree) \
                             / os.statvfs("/").f_files) * 100))

    def _get_cpu_data(self):
        """Retrieves node information for the cpu_data json message"""
        cpu_core_usage_dict = dict()
        cpu_data = psutil.cpu_times_percent()
        self._log_debug(
            "_get_cpu_data, cpu_data: %s %s %s %s %s %s %s %s %s %s" %
            cpu_data)

        self.csps = 0  # What the hell is csps - cycles per second?
        self.user_time = int(cpu_data[0])
        self.nice_time = int(cpu_data[1])
        self.system_time = int(cpu_data[2])
        self.idle_time = int(cpu_data[3])
        self.iowait_time = int(cpu_data[4])
        self.interrupt_time = int(cpu_data[5])
        self.softirq_time = int(cpu_data[6])
        self.steal_time = int(cpu_data[7])

        self.cpu_usage = psutil.cpu_percent(interval=1, percpu=False)
        # Array to hold data about each CPU core
        self.cpu_core_data = []
        index = 0
        while index < self.cpus:
            self._log_debug(
                "_get_cpu_data, index: %s, 1 min: %s, 5 min: %s, 15 min: %s" %
                (index, self.load_1min_average[index],
                 self.load_5min_average[index],
                 self.load_15min_average[index]))

            cpu_core_data = {
                "coreId": index,
                "load1MinAvg": int(self.load_1min_average[index]),
                "load5MinAvg": int(self.load_5min_average[index]),
                "load15MinAvg": int(self.load_15min_average[index]),
                "ips": 0
            }
            self.cpu_core_data.append(cpu_core_data)
            index += 1

    def _get_if_data(self):
        """Retrieves node information for the if_data json message"""
        net_data = psutil.net_io_counters(pernic=True)
        # Array to hold data about each network interface
        self.if_data = []
        bmc_data = self._get_bmc_info()
        for interface, if_data in net_data.items():
            self._log_debug("_get_if_data, interface: %s %s" %
                            (interface, net_data))
            nw_status = self._fetch_nw_status()
            nw_cable_conn_status = self.fetch_nw_cable_conn_status(interface)
            if_data = {
                "ifId":
                interface,
                "networkErrors":
                (net_data[interface].errin + net_data[interface].errout),
                "droppedPacketsIn":
                net_data[interface].dropin,
                "packetsIn":
                net_data[interface].packets_recv,
                "trafficIn":
                net_data[interface].bytes_recv,
                "droppedPacketsOut":
                net_data[interface].dropout,
                "packetsOut":
                net_data[interface].packets_sent,
                "trafficOut":
                net_data[interface].bytes_sent,
                "nwStatus":
                nw_status[interface][0],
                "ipV4":
                nw_status[interface][1],
                "nwCableConnStatus":
                nw_cable_conn_status
            }
            self.if_data.append(if_data)
        self.if_data.append(bmc_data)

    def _fetch_nw_status(self):
        nw_dict = {}
        nws = os.popen("ip --br a | awk '{print $1, $2, $3}'").read().split(
            '\n')[:-1]
        for nw in nws:
            if nw.split(' ')[2]:
                ip = nw.split(' ')[2].split("/")[0]
            else:
                ip = ""
            nw_dict[nw.split(' ')[0]] = [nw.split(' ')[1], ip]
        logger.debug("network info going is : {}".format(nw_dict))
        return nw_dict

    def fetch_nw_cable_conn_status(self, interface):
        carrier_status = None
        try:
            carrier_status = Network().get_link_state(interface)
        except NetworkError as err:
            # NetworkError (i.e. any OSError) indicates that the carrier file
            # is not accessible, which constitutes the UNKNOWN status for the
            # network cable.
            logger.debug(err)
            carrier_status = "UNKNOWN"
        except Exception as e:
            # All other exceptions are unexpected and are logged as errors.
            logger.exception(
                "Problem occurred while reading from nw carrier file:"
                f" {self.nw_interface_path}/{interface}/carrier. Error: {e}")
        return carrier_status

    def _get_bmc_info(self):
        """
        nwCableConnection will be default UNKNOWN,
        Until solution to find bmc eth port cable connection status is found.
        """
        try:
            bmcdata = {
                'ifId': 'ebmc0',
                'ipV4Prev': "",
                'ipV4': "",
                'nwStatus': "DOWN",
                'nwCableConnStatus': 'UNKNOWN'
            }
            ipdata = sp.Popen(
                "sudo ipmitool lan print",
                shell=True,
                stdout=sp.PIPE,
                stderr=sp.PIPE).communicate()[0].decode().strip()
            bmcip = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ipdata)
            if bmcip:
                bmcip = bmcip[0]
                pingbmchost = "ping -c1 -W1 -q " + bmcip
                child = sp.Popen(pingbmchost.split(), stdout=sp.PIPE)
                streamdata = child.communicate(
                )[0]  #child must be communicated before fetching return code.
                retcode = child.returncode
                if self.prev_bmcip is not None and self.prev_bmcip != bmcip:
                    bmcdata['ipV4Prev'] = self.prev_bmcip
                    bmcdata['ipV4'] = bmcip
                    self.prev_bmcip = bmcip
                else:
                    self.prev_bmcip = bmcdata['ipV4Prev'] = bmcdata[
                        'ipV4'] = bmcip
                if retcode == 0:
                    bmcdata['nwStatus'] = "UP"
                else:
                    logger.warn("BMC Host:{0} is not reachable".format(bmcip))
        except Exception as e:
            logger.error(
                "Exception occurs while fetching bmc_info:{}".format(e))
        return bmcdata

    def _get_disk_space_alert_data(self):
        """Retrieves node information for the disk_space_alert_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(
            self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(
            self.units_factor)
        self.disk_used_percentage = psutil.disk_usage("/")[3]

    def _load_1min_avg(self):
        """Loop forever calculating the one minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_1min_average.append(-1)
            index += 1

        while True:
            # cpu_percent() blocks for the given interval (in seconds) and then returns per-CPU usage
            self.load_1min_average = psutil.cpu_percent(interval=1,
                                                        percpu=True)

    def _load_5min_avg(self):
        """Loop forever calculating the five minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_5min_average.append(-1)
            index += 1

        while True:
            # cpu_percent() blocks for the given interval (in seconds) and then returns per-CPU usage
            self.load_5min_average = psutil.cpu_percent(interval=5,
                                                        percpu=True)

    def _load_15min_avg(self):
        """Loop forever calculating the fifteen minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_15min_average.append(-1)
            index += 1

        while True:
            # cpu_percent() blocks for the given interval (in seconds) and then returns per-CPU usage
            self.load_15min_average = psutil.cpu_percent(interval=15,
                                                         percpu=True)
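
A short usage sketch for NodeData.read_data(); it assumes the SSPL configuration and the sysfs network utility referenced in __init__ are available in the runtime environment:

# Gather root filesystem usage in gigabytes; read_data() sets units_factor to
# 1000000000 for "GB" before filling total_space / free_space / free_inodes.
node = NodeData()
node.read_data("local_mount_data", debug=False, units="GB")
print(node.total_space, node.free_space, node.free_inodes)

# Gather per-interface counters (errors, drops, traffic) plus BMC reachability.
node.read_data("if_data", debug=False)
for interface in node.if_data:
    print(interface["ifId"], interface.get("nwStatus"))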
Example #12
class RealStorFanSensor(SensorThread, InternalMsgQ):


    SENSOR_NAME = "RealStorFanSensor"
    SENSOR_TYPE = "enclosure_fan_module_alert"
    RESOURCE_TYPE = "enclosure:hw:fan"

    PRIORITY = 1

    # Fan Modules directory name
    FAN_MODULES_DIR = "fanmodules"

    # Dependency list
    DEPENDENCIES = {
                    "plugins": ["RealStorEnclMsgHandler"],
                    "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RealStorFanSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Fan modules in storage enclosure can not be monitored."

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
           to function.
        """
        return RealStorFanSensor.DEPENDENCIES

    def __init__(self):
        super(RealStorFanSensor, self).__init__(self.SENSOR_NAME,
                                                self.PRIORITY)
        self.rssencl = singleton_realstorencl

        self._faulty_fan_file_path = None
        self._faulty_fan_modules_list = {}
        self._fan_modules_list = {}

        # fan modules psus persistent cache
        self._fanmodule_prcache = None

        self.pollfreq_fansensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORFANSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                        0))

        if self.pollfreq_fansensor == 0:
            self.pollfreq_fansensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, products):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RealStorFanSensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RealStorFanSensor, self).initialize_msgQ(msgQlist)


        self._fanmodule_prcache = os.path.join(self.rssencl.frus, \
                                      self.FAN_MODULES_DIR)

        # Persistence file location. This file stores faulty FanModule data
        self._faulty_fan_file_path = os.path.join(
            self._fanmodule_prcache, "fanmodule_data.json")

        # Load faulty Fan Module data from file if available
        self._faulty_fan_modules_list = store.get(\
                                           self._faulty_fan_file_path)

        if self._faulty_fan_modules_list is None:
            self._faulty_fan_modules_list = {}
            store.put(self._faulty_fan_modules_list,\
                self._faulty_fan_file_path)

        return True

    def read_data(self):
        """Return the Current fan_module information"""
        return self._fan_modules_list

    def run(self):
        """Run the sensor on its own thread"""

        # Do not proceed if module is suspended
        if self._suspended == True:
            self._scheduler.enter(30, self._priority, self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # Periodically check if there is any fault in the fan_module
        self._check_for_fan_module_fault()

        self._scheduler.enter(self.pollfreq_fansensor, self._priority, self.run, ())

    def _check_for_fan_module_fault(self):
        """Iterates over fan modules list. maintains a dictionary in order to
           keep track of previous health of the FRU in order to set
           alert_type"""

        self._fan_modules_list = self._get_fan_modules_list()
        alert_type = None

        if not self._fan_modules_list:
            return

        try:
            for fan_module in self._fan_modules_list:
                fru_status = fan_module.get("health").lower()
                durable_id = fan_module.get("durable-id").lower()
                health_reason = fan_module.get("health-reason").lower()

                if fru_status == self.rssencl.HEALTH_FAULT and \
                    self._check_if_fan_module_is_installed(health_reason):
                    if durable_id not in self._faulty_fan_modules_list:
                        alert_type = self.rssencl.FRU_MISSING
                        self._faulty_fan_modules_list[durable_id] = alert_type
                    else:
                        prev_alert_type = self._faulty_fan_modules_list[durable_id]
                        if prev_alert_type != self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_MISSING
                            self._faulty_fan_modules_list[durable_id] = alert_type
                elif fru_status == self.rssencl.HEALTH_FAULT or \
                         fru_status == self.rssencl.HEALTH_DEGRADED:
                    if durable_id not in self._faulty_fan_modules_list:
                        alert_type = self.rssencl.FRU_FAULT
                        self._faulty_fan_modules_list[durable_id] = alert_type
                    else:
                        prev_alert_type = self._faulty_fan_modules_list[durable_id]
                        if prev_alert_type != self.rssencl.FRU_FAULT:
                            alert_type = self.rssencl.FRU_FAULT
                            self._faulty_fan_modules_list[durable_id] = alert_type
                elif fru_status == self.rssencl.HEALTH_OK:
                    if durable_id in self._faulty_fan_modules_list:
                        prev_alert_type = \
                            self._faulty_fan_modules_list[durable_id]
                        if prev_alert_type == self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_INSERTION
                        else:
                            alert_type = self.rssencl.FRU_FAULT_RESOLVED
                        del self._faulty_fan_modules_list[durable_id]

                # Persist faulty Fan Module list to file only if there is any
                # type of alert generated
                if alert_type:
                    internal_json_message = \
                        self._create_internal_json_msg(fan_module, alert_type)
                    self._send_json_message(internal_json_message)
                    # Wait till msg is sent to message bus or added in consul for resending.
                    # If timed out, do not update cache and revert in-memory cache,
                    # so the change can be detected again in the next iteration.
                    if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                        store.put(self._faulty_fan_modules_list,\
                            self._faulty_fan_file_path)
                    else:
                        self._faulty_fan_modules_list = store.get(self._faulty_fan_file_path)
                    alert_type = None
        except Exception as e:
            logger.exception(e)

    def _check_if_fan_module_is_installed(self, health_reason):
        """ This function returns true if given string contains substring
            otherwise, it returns false. To achieve this, it uses search
            method of python re module"""

        not_installed_health_string = "not installed"
        return bool(re.search(not_installed_health_string, health_reason))

    def _get_fan_modules_list(self):
        """Returns fan module list using API /show/fan-modules"""

        url = self.rssencl.build_url(
                  self.rssencl.URI_CLIAPI_SHOWFANMODULES)

        response = self.rssencl.ws_request(
                        url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Fan-modules status unavailable as ws request {url} failed")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                                f"to get fan-modules failed with http err {response.status_code}")
            return

        response_data = json.loads(response.text)

        fan_modules_list = response_data["fan-modules"]
        return fan_modules_list

    def _get_fan_attributes(self, fan_module):
        """Returns individual fan attributes from each fan-module"""

        fan_list = []
        fans = {}
        fan_key = ""

        fan_attribute_list = [ 'status', 'name', 'speed', 'durable-id',
            'health', 'fw-revision', 'health-reason', 'serial-number',
                'location', 'position', 'part-number', 'health-recommendation',
                    'hw-revision', 'locator-led' ]

        fru_fans = fan_module.get("fan", [])

        for fan in fru_fans:
            for fan_key in filter(lambda common_key: common_key in fan_attribute_list, fan):
                fans[fan_key] = fan.get(fan_key)
            fan_list.append(fans)
        return fan_list

    def _create_internal_json_msg(self, fan_module, alert_type):
        """Creates internal json structure which is sent to
            realstor_msg_handler for further processing"""

        fan_module_info_key_list = \
            ['name', 'location', 'status', 'health',
                'health-reason', 'health-recommendation', 'enclosure-id',
                'durable-id', 'position']

        fan_module_info_dict = {}
        fan_module_extended_info_dict = {}

        fans_list = self._get_fan_attributes(fan_module)

        for fan_module_key, fan_module_value in fan_module.items():
            if fan_module_key in fan_module_info_key_list:
                fan_module_info_dict[fan_module_key] = fan_module_value

        fan_module_info_dict["fans"] = fans_list

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        fru = self.rssencl.is_storage_fru('FAN MODULE')
        resource_id = fan_module_info_dict.get("name", "")
        host_name = self.os_utils.get_fqdn()

        info = {
                "resource_type": self.RESOURCE_TYPE,
                "fru": fru,
                "resource_id": resource_id,
                "event_time": epoch_time
                }

        # Creates internal json message request structure.
        # this message will be passed to the StorageEnclHandler
        internal_json_msg = json.dumps(
            {"sensor_request_type": {
                "enclosure_alert": {
                        "status": "update",
                        "host_id": host_name,
                        "alert_type": alert_type,
                        "severity": severity,
                        "alert_id": alert_id,
                        "info": info,
                        "specific_info": fan_module_info_dict
                    }
            }})

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _send_json_message(self, json_msg):
        """Transmit data to RealStorMsgHandler to be processed and sent out"""

        self._event.clear()
        # Send the event to real stor message handler
        # to generate json message and send out
        self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RealStorFanSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RealStorFanSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RealStorFanSensor, self).shutdown()
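
The health handling in _check_for_fan_module_fault() is effectively a small state machine keyed on durable-id. The stand-alone sketch below reproduces that decision; the string constants mirror the rssencl.FRU_* / HEALTH_* attribute names used above, but their values here are illustrative assumptions:

FRU_MISSING, FRU_FAULT = "missing", "fault"
FRU_INSERTION, FRU_FAULT_RESOLVED = "insertion", "fault_resolved"

def fan_alert_type(health, health_reason, durable_id, faulty_modules):
    """Return the alert to raise (or None) and update the faulty-module cache."""
    if health == "fault" and "not installed" in health_reason:
        if faulty_modules.get(durable_id) != FRU_MISSING:
            faulty_modules[durable_id] = FRU_MISSING
            return FRU_MISSING
    elif health in ("fault", "degraded"):
        if faulty_modules.get(durable_id) != FRU_FAULT:
            faulty_modules[durable_id] = FRU_FAULT
            return FRU_FAULT
    elif health == "ok" and durable_id in faulty_modules:
        prev = faulty_modules.pop(durable_id)
        return FRU_INSERTION if prev == FRU_MISSING else FRU_FAULT_RESOLVED
    return None

cache = {}
print(fan_alert_type("fault", "fan module not installed", "fan_module_0.0", cache))  # missing
print(fan_alert_type("ok", "", "fan_module_0.0", cache))                             # insertion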
Example #13
class CPUFaultSensor(SensorThread, InternalMsgQ):
    """CPU Fault Sensor which runs on its own thread on each boot up and
       is responsible for sensing changes in online CPUs using
       available tool/utility"""

    SENSOR_NAME = "CPUFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:cpu:core"

    # Section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "CPU-"

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {"plugins": ["NodeDataMsgHandler"], "rpms": []}

    @staticmethod
    def name():
        """@return: name of the module."""
        return CPUFaultSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Server CPU presence and status change can not be monitored."

    def __init__(self, utility_instance=None):
        """init method"""
        super(CPUFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        # CPU info
        self.stored_cpu_info = None
        self.prev_cpu_info = None
        self.current_cpu_info = None
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(CPUFaultSensor, self).initialize(conf_reader)

        super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

        # get the cpu fault implementor from configuration
        cpu_fault_utility = Conf.get(SSPL_CONF,
                                     f"{self.name().upper()}>{self.PROBE}",
                                     'sysfs')

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(cpu_fault_utility)
        except Exception as err:
            raise Exception(
                "Error while initializing. "
                f"Unable to get the instance of {cpu_fault_utility} Utility, {err}"
            )

        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.CPU_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')

        return True

    def read_stored_cpu_info(self):
        """Read the most recent stored cpu info"""
        try:
            if self.stored_cpu_info is None:
                self.stored_cpu_info = store.get(self.CPU_FAULT_SENSOR_DATA)
            if self.stored_cpu_info is not None and self._node_id in self.stored_cpu_info.keys(
            ):
                self.prev_cpu_info = self.stored_cpu_info[
                    self._node_id]['CPU_LIST']
        except Exception as e:
            raise Exception(f"Error while reading stored cpu info, {e}")

    def read_current_cpu_info(self):
        """Read current cpu info"""
        try:
            self.current_cpu_info = self._utility_instance.get_cpu_info()
        except Exception as e:
            raise Exception(f"Error while reading current cpu info, {e}")

    def run(self):
        """Run the sensor on its own thread"""

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()
        # Read recent stored cpu info
        self.read_stored_cpu_info()
        # Store alerts to be sent here
        self.alerts_for = {}
        # Specific info field for alerts
        self.specific_info = []
        # Read current cpu info
        self.read_current_cpu_info()

        to_update = False
        # Compare with previous cpu info
        # If a cpu is present in prev_cpu_info and not present in current_cpu_info : fault alert is generated
        # If a cpu is present in current_cpu_info and not present in prev_cpu_info : two possibilities
        #   1) if cpu has an outstanding fault alert : it is a repaired cpu, hence generate fault_resolved
        #   2) if cpu has no outstanding alert : it is a newly added cpu, do not do anything
        try:
            if self.prev_cpu_info:
                if self.current_cpu_info != self.prev_cpu_info:
                    # Create a set of all relevant cpus
                    cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
                    # Iterate through the set
                    for cpu in cpu_list:
                        if cpu not in self.current_cpu_info and cpu not in self.stored_cpu_info[
                                self._node_id]['FAULT_LIST']:
                            # This is a failed cpu
                            self.stored_cpu_info[
                                self._node_id]['FAULT_LIST'].append(cpu)
                            self.alerts_for[cpu] = "fault"
                        elif cpu not in self.prev_cpu_info and cpu in self.stored_cpu_info[
                                self._node_id]['FAULT_LIST']:
                            # This is a repaired cpu
                            self.alerts_for[cpu] = "fault_resolved"
                    # Update stored cpu info for next run
                    self.stored_cpu_info[
                        self._node_id]['CPU_LIST'] = self.current_cpu_info
                    to_update = True
            else:
                # Previous cpu info not available, need to store current info
                if not self.stored_cpu_info:
                    # No info is available
                    self.stored_cpu_info = {}
                # Add info for the current node
                self.stored_cpu_info[self._node_id] = {}
                self.stored_cpu_info[
                    self._node_id]['CPU_LIST'] = self.current_cpu_info
                self.stored_cpu_info[self._node_id]['FAULT_LIST'] = []
                # Update stored cpu info
                to_update = True

        except Exception as e:
            raise Exception(f"Failed while processing cpu info, {e}")

        # Send alerts
        for cpu, alert_type in self.alerts_for.items():
            if self._generate_alert(
                    cpu,
                    alert_type) == True and alert_type == "fault_resolved":
                # Delete from the FAULT_LIST
                self.stored_cpu_info[self._node_id]['FAULT_LIST'].remove(cpu)

        # Update stored cpu info
        if to_update:
            store.put(self.stored_cpu_info, self.CPU_FAULT_SENSOR_DATA)

    def fill_specific_info(self):
        """Fills the specific info to be sent via alert"""
        if not self.specific_info:
            # Create a set of all relevant cpus
            cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
            # Iterate through the set
            for cpu in cpu_list:
                item = {}
                item['resource_id'] = self.RESOURCE_ID + str(cpu)
                # Keep default state online
                item['state'] = "online"
                if cpu in self.alerts_for.keys():
                    if self.alerts_for[cpu] == "fault":
                        item['state'] = "offline"
                self.specific_info.append(item)

    def _create_json_message(self, cpu, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = self.os_utils.get_fqdn()

        # Populate specific info
        self.fill_specific_info()
        alert_specific_info = self.specific_info
        res_id = self.RESOURCE_ID + str(cpu)

        for item in alert_specific_info:
            if item['resource_id'] == res_id:
                if alert_type == "fault":
                    description = "Faulty CPU detected, %s state is %s" % (
                        item['resource_id'], item["state"])
                else:
                    description = "Fault resolved for CPU, %s state is  %s" % (
                        item['resource_id'], item["state"])

        info = {
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID + str(cpu),
            "event_time": epoch_time,
            "description": description
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, cpu, alert_type):
        """Queues the message to NodeData Message Handler"""
        try:
            json_msg = self._create_json_message(cpu, alert_type)
            if json_msg:
                self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)
            return True
        except Exception as e:
            logger.error(f"Exception while sending alert : {e}")
            return False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(CPUFaultSensor, self).shutdown()
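
The core of CPUFaultSensor.run() is a set comparison between the stored CPU list and the current one. A stand-alone sketch of that diff (the helper function below is illustrative, not part of the class):

def diff_cpu_lists(prev_cpus, current_cpus, fault_list):
    """Return {cpu: "fault" | "fault_resolved"}, updating fault_list in place."""
    alerts_for = {}
    for cpu in set(prev_cpus + current_cpus):
        if cpu not in current_cpus and cpu not in fault_list:
            fault_list.append(cpu)              # CPU disappeared: raise fault
            alerts_for[cpu] = "fault"
        elif cpu not in prev_cpus and cpu in fault_list:
            alerts_for[cpu] = "fault_resolved"  # previously faulty CPU is back
    return alerts_for

faults = []
print(diff_cpu_lists([0, 1, 2, 3], [0, 1, 3], faults))  # {2: 'fault'}
print(diff_cpu_lists([0, 1, 3], [0, 1, 2, 3], faults))  # {2: 'fault_resolved'}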
Example #14
class RealStorDiskSensor(SensorThread, InternalMsgQ):
    """Monitors RealStor enclosure disks state and raise sspl events for
       detected faults, insertion,removal events """


    SENSOR_NAME = "RealStorDiskSensor"
    RESOURCE_TYPE = "enclosure:hw:disk"

    PRIORITY = 1

    RSS_DISK_GET_ALL = "all"

    # Mandatory attributes in disk json data
    disk_generic_info = [ "enclosure-id", "enclosure-wwn", "slot", "description",
                          "architecture", "interface", "serial-number", "size",
                          "vendor", "model", "revision", "temperature", "status",
                          "LED-status", "locator-LED", "blink", "smart",
                          "health", "health-reason", "health-recommendation"
                        ]

    # local resource cache
    latest_disks = {}
    memcache_disks = {}
    DISK_IDENTIFIER = "Disk 0."
    NUMERIC_IDENTIFIER = "numeric"
    invalidate_latest_disks_info = False

    # Dependency list
    DEPENDENCIES = {
                    "plugins": ["RealStorEnclMsgHandler"],
                    "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return RealStorDiskSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Disks in storage enclosure can not be monitored."

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
           to function.
        """
        return RealStorDiskSensor.DEPENDENCIES

    def __init__(self):
        super(RealStorDiskSensor, self).__init__(self.SENSOR_NAME,
                                                    self.PRIORITY)
        self.last_alert = None

        self.rssencl = singleton_realstorencl

        # disks persistent cache
        self.disks_prcache = f"{self.rssencl.frus}disks/"

        self.pollfreq_disksensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORDISKSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                        0))

        if self.pollfreq_disksensor == 0:
            self.pollfreq_disksensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = None
        self._event_wait_results = set()
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, products):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RealStorDiskSensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RealStorDiskSensor, self).initialize_msgQ(msgQlist)

        return True

    def read_data(self):
        """Return the last raised alert, none otherwise"""
        return self.last_alert

    def run(self):
        """Run disk monitoring periodically on its own thread."""

        # Do not proceed if module is suspended
        if self._suspended == True:
            self._scheduler.enter(self.pollfreq_disksensor, self._priority, self.run, ())
            return

        # Allow RealStor Encl MC to start services.
        #time.sleep(self.rssencl.REALSTOR_MC_BOOTWAIT)

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # poll all disk status and raise events if
        # insertion/removal detected
        self._rss_check_disks_presence()

        #Do not proceed further if latest disks info can't be validated due to store function error
        if not self.invalidate_latest_disks_info:
            # Polling system status
            self.rssencl.get_system_status()

            # check for disk faults & raise if found
            self._rss_check_disk_faults()
        else:
            logger.warn("Can not validate disk faults or presence due to persistence store error")

        # Reset debug mode if persistence is not enabled
        self._disable_debug_if_persist_false()

        # Fire every configured seconds to poll disks status
        self._scheduler.enter(self.pollfreq_disksensor,
          self._priority, self.run, ())

    def _rss_raise_disk_alert(self, alert_type, disk_info):
        """Raise disk alert with supported alert type"""

        #logger.debug("Raise - alert type {0}, info {1}".format(alert_type,disk_info))
        if not disk_info:
            logger.warn("disk_info None, ignoring")
            return

        if alert_type not in self.rssencl.fru_alerts:
            logger.error(f"Supplied alert type [{alert_type}] not supported")
            return

        # form json with default values
        disk = dict.fromkeys(self.disk_generic_info, "NA")
        disk['slot'] = -1
        disk['blink'] = 0
        disk['enclosure-id'] = 0

        # Build data for must fields in fru disk data
        for item in self.disk_generic_info:
            if item in disk_info:
                disk[item] = disk_info[item]

        encl = self.rssencl.ENCL_FAMILY
        disk[encl] = self.rssencl.LDR_R1_ENCL

        # Build data for platform specific fields in fru disk data
        # get remaining extra key value pairs from passed disk_info
        extended_info = {key:disk_info[key] for key in disk_info if key not in\
                            disk and self.NUMERIC_IDENTIFIER not in key}

        # notify realstor encl msg handler
        self._send_json_msg(alert_type, disk, extended_info)

    def _rss_check_disks_presence(self):
        """Match cached realstor disk info with latest retrieved disks info """

        self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

        if not self.memcache_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn("Last polled drives info in-memory cache "
                    "unavailable , unable to check drive presence change")
                return

        if not self.latest_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn("Latest polled drives info in-memory cache "
                    "unavailable, unable to check drive presence change")
            return

        # keys are disk slot numbers
        removed_disks = set(self.memcache_disks.keys()) - set(self.latest_disks.keys())
        inserted_disks = set(self.latest_disks.keys()) - set(self.memcache_disks.keys())

        # get populated slots in both caches
        populated = set(self.memcache_disks.keys()) & set(self.latest_disks.keys())

        # check for replaced disks
        for slot in populated:
            if self.memcache_disks[slot]['serial-number'] != self.latest_disks[slot]['serial-number']:

                if slot not in removed_disks:
                    removed_disks.add(slot)

                if slot not in inserted_disks:
                    inserted_disks.add(slot)

        # If no difference seen between cached & latest set of disk list,
        # means no disk removal or insertion happened
        if not (removed_disks or inserted_disks):
            #logger.info("Disk presence state _NOT_ changed !!!")
            return

        self._event = Event()
        for slot in removed_disks:
            #get removed drive data from disk cache
            disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"

            path_exists, _ = store.exists(disk_datafile)
            if not path_exists:
                disk_datafile = f"{self.disks_prcache}disk_{slot}.json"

            disk_info = store.get(disk_datafile)

            #raise alert for missing drive
            self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, disk_info)
            # Wait till msg is sent to message bus or added in consul for resending.
            # If timed out, do not update cache
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.delete(disk_datafile)
            self._event.clear()
        self._event = None

        for slot in inserted_disks:
            #get inserted drive data from disk cache
            disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json")

            #raise alert for added drive
            self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info)

            # Update health status for inserted disk in memfault cache,
            # to raise fault alert after insertion if inserted disk status is not OK.
            if disk_info["health"] != "OK":
                for id_fault, cached_fault in enumerate(self.rssencl.memcache_faults):
                    #fetch disk slot from component_id present in memcache_faults.
                    try:
                        component_id = cached_fault["component-id"]
                        if component_id.startswith('Disk 0'):
                            disk_id = int(cached_fault["component-id"].split()[1].split('.')[1])
                            if disk_id == slot:
                                self.rssencl.memcache_faults[id_fault]['health'] = "OK"
                    except Exception as e:
                        logger.error(f"Error in updating health status for \
                        inserted disk in memfault cache {e}")

        # Update cached disk data after comparison
        self.memcache_disks = self.latest_disks
        self.rssencl.memcache_frus.update({"disks":self.memcache_disks})

        return

    def rss_cliapi_poll_disks(self, disk):
        """Retreive realstor disk info using cli api /show/disks"""

        # make ws request
        url = self.rssencl.build_url(
                  self.rssencl.URI_CLIAPI_SHOWDISKS)

        if(disk != self.RSS_DISK_GET_ALL):
           diskId = disk.partition("0.")[2]

           if(diskId.isdigit()):
               url = f"{url}/{disk}"
        url = f"{url}/detail"

        response = self.rssencl.ws_request(
                        url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable as ws request {url} failed")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                                f"to poll disks failed with err {response.status_code}")
            return

        jresponse = None
        try:
            jresponse = json.loads(response.content)
        except ValueError as badjson:
            logger.error(f"{url} returned mal-formed json:\n{badjson}")

        if jresponse:
            api_resp = self.rssencl.get_api_status(jresponse['status'])
            #logger.debug("%s api response:%d" % (url.format(),api_resp))

            if ((api_resp == -1) and
                   (response.status_code == self.rssencl.ws.HTTP_OK)):
                logger.warn("/show/disks api response unavailable, "
                    "marking success as http code is 200")
                api_resp = 0

            if api_resp == 0:
                drives = jresponse['drives']

                # reset latest drive cache to build new
                self.latest_disks = {}
                self.invalidate_latest_disks_info = False

                for drive in drives:
                    slot = drive.get("slot", -1)
                    sn = drive.get("serial-number", "NA")
                    health = drive.get("health", "NA")

                    if slot != -1:
                        self.latest_disks[slot] = {"serial-number":sn, "health":health}

                        #dump drive data to persistent cache
                        dcache_path = f"{self.disks_prcache}disk_{slot}.json"

                        # If drive is replaced, previous drive info needs
                        # to be retained in disk_<slot>.json.prev file and
                        # then only dump new data to disk_<slot>.json
                        path_exists, ret_val = store.exists(dcache_path)
                        if path_exists and ret_val == "Success":
                            prevdrive = store.get(dcache_path)

                            if prevdrive is not None:
                                prevsn = prevdrive.get("serial-number","NA")
                                prevhealth = prevdrive.get("health", "NA")

                                if prevsn != sn or prevhealth != health:
                                    # Rename path
                                    store.put(store.get(dcache_path), dcache_path + ".prev")
                                    store.delete(dcache_path)

                                    store.put(drive, dcache_path)
                        elif not path_exists and ret_val == "Success":
                            store.put(drive, dcache_path)
                        else:
                            # Invalidate latest disks info if persistence store error encountered
                            logger.warn(f"store.exists {dcache_path} return value {ret_val}")
                            self.invalidate_latest_disks_info = True
                            break

                if self.invalidate_latest_disks_info:
                    # Reset latest disks info
                    self.latest_disks = {}

            #If no in-memory cache, build from persistent cache
            if not self.memcache_disks:
                self._rss_build_disk_cache_from_persistent_cache()

            # if no memory cache still
            if not self.memcache_disks:
                self.memcache_disks = self.latest_disks


    def _rss_build_disk_cache_from_persistent_cache(self):
        """Retreive realstor system state info using cli api /show/system"""

        files = store.get_keys_with_prefix(self.disks_prcache)

        if not files:
            logger.debug("No files in Disk cache folder, ignoring")
            return

        for filename in files:
            if filename.startswith('disk_') and filename.endswith('.json'):
                if f"{filename}.prev" in files:
                    filename = f"{filename}.prev"
                drive = store.get(self.disks_prcache + filename)
                slotstr = re.findall(r"disk_(\d+)\.json", filename)[0]

                if not slotstr.isdigit():
                    logger.debug(f"slot {slotstr} not numeric, ignoring")
                    continue

                slot = int(slotstr)

                if drive:
                    sn = drive.get("serial-number","NA")
                    self.memcache_disks[slot] = {"serial-number":sn}

        #logger.debug("Disk cache built from persistent cache {0}".
        #    format(self.memcache_disks))

    def _rss_check_disk_faults(self):
        """Retreive realstor system state info using cli api /show/system"""

        if not self.rssencl.check_system_faults_changed():
            #logger.debug("System faults state _NOT_ changed !!! ")
            return

        try:
            # Extract new system faults
            faults = self.rssencl.latest_faults
            # TODO optimize to avoid nested 'for' loops.
            # Second 'for' loop in check_new_fault()
            self._event = Event()
            if faults:
                for fault in faults:

                    #logger.debug("Faulty component-id {0}, IDENT {1}"\
                    #    .format(fault["component-id"], self.DISK_IDENTIFIER))

                    # Check faulting component type
                    if self.DISK_IDENTIFIER in fault["component-id"]:
                        # If fault on disk, get disk full info including health
                        if self.rssencl.check_new_fault(fault):

                            # Extract slot from "component-id":"Disk 0.39"
                            slot = fault["component-id"].split()[1].split('.')[1]

                            # Send alert only if disks_prcache is updated with the latest disk data
                            if self.latest_disks[int(slot)]["health"] != "OK":
                                #get drive data from disk cache
                                disk_info = store.get(
                                    self.disks_prcache+"disk_{0}.json".format(slot))

                                # raise alert for disk fault
                                self._rss_raise_disk_alert(self.rssencl.FRU_FAULT, disk_info)
                                # Ensure the message is sent to the message bus or added to Consul for resending.
                                self._event_wait_results.add(
                                    self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT))
                                self._event.clear() 

            # Check for resolved faults
            for cached in self.rssencl.memcache_faults:
                if not any(d.get("component-id", None) == cached["component-id"] \
                    for d in self.rssencl.latest_faults) and self.DISK_IDENTIFIER in cached["component-id"]:

                    # Extract slot from "component-id":"Disk 0.39"
                    logger.info(f"Found resolved disk fault for {cached['component-id']}")
                    slot = cached["component-id"].split()[1].split('.')[1]

                    # Send alert only if disks_prcache is updated with the latest disk data
                    if self.latest_disks[int(slot)]["health"] == "OK":
                        # get drive data from disk cache
                        disk_info = store.get(
                            self.disks_prcache+"disk_{0}.json".format(slot))
                        # raise alert for resolved disk fault
                        self._rss_raise_disk_alert(self.rssencl.FRU_FAULT_RESOLVED, disk_info)
                        # Ensure the message is sent to the message bus or added to Consul for resending.
                        self._event_wait_results.add(
                                    self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT))
                        self._event.clear()
            # Update the in-memory fault cache only if all messages were sent to
            # the message bus or added to Consul for resending.
            if self._event_wait_results and all(self._event_wait_results):
                self.rssencl.update_memcache_faults()
            self._event_wait_results.clear()
            self._event = None

        except Exception as e:
            logger.exception(f"Error in _rss_check_disk_faults {e}")

    def _gen_json_msg(self, alert_type, details, ext):
        """ Generate json message"""

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        fru = self.rssencl.is_storage_fru('disk')
        resource_id = ext.get("durable-id")
        host_name = self.os_utils.get_fqdn()

        info = {
                "resource_type": self.RESOURCE_TYPE,
                "fru": fru,
                "resource_id": resource_id,
                "event_time": epoch_time
                }
        specific_info = dict()
        specific_info.update(details)
        specific_info.update(ext)

        for k in specific_info.keys():
            if specific_info[k] == "":
                specific_info[k] = "N/A"


        json_msg = json.dumps(
            {"sensor_request_type" : {
                "enclosure_alert" : {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": specific_info
                },
            }})

        return json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id
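    # Illustrative sketch (the values below are made up): for epoch_time
    # "1614680400" and a uuid4 hex salt "9f1c...", the returned alert_id is the
    # plain string concatenation "16146804009f1c...", which stays unique even
    # when several alerts are raised within the same second.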

    def _send_json_msg(self, alert_type, details, ext):
        """Transmit alert data to RealStorEnclMsgHandler to be processed
        and sent out
        """

        internal_json_msg = self._gen_json_msg(alert_type, details, ext)
        self.last_alert = internal_json_msg
        # Send the event to storage encl message handler to generate json message and send out
        self._write_internal_msgQ(RealStorEnclMsgHandler.name(), internal_json_msg, self._event)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RealStorDiskSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RealStorDiskSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RealStorDiskSensor, self).shutdown()
class RealStorPSUSensor(SensorThread, InternalMsgQ):
    """Monitors PSU data using RealStor API"""


    SENSOR_NAME = "RealStorPSUSensor"
    RESOURCE_CATEGORY = "enclosure:hw:psu"

    PRIORITY = 1

    # PSUs directory name
    PSUS_DIR = "psus"

    # Dependency list
    DEPENDENCIES = {
                    "plugins": ["RealStorEnclMsgHandler"],
                    "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RealStorPSUSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "PSUs in storage enclosure can not be monitored."

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
           to function.
        """
        return RealStorPSUSensor.DEPENDENCIES

    def __init__(self):
        super(RealStorPSUSensor, self).__init__(
            self.SENSOR_NAME, self.PRIORITY)

        self._faulty_psu_file_path = None

        self.rssencl = singleton_realstorencl

        # psus persistent cache
        self.psu_prcache = None

        # Holds PSUs with faults. Used for future reference.
        self._previously_faulty_psus = {}

        self.pollfreq_psusensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORPSUSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                        0))

        if self.pollfreq_psusensor == 0:
            self.pollfreq_psusensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, products):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RealStorPSUSensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RealStorPSUSensor, self).initialize_msgQ(msgQlist)

        self.psu_prcache = os.path.join(self.rssencl.frus, self.PSUS_DIR)

        # Persistence file location. This file stores faulty PSU data
        self._faulty_psu_file_path = os.path.join(
            self.psu_prcache, "psudata.json")
        self._log_debug(
            f"_faulty_psu_file_path: {self._faulty_psu_file_path}")

        # Load faulty PSU data from file if available
        self._previously_faulty_psus = store.get(\
                                           self._faulty_psu_file_path)

        if self._previously_faulty_psus is None:
            self._previously_faulty_psus = {}
            store.put(self._previously_faulty_psus,\
                self._faulty_psu_file_path)

        return True

    def read_data(self):
        """This method is part of interface. Currently it is not
        in use.
        """
        return {}

    def run(self):
        """Run the sensor on its own thread"""
        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(10, self._priority, self.run, ())
            return
        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        psus = self._get_psus()

        if psus:
            self._get_msgs_for_faulty_psus(psus)

        # Reset debug mode if persistence is not enabled
        self._disable_debug_if_persist_false()

        # Schedule the next run at the configured polling frequency to check for faulty PSUs
        self._scheduler.enter(self.pollfreq_psusensor,
                self._priority, self.run, ())

    def _get_psus(self):
        """Receives list of PSUs from API.
           URL: http://<host>/api/show/power-supplies
        """
        url = self.rssencl.build_url(
                  self.rssencl.URI_CLIAPI_SHOWPSUS)

        response = self.rssencl.ws_request(
                        url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: PSUs status unavailable as ws request {url} failed")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                                f"to get power-supplies failed with err {response.status_code}")
            return

        response_data = json.loads(response.text)
        psus = response_data.get("power-supplies")
        return psus
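    # Minimal sketch of the response shape this method relies on; only the
    # "power-supplies" key is taken from the code above, the nested fields shown
    # here are assumptions for illustration:
    #   {"power-supplies": [{"durable-id": "psu_0.0", "health": "OK",
    #                        "health-reason": "", ...}, ...]}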

    def _get_msgs_for_faulty_psus(self, psus, send_message=True):
        """Check the health of PSUs and return a list of messages to be
           sent to the handler, if there are any.
        """
        self._log_debug(
            f"RealStorPSUSensor._get_msgs_for_faulty_psus -> {psus} {send_message}")
        faulty_psu_messages = []
        internal_json_msg = None
        psu_health = None
        durable_id = None
        alert_type = ""
        # Flag to indicate if there is a change in _previously_faulty_psus
        state_changed = False

        if not psus:
            return
        for psu in psus:
            psu_health = psu["health"].lower()
            durable_id = psu["durable-id"]
            psu_health_reason = psu["health-reason"]
            # Check for missing and fault case
            if psu_health == self.rssencl.HEALTH_FAULT:
                self._log_debug("Found fault in PSU {0}".format(durable_id))
                alert_type = self.rssencl.FRU_FAULT
                # Check for removal
                if self._check_if_psu_not_installed(psu_health_reason):
                    alert_type = self.rssencl.FRU_MISSING
                state_changed = not (durable_id in self._previously_faulty_psus and
                        self._previously_faulty_psus[durable_id]["alert_type"] == alert_type)
                if state_changed:
                    self._previously_faulty_psus[durable_id] = {
                        "health": psu_health, "alert_type": alert_type}
                    internal_json_msg = self._create_internal_msg(
                        psu, alert_type)
                    faulty_psu_messages.append(internal_json_msg)
                    # Send message to handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
            # Check for degraded case
            elif psu_health == self.rssencl.HEALTH_DEGRADED:
                self._log_debug("Found degraded in PSU {0}".format(durable_id))
                state_changed = durable_id not in self._previously_faulty_psus
                if state_changed:
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_psus[durable_id] = {
                        "health": psu_health, "alert_type": alert_type}
                    internal_json_msg = self._create_internal_msg(
                        psu, alert_type)
                    faulty_psu_messages.append(internal_json_msg)
                    # Send message to handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
            # Check for healthy case
            elif psu_health == self.rssencl.HEALTH_OK:
                self._log_debug("Found ok in PSU {0}".format(durable_id))
                state_changed = durable_id in self._previously_faulty_psus
                if state_changed:
                    previous_alert_type = \
                        self._previously_faulty_psus[durable_id]["alert_type"]
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(
                        psu, alert_type)
                    faulty_psu_messages.append(internal_json_msg)
                    # Send message to handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
                    del self._previously_faulty_psus[durable_id]
            # Persist faulty PSU list to file only if something is changed
            if state_changed:
                # Wait till msg is sent to message bus or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_psus,\
                        self._faulty_psu_file_path)
                else:
                    self._previously_faulty_psus = store.get(self._faulty_psu_file_path)
                state_changed = False
            alert_type = ""
        return faulty_psu_messages
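    # Summary of the alert mapping implemented above:
    #   health "fault" with "not installed" reason -> FRU_MISSING
    #   health "fault" otherwise                   -> FRU_FAULT
    #   health "degraded" (first occurrence)       -> FRU_FAULT
    #   health "ok" after FRU_MISSING              -> FRU_INSERTION
    #   health "ok" after any other fault          -> FRU_FAULT_RESOLVED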

    def _get_hostname(self):
        try:
            return self.os_utils.get_fqdn()
        except Exception as e:
            logger.exception("Got exception {} when trying to get hostname"
                    " using getfqdn().".format(e))

        logger.info(" Trying with ip addr command")
        try:
            from subprocess import run, PIPE

            IP_CMD = "ip -f inet addr show scope global up | grep inet"
            IP_REGEX = rb'\b(\d{1,3}(?:\.\d{1,3}){3})/\d{1,2}\b'

            ip_out = run(IP_CMD, stdout=PIPE, shell=True, check=True)
            ip_list = re.findall(IP_REGEX, ip_out.stdout)
            if ip_list:
                return ip_list[0].decode()
        except Exception as e:
            logger.exception("Got exception {} when trying to get hostname"
                    " using ip addr command.".format(e))

        # Ultimate fallback, when we are completely out of options
        logger.info("Using localhost")
        return "localhost"

    def _create_internal_msg(self, psu_detail, alert_type):
        """Forms a dictionary containing info about PSUs to send to
           message handler.
        """
        self._log_debug(
            f"RealStorPSUSensor._create_internal_msg -> {psu_detail} {alert_type}")
        if not psu_detail:
            return {}

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        fru = self.rssencl.is_storage_fru('POWER_SUPPLY')
        resource_id = psu_detail.get("durable-id")
        host_name = self._get_hostname()

        info = {
                "resource_type": self.RESOURCE_CATEGORY,
                "fru": fru,
                "resource_id": resource_id,
                "event_time": epoch_time
                }

        specific_info = {
            "enclosure-id": psu_detail.get("enclosure-id"),
            "serial-number":  psu_detail.get("serial-number"),
            "description":  psu_detail.get("description"),
            "revision":  psu_detail.get("revision"),
            "model":  psu_detail.get("model"),
            "vendor":  psu_detail.get("vendor"),
            "location":  psu_detail.get("location"),
            "part-number":  psu_detail.get("part-number"),
            "fru-shortname":  psu_detail.get("fru-shortname"),
            "mfg-date":  psu_detail.get("mfg-date"),
            "mfg-vendor-id":  psu_detail.get("mfg-vendor-id"),
            "dc12v":  psu_detail.get("dc12v"),
            "dc5v":  psu_detail.get("dc12v"),
            "dc33v":  psu_detail.get("dc33v"),
            "dc12i":  psu_detail.get("dc12i"),
            "dc5i":  psu_detail.get("dc5i"),
            "dctemp":  psu_detail.get("dctemp"),
            "health":  psu_detail.get("health"),
            "health-reason":  psu_detail.get("health-reason"),
            "health-recommendation":  psu_detail.get("health-recommendation"),
            "status":  psu_detail.get("status"),
            "durable-id":  psu_detail.get("durable-id"),
            "position":  psu_detail.get("position"),
        }

        for k in specific_info.keys():
            if specific_info[k] == "":
                specific_info[k] = "N/A"

        # Creates internal json message request structure.
        # this message will be passed to the StorageEnclHandler
        internal_json_msg = json.dumps(
            {"sensor_request_type": {
                "enclosure_alert": {
                        "status": "update",
                        "host_id": host_name,
                        "alert_type": alert_type,
                        "severity": severity,
                        "alert_id": alert_id,
                        "info": info,
                        "specific_info": specific_info
                }
            }})

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _send_json_msg(self, json_msg):
        """Sends JSON message to Handler"""
        self._log_debug(
            "RealStorPSUSensor._send_json_msg -> {0}".format(json_msg))
        if not json_msg:
            return
        self._event.clear()
        self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event)

    def _check_if_psu_not_installed(self, health_reason):
        """Checks if PSU is not installed by checking <not installed>
            line in health-reason key. It uses re.findall method to
            check if desired string exists in health-reason. Returns
            boolean based on length of the list of substrings found
            in health-reason. So if length is 0, it returns False,
            else True.
        """
        return bool(re.findall("not installed", health_reason))
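    # Illustrative examples (the health-reason strings are assumptions, not taken
    # from the controller documentation):
    #   _check_if_psu_not_installed("PSU is not installed.")        -> True
    #   _check_if_psu_not_installed("Output voltage out of range.") -> False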

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RealStorPSUSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RealStorPSUSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RealStorPSUSensor, self).shutdown()
class RealStorEnclosureSensor(SensorThread, InternalMsgQ):
    """Monitors Enclosure"""

    # Dependency list
    DEPENDENCIES = {"plugins": ["RealStorEnclMsgHandler"], "rpms": []}

    SENSOR_NAME = "RealStorEnclosureSensor"
    SENSOR_RESP_TYPE = "enclosure_alert"
    RESOURCE_CATEGORY = "hw"
    RESOURCE_TYPE = "enclosure"

    ENCL_FAULT_RESOLVED_EVENTS = ["The network-port Ethernet link is down for controller A",\
                            "The network-port Ethernet link is down for controller B",\
                            "The Management Controller IP address changed",\
                            "The Management Controller booted up.",\
                            "Both controllers have shut down; no restart",\
                            "Storage Controller booted up (cold boot - power up).",\
                            "Management Controller configuration parameters were set"]

    PRIORITY = 1

    alert_type = None
    previous_alert_type = None
    fault_alert = False

    encl_status = None

    system_status = None

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RealStorEnclosureSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Storage enclosure can not be monitored."

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
        to function.
        """
        return RealStorEnclosureSensor.DEPENDENCIES

    def __init__(self):
        super(RealStorEnclosureSensor, self).__init__(self.SENSOR_NAME,
                                                      self.PRIORITY)

        self.rssencl = singleton_realstorencl

        # Flag to indicate suspension of module
        self._suspended = False
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, products):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RealStorEnclosureSensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RealStorEnclosureSensor, self).initialize_msgQ(msgQlist)

        self.ENCL_SENSOR_DATA_PATH = os.path.join(self.rssencl.encl_cache,
                                                  'enclosure_data.json')
        # Get the stored previous alert info
        self.persistent_encl_data = store.get(self.ENCL_SENSOR_DATA_PATH)
        if self.persistent_encl_data:
            if self.persistent_encl_data['fault_alert'].lower() == "true":
                self.fault_alert = True
            else:
                self.fault_alert = False
            self.previous_alert_type = self.persistent_encl_data[
                'previous_alert_type']
        else:
            self.persistent_encl_data = {
                'fault_alert': str(self.fault_alert),
                'previous_alert_type': str(self.previous_alert_type),
            }
            store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)

        return True

    def read_data(self):
        """This method is part of interface. Currently it is not
        in use.
        """
        return {}

    def run(self):
        """Run the sensor on its own thread"""
        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(10, self._priority, self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # Timeout counter for controller login failed and ws request failed
        mc_timeout_counter = self.rssencl.mc_timeout_counter
        # mc_timeout_counter==0, fault_alert==True & prev_alert_type!=FAULT_RESOLVED
        # all can be met True with a sspl restart & persistent cache, so ws_response
        # status finally decides whether to send FAULT_RESOLVED alert or not.
        ws_response_status = self.rssencl.ws_response_status

        if mc_timeout_counter > 10 and self.fault_alert is False:
            self.alert_type = self.rssencl.FRU_FAULT
            self.encl_status = "Storage Enclosure unreachable,"+\
                                "Possible causes : Enclosure / Storage Controller /"+\
                                "Management Controller rebooting,"+\
                                "Network port blocked by firewall,"+\
                                "Network outage or Power outage."

            self.fault_alert = True

        elif mc_timeout_counter == 0 and  ws_response_status == self.rssencl.ws.HTTP_OK \
            and self.previous_alert_type != self.rssencl.FRU_FAULT_RESOLVED \
            and self.fault_alert == True:

            # Check system status
            self.system_status = self.check_system_status()

            if self.system_status is not None:
                self.alert_type = self.rssencl.FRU_FAULT_RESOLVED
                enclosure_status = self.system_status[0:5]

                for status in enclosure_status:
                    if status["severity"] == "INFORMATIONAL":
                        msg = status["message"]
                        for event in self.ENCL_FAULT_RESOLVED_EVENTS:
                            if event in msg:
                                self.encl_status = event
                                break

                self.fault_alert = False

        if self.alert_type is not None:
            self.send_json_msg(self.alert_type, self.encl_status)
            self.alert_type = None

        self._scheduler.enter(30, self._priority, self.run, ())
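    # Decision summary derived from the checks above:
    #   mc_timeout_counter > 10 with no outstanding fault            -> FRU_FAULT
    #       (storage enclosure unreachable)
    #   mc_timeout_counter == 0, last ws response HTTP_OK and an
    #   outstanding fault not yet resolved                           -> FRU_FAULT_RESOLVED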

    def check_system_status(self):
        """Returns system staus using API /show/events"""

        url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWEVENTS)
        # apply filter to fetch last 20 events
        url = url + " last 20"

        response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn("System status unavailable as ws request failed")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                raise Exception(
                    f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                    f"failed with http err {response.status_code}")
            return

        response_data = json.loads(response.text)
        enclosure_status = response_data["events"]

        return enclosure_status

    def send_json_msg(self, alert_type, encl_status):
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))
        alert_id = self._get_alert_id(epoch_time)
        fru = self.rssencl.is_storage_fru('enclosure')
        resource_id = "0"
        host_name = self.os_utils.get_fqdn()

        info = {
            "resource_type": self.RESOURCE_TYPE,
            "fru": fru,
            "resource_id": resource_id,
            "event_time": epoch_time,
            "description": encl_status
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "enclosure_alert": {
                    "host_id": host_name,
                    "severity": severity,
                    "alert_id": alert_id,
                    "alert_type": alert_type,
                    "status": "update",
                    "info": info,
                    "specific_info": {
                        "event": encl_status
                    }
                }
            }
        })

        self.previous_alert_type = alert_type
        self._write_internal_msgQ(RealStorEnclMsgHandler.name(),
                                  internal_json_msg)
        self.persistent_encl_data = {
            'fault_alert': str(self.fault_alert),
            'previous_alert_type': str(self.previous_alert_type),
        }
        store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
            epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def suspend(self):
        """Suspend the module thread. It should be non-blocking"""
        super(RealStorEnclosureSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RealStorEnclosureSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RealStorEnclosureSensor, self).shutdown()
Exemplo n.º 17
0
class RAIDsensor(SensorThread, InternalMsgQ):

    SENSOR_NAME = "RAIDsensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:raid_data"

    # Section and keys in configuration file
    RAIDSENSOR = SENSOR_NAME.upper()
    RAID_STATUS_FILE = 'RAID_status_file'

    RAID_CONF_FILE = '/etc/mdadm.conf'
    RAID_DOWN_DRIVE_STATUS = [{
        "status": "Down/Missing"
    }, {
        "status": "Down/Missing"
    }]

    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"

    prev_alert_type = {}
    alert_type = None

    # alerts
    FAULT_RESOLVED = "fault_resolved"
    FAULT = "fault"
    MISSING = "missing"
    INSERTION = "insertion"

    CACHE_DIR_NAME = "server"

    # Dependency list
    DEPENDENCIES = {
        "init": ["DiskMonitor"],
    }

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RAIDsensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Server RAID disks can not be monitored."

    def __init__(self):
        super(RAIDsensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
        # Current RAID status information
        self._RAID_status = None

        # Delay (in seconds) before the sensor thread starts processing
        self._start_delay = 10

        # Flag to indicate suspension of module
        self._suspended = False
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RAIDsensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RAIDsensor, self).initialize_msgQ(msgQlist)

        self._RAID_status_file = self._get_RAID_status_file()
        logger.info(f"Monitoring RAID status file: {self._RAID_status_file}")

        # The status file contents
        self._RAID_status_contents = "N/A"

        # The mdX status line in the status file
        self._RAID_status = {}

        self._faulty_drive_list = {}

        self._faulty_device_list = set()

        self._drives = {}

        self._total_drives = {}

        self._devices = []

        self._missing_drv = {}

        self._prev_drive_dict = {}

        self.prev_alert_type = {}
        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')

        # Allow systemd to process all the drives so we can map device name to serial numbers
        #time.sleep(120)

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.RAID_SENSOR_DATA_PATH = os.path.join(
            cache_dir_path, f'RAID_SENSOR_DATA_{self._node_id}')
        # Get the stored previous alert info
        self.persistent_raid_data = {}
        if os.path.isfile(self.RAID_SENSOR_DATA_PATH):
            self.persistent_raid_data = store.get(self.RAID_SENSOR_DATA_PATH)
        if self.persistent_raid_data:
            self._RAID_status_contents = self.persistent_raid_data[
                '_RAID_status_contents']
            self._RAID_status = self.persistent_raid_data['_RAID_status']
            self._faulty_drive_list = self.persistent_raid_data[
                '_faulty_drive_list']
            self._faulty_device_list = self.persistent_raid_data[
                '_faulty_device_list']
            self._drives = self.persistent_raid_data['_drives']
            self._total_drives = self.persistent_raid_data['_total_drives']
            self._devices = self.persistent_raid_data['_devices']
            self._missing_drv = self.persistent_raid_data['_missing_drv']
            self._prev_drive_dict = self.persistent_raid_data[
                '_prev_drive_dict']
            self.prev_alert_type = self.persistent_raid_data['prev_alert_type']
        else:
            self.persistent_raid_data = {
                '_RAID_status_contents': self._RAID_status_contents,
                '_RAID_status': self._RAID_status,
                '_faulty_drive_list': self._faulty_drive_list,
                '_faulty_device_list': self._faulty_device_list,
                '_drives': self._drives,
                '_total_drives': self._total_drives,
                '_devices': self._devices,
                '_missing_drv': self._missing_drv,
                '_prev_drive_dict': self._prev_drive_dict,
                'prev_alert_type': self.prev_alert_type,
            }
            store.put(self.persistent_raid_data, self.RAID_SENSOR_DATA_PATH)

        return True

    def read_data(self):
        """Return the Current RAID status information"""
        return self._RAID_status

    def run(self):
        """Run the sensor on its own thread"""

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(30, self._priority, self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # self._set_debug(True)
        # self._set_debug_persist(True)

        # Check for a change in status file and notify the node data msg handler
        self._notify_NodeDataMsgHandler()

        # Reset debug mode if persistence is not enabled
        self._disable_debug_if_persist_false()

        # Fire every 30 seconds to see if there's a change in RAID status file
        self._scheduler.enter(30, self._priority, self.run, ())

    def _notify_NodeDataMsgHandler(self):
        """See if the status files changed and notify node data message handler
            for generating JSON message"""
        self._drive_state_changed = False
        # resource_id for drive alerts
        resource_id = None
        if not os.path.isfile(self._RAID_status_file):
            logger.warn(
                f"status_file: {self._RAID_status_file} does not exist, ignoring."
            )
            return

        # Read in status and see if it has changed
        with open(self._RAID_status_file, "r") as datafile:
            status = datafile.read()

        # Do nothing if the RAID status file has not changed
        if self._RAID_status_contents == status:
            self._log_debug(
                f"_notify_NodeDataMsgHandler status unchanged, ignoring: {status}"
            )
            return

        # Update the RAID status contents of file
        self._RAID_status_contents = status

        # Process mdstat file and send json msg to NodeDataMsgHandler
        md_device_list, drive_dict, drive_status_changed = self._process_mdstat(
        )

        # checks mdadm conf file for missing raid array and send json message to NodeDataMsgHandler
        self._process_missing_md_devices(md_device_list, drive_dict)

        for device in md_device_list:
            if drive_dict:
                if len(drive_dict[device]) < self._total_drives[device] and \
                    device in self.prev_alert_type and self.prev_alert_type[device] != self.MISSING:
                    self.alert_type = self.MISSING
                    if device in self._prev_drive_dict:
                        missing_drive = set(
                            self._prev_drive_dict[device]).difference(
                                set(drive_dict[device]))
                        try:
                            missing_drive = "/dev/" + list(missing_drive)[0]
                        except IndexError:
                            missing_drive = "NA"
                    else:
                        missing_drive = "NA"
                    resource_id = device + ":" + missing_drive
                    self._missing_drv = {
                        "path": missing_drive,
                        "serialNumber": "None"
                    }
                    self._map_drive_status(device, drive_dict, "Missing")
                    self._drive_state_changed = True

                elif len(drive_dict[device]) >= self._total_drives[device] and \
                    device in self.prev_alert_type and self.prev_alert_type[device] == self.MISSING:
                    self.alert_type = self.INSERTION
                    resource_id = device + ":/dev/" + drive_dict[device][0]
                    self._map_drive_status(device, drive_dict[device][0],
                                           "Down/Recovery")
                    self._drive_state_changed = True

                if self.alert_type is not None and self._drive_state_changed:
                    self._prev_drive_dict[device] = drive_dict[device]
                    self._send_json_msg(self.alert_type, resource_id, device,
                                        self._drives[device])

                if drive_status_changed[device]:
                    for drive in self._drives[device]:
                        if drive.get("identity") is not None:
                            drive_path = drive.get("identity").get("path")
                            drive_name = drive_path[5:]
                            resource_id = device + ":/dev/" + drive_name
                            drive_status = drive.get("status")
                            if drive_status not in ["U", "UP"] and device in self._faulty_drive_list and \
                                drive_name not in self._faulty_drive_list[device] and \
                                self.prev_alert_type[device] != self.MISSING:
                                self.alert_type = self.FAULT
                                self._map_drive_status(device, drive_name,
                                                       "Down")
                                self._drive_state_changed = True
                                self._faulty_drive_list[device][
                                    drive_name] = self.alert_type

                            elif drive_status in ["U", "UP", "Down/Recovery"] and device in self._faulty_drive_list and \
                                drive_name in self._faulty_drive_list[device]:
                                self.alert_type = self.FAULT_RESOLVED
                                self._map_drive_status(device, drive_name,
                                                       "UP")
                                self._drive_state_changed = True
                                del self._faulty_drive_list[device][drive_name]

                            if self.alert_type is not None and self._drive_state_changed:
                                self._prev_drive_dict[device] = drive_dict[
                                    device]
                                self._send_json_msg(self.alert_type,
                                                    resource_id, device,
                                                    self._drives[device])

    def _process_mdstat(self):
        """Parse out status' and path info for each drive"""
        # Replace new line chars with spaces
        mdstat = self._RAID_status_contents.strip().split("\n")
        md_device_list = []
        drive_dict = {}
        monitored_device = mdstat
        drive_status_changed = {}
        self._devices.clear()
        # Array of optional identity json sections for drives in array
        self._identity = {}

        # Read in each line looking for a 'mdXXX' value
        md_line_parsed = False

        for line in monitored_device:
            # The line following the mdXXX : ... contains the [UU] status that we need
            if md_line_parsed:
                # Format is [x/y][UUUU____...]
                drive_status_changed[self._device] = self._parse_raid_status(
                    line, self._device)
                # Reset in case there are multiple configs in the file
                md_line_parsed = False

            # Break the line apart into separate fields
            fields = line.split(" ")

            # Parse out statuses and path info for each drive
            if "md" in fields[0]:
                self._device = f"/dev/{fields[0]}"
                self._devices.append(self._device)
                self._log_debug(f"md device found: {self._device}")
                md_device_list.append(self._device)
                drive_dict[self._device] = []
                if self._device not in self.prev_alert_type:
                    self.prev_alert_type[self._device] = None
                if self._device not in self._faulty_drive_list:
                    self._faulty_drive_list[self._device] = {}

                # Parse out raid drive paths if they're present
                self._identity[self._device] = {}
                for field in fields:
                    if "[" in field:
                        if field not in drive_dict[self._device]:
                            index = field.find("[")
                            drive_name = field[:index]
                            drive_dict[self._device].append(drive_name)
                        self._add_drive(field, self._device)
                md_line_parsed = True

        return md_device_list, drive_dict, drive_status_changed
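    # A minimal sketch of the /proc/mdstat excerpt this parser expects (device and
    # drive names below are made up for illustration):
    #   md0 : active raid1 sdb[1] sda[0]
    #         1046528 blocks super 1.2 [2/2] [UU]
    # would yield md_device_list == ["/dev/md0"],
    # drive_dict == {"/dev/md0": ["sdb", "sda"]} and
    # drive_status_changed["/dev/md0"] as returned by _parse_raid_status().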

    def _add_drive(self, field, device):
        """Adds a drive to the list"""
        first_bracket_index = field.find('[')

        # Parse out the drive path
        drive_path = f"/dev/{field[: first_bracket_index]}"

        # Parse out the drive index into [UU] status which is Device Role field
        detail_command = f"/usr/sbin/mdadm --examine {drive_path} | grep 'Device Role'"
        response, error = self._run_command(detail_command)

        if error:
            self._log_debug(
                f"_add_drive, Error retrieving drive index into status, example: [U_]: {str(error)}"
            )
        try:
            drive_index = int(response.split(" ")[-1])
        except Exception as ae:
            self._log_debug(f"_add_drive, get drive_index error: {str(ae)}")
            return
        self._log_debug(
            f"_add_drive, drive index: {drive_index}, path: {drive_path}")

        # Create the json msg, serial number will be filled in by NodeDataMsgHandler
        identity_data = {"path": drive_path, "serialNumber": "None"}
        self._identity[device][drive_index] = identity_data

    def _parse_raid_status(self, status_line, device):
        """Parses the status of each drive denoted by U & _
            for drive being Up or Down in raid
        """
        # Parse out x from the [x/y] field for the total number of drives
        first_bracket_index = status_line.find('[')

        # If no '[' found, return
        if first_bracket_index == -1:
            return False

        self._total_drives[device] = int(
            status_line[first_bracket_index + 1:status_line.find('/', first_bracket_index)])
        self._log_debug("_parse_raid_status, total_drives: %d" %
                        self._total_drives[device])

        # Break the line apart into separate fields
        fields = status_line.split(" ")

        # The last field is the list of U & _
        status = fields[-1]
        self._log_debug("_parse_raid_status, status: %s, total drives: %d" %
                        (status, self._total_drives[device]))

        # Array of raid drives in json format based on schema
        self._drives[device] = []

        drive_index = 0
        while drive_index < self._total_drives[device]:
            # Create the json msg and append it to the list
            if self._identity.get(device).get(drive_index) is not None:
                path = self._identity.get(device).get(drive_index).get("path")
                drive_status_msg = {
                    "status": status[drive_index + 1],  # Move past '['
                    "identity": {
                        "path": path,
                        "serialNumber": "None"
                    }
                }
            else:
                drive_status_msg = {
                    "status": status[drive_index + 1]
                }  # Move past '['

            self._log_debug(f"_parse_raid_status, drive_index: {drive_index}")
            self._log_debug(
                f"_parse_raid_status, drive_status_msg: {drive_status_msg}")
            self._drives[device].append(drive_status_msg)

            drive_index = drive_index + 1

        # See if the status line has changed, if not there's nothing to do
        if device in self._RAID_status and self._RAID_status[device] == status:
            self._log_debug(f"RAID status has not changed, ignoring: {status}")
            return False
        else:
            self._log_debug(
                f"RAID status has changed, old: {self._RAID_status}, new: {status}"
            )
            self._RAID_status[device] = status

        return True
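    # For example, a status line ending in "[2/2] [UU]" gives
    # _total_drives[device] == 2 with both drives reported as "U" (later mapped to
    # "UP" by _map_drive_status), while "[2/1] [U_]" reports the second drive as
    # down ("_").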

    def _process_missing_md_devices(self, md_device_list, drive_dict):
        """ checks the md raid configuration file, compares all it's
            entries with list of arrays from mdstat file and sends
            missing entry
        """

        if not os.path.isfile(self.RAID_CONF_FILE):
            logger.warn(
                f"_process_missing_md_devices, MDRaid configuration file {self.RAID_CONF_FILE} is missing"
            )
            return

        conf_device_list = []
        with open(self.RAID_CONF_FILE, 'r') as raid_conf_file:
            raid_conf_data = raid_conf_file.read().strip().split("\n")
        for line in raid_conf_data:
            try:
                raid_conf_field = line.split(" ")
                if "#" not in raid_conf_field[0] and "ARRAY" in raid_conf_field[0] and \
                    "/md" in raid_conf_field[1]:
                    # Map the device name: /dev/md/1 and /dev/md1 refer to the same device.
                    map_device = raid_conf_field[1].split('md/')
                    if len(map_device) > 1:
                        conf_device_list.append(map_device[0] + 'md' +
                                                map_device[1])
                    else:
                        conf_device_list.append(raid_conf_field[1])
            except Exception as ae:
                self._log_debug(
                    "_process_missing_md_devices, error retrieving raid entry "
                    f"from {self.RAID_CONF_FILE} file: {str(ae)}")
                return

        # compare conf file raid array list with mdstat raid array list
        for device in conf_device_list:
            if device not in md_device_list and device not in self._faulty_device_list:
                # add the missing raid array entry to the faulty device list
                self.alert_type = self.FAULT
                self._faulty_device_list.add(device)
                self._send_json_msg(self.alert_type, device, device,
                                    self.RAID_DOWN_DRIVE_STATUS)

            elif device in md_device_list and device in self._faulty_device_list:
                # raid array is present again; remove it from the faulty device list
                self.alert_type = self.FAULT_RESOLVED
                self._map_drive_status(device, drive_dict, "Down/Recovery")
                self._faulty_device_list.remove(device)
                self._send_json_msg(self.alert_type, device, device,
                                    self._drives[device])
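    # Illustrative mdadm.conf entry (device names assumed) and how it is normalized
    # above: "ARRAY /dev/md/1 metadata=1.2 ..." becomes "/dev/md1", so that it can
    # be compared against the device names parsed from mdstat.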

    def _map_drive_status(self, device, drives, drv_status):
        for drv in self._drives[device]:
            if isinstance(drives, str):
                if drv["status"] not in [
                        "U", "UP"
                ] and drv["identity"]["path"] == '/dev/' + drives:
                    drv["status"] = drv_status
            else:
                for drive in drives[device]:
                    # Drive info is not available in missing case.
                    if drv_status == "Missing" and drv["status"] == "_":
                        drv["status"] = drv_status
                        drv["identity"] = self._missing_drv
                    elif drv["status"] not in [
                            "U", "UP"
                    ] and drv["identity"]["path"] == '/dev/' + drive:
                        drv["status"] = drv_status

            if drv["status"] == "U":
                drv["status"] = "UP"

    def _send_json_msg(self, alert_type, resource_id, device, drives):
        """Transmit data to NodeDataMsgHandler to be processed and sent out"""

        epoch_time = str(int(time.time()))
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        self._alert_id = self._get_alert_id(epoch_time)
        host_name = self.os_utils.get_fqdn()

        if alert_type == self.MISSING:
            description = "RAID array or drive from RAID array is missing."
        elif alert_type == self.FAULT:
            description = "RAID array or drive from RAID array is faulty."
        elif alert_type == self.INSERTION:
            description = "Inserted drive in RAID array."
        elif alert_type == self.FAULT_RESOLVED:
            description = "Fault for RAID array or RAID drive is resolved"
        else:
            description = "Raid array alert"

        info = {
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": resource_id,
            "event_time": epoch_time,
            "description": description
        }
        specific_info = {"device": device, "drives": drives}

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "sensor_type": "node:os:raid_data",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "alert_id": self._alert_id,
                    "severity": severity,
                    "info": info,
                    "specific_info": specific_info
                }
            }
        })
        self.prev_alert_type[device] = alert_type
        self.alert_type = None

        # Send the event to node data message handler to generate json message and send out
        self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg)
        # Save the state to Persistent Cache.
        self.persistent_raid_data = {
            '_RAID_status_contents': self._RAID_status_contents,
            '_RAID_status': self._RAID_status,
            '_faulty_drive_list': self._faulty_drive_list,
            '_faulty_device_list': self._faulty_device_list,
            '_drives': self._drives,
            '_total_drives': self._total_drives,
            '_devices': self._devices,
            '_missing_drv': self._missing_drv,
            '_prev_drive_dict': self._prev_drive_dict,
            'prev_alert_type': self.prev_alert_type,
        }
        store.put(self.persistent_raid_data, self.RAID_SENSOR_DATA_PATH)

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
        epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RAIDsensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RAIDsensor, self).resume()
        self._suspended = False

    def _run_command(self, command):
        """Run the command and get the response and error returned"""
        self._log_debug(f"_run_command: {command}")
        process = subprocess.Popen(command,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        response, error = process.communicate()

        if response:
            self._log_debug(f"_run_command, response: {str(response)}")
        if error:
            self._log_debug(f"_run_command: error: {str(error)}")

        return response.decode().rstrip('\n'), error.decode().rstrip('\n')
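    # Usage sketch (the command shown is illustrative):
    #   response, error = self._run_command("/usr/sbin/mdadm --detail /dev/md0")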

    def _get_RAID_status_file(self):
        """Retrieves the file containing the RAID status information"""
        return Conf.get(SSPL_CONF,
                        f"{self.RAIDSENSOR}>{self.RAID_STATUS_FILE}",
                        '/proc/mdstat')

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RAIDsensor, self).shutdown()
class RealStorLogicalVolumeSensor(SensorThread, InternalMsgQ):
    """Monitors Logical Volume data using RealStor API"""


    SENSOR_NAME = "RealStorLogicalVolumeSensor"
    SENSOR_RESP_TYPE = "enclosure_logical_volume_alert"
    RESOURCE_CATEGORY = "cortx"
    RESOURCE_TYPE_LVOL = "enclosure:cortx:logical_volume"
    RESOURCE_TYPE_DG = "enclosure:cortx:disk_group"

    PRIORITY = 1

    # Dependency list
    DEPENDENCIES = {
                    "plugins": ["RealStorEnclMsgHandler"],
                    "rpms": []
    }

    disk_groups_generic = ["object-name", "name", "size", "freespace", "storage-type", "pool",
         "pool-serial-number", "pool-percentage", "owner", "raidtype", "status", "create-date",
         "disk-description", "serial-number", "pool-sector-format", "health", "health-reason",
         "health-recommendation"]

    disk_groups_extended = ['blocksize', 'size-numeric', 'freespace-numeric', 'raw-size',
        'raw-size-numeric', 'storage-type-numeric', 'storage-tier', 'storage-tier-numeric',
        'total-pages', 'allocated-pages', 'available-pages', 'performance-rank', 'owner-numeric',
        'preferred-owner', 'preferred-owner-numeric', 'raidtype-numeric', 'diskcount', 'sparecount',
        'chunksize', 'status-numeric', 'lun', 'min-drive-size', 'min-drive-size-numeric',
        'create-date-numeric', 'cache-read-ahead', 'cache-read-ahead-numeric', 'cache-flush-period',
        'read-ahead-enabled', 'read-ahead-enabled-numeric', 'write-back-enabled',
        'write-back-enabled-numeric', 'job-running', 'current-job', 'current-job-numeric',
        'current-job-completion', 'num-array-partitions', 'largest-free-partition-space',
        'largest-free-partition-space-numeric', 'num-drives-per-low-level-array',
        'num-expansion-partitions', 'num-partition-segments', 'new-partition-lba',
        'new-partition-lba-numeric', 'array-drive-type', 'array-drive-type-numeric',
        'disk-description-numeric', 'is-job-auto-abortable', 'is-job-auto-abortable-numeric',
        'blocks', 'disk-dsd-enable-vdisk', 'disk-dsd-enable-vdisk-numeric', 'disk-dsd-delay-vdisk',
        'scrub-duration-goal', 'adapt-target-spare-capacity', 'adapt-target-spare-capacity-numeric',
        'adapt-actual-spare-capacity', 'adapt-actual-spare-capacity-numeric', 'adapt-critical-capacity',
        'adapt-critical-capacity-numeric', 'adapt-degraded-capacity', 'adapt-degraded-capacity-numeric',
        'adapt-linear-volume-boundary', 'pool-sector-format-numeric', 'health-numeric']

    volumes_generic = ["volume-description", "blocks", "health", "size", "volume-name", "wwn",
         "storage-pool-name", "total-size", "volume-class", "allocated-size", "owner", "object-name",
         "raidtype", "health-reason", "progress", "blocksize", "serial-number", "virtual-disk-serial",
         "write-policy", "volume-type", "health-recommendation", "virtual-disk-name", "storage-type",
         "capabilities"]

    volumes_extended = ["cache-optimization", "container-serial", "cs-primary", "replication-set",
         "attributes", "preferred-owner", "volume-parent", "allowed-storage-tiers", "cs-copy-dest",
         "cs-copy-src", "container-name", "group-key", "snapshot-retention-priority", "pi-format",
         "reserved-size-in-pages", "cs-secondary", "volume-group", "health-numeric",
         "large-virtual-extents", "cs-replication-role", "durable-id", "threshold-percent-of-pool",
         "tier-affinity", "volume-qualifier", "snapshot", "snap-pool", "read-ahead-size",
         "zero-init-page-on-allocation", "allocate-reserved-pages-first"]

    # Logical Volumes directory name
    LOGICAL_VOLUMES_DIR = "logical_volumes"
    # Disk Groups directory name
    DISK_GROUPS_DIR = "disk_groups"

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RealStorLogicalVolumeSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Disk groups and logical volumes of storage enclosure can not be monitored."

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
           to function.
        """
        return RealStorLogicalVolumeSensor.DEPENDENCIES

    def __init__(self):
        super(RealStorLogicalVolumeSensor, self).__init__(
            self.SENSOR_NAME, self.PRIORITY)

        self._faulty_disk_group_file_path = None
        self._faulty_logical_volume_file_path = None

        self.rssencl = singleton_realstorencl

        # logical volumes persistent cache
        self._logical_volume_prcache = None
        # disk groups persistent cache
        self._disk_group_prcache = None

        # Holds Disk Groups with faults. Used for future reference.
        self._previously_faulty_disk_groups = {}
        # Holds Logical Volumes with faults. Used for future reference.
        self._previously_faulty_logical_volumes = {}

        self.pollfreq_DG_logical_volume_sensor = \
            int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORLOGICALVOLUMESENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                            10))

        if self.pollfreq_DG_logical_volume_sensor == 0:
            self.pollfreq_DG_logical_volume_sensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()
        cvg_info = Conf.get(GLOBAL_CONF, CVG_INFO_KEY)
        self.cvg_info_dict = {}
        if cvg_info:
            self.cvg_info_dict = {cvg['name']: idx for idx, cvg in \
                enumerate(cvg_info) if 'name' in cvg}

    def initialize(self, conf_reader, msgQlist, products):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RealStorLogicalVolumeSensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RealStorLogicalVolumeSensor, self).initialize_msgQ(msgQlist)

        self._logical_volume_prcache = os.path.join(self.rssencl.frus,\
             self.LOGICAL_VOLUMES_DIR)
        self._disk_group_prcache = os.path.join(self.rssencl.frus,\
             self.DISK_GROUPS_DIR)

        # Persistence file location. This file stores faulty Logical Volume data
        self._faulty_logical_volume_file_path = os.path.join(
            self._logical_volume_prcache, "logical_volume_data.json")
        # Persistence file location. This file stores faulty Disk Group data
        self._faulty_disk_group_file_path = os.path.join(
            self._disk_group_prcache, "disk_group_data.json")

        # Load faulty Logical Volume data from file if available
        self._previously_faulty_logical_volumes = store.get(\
                                                  self._faulty_logical_volume_file_path)
        # Load faulty Disk Group data from file if available
        self._previously_faulty_disk_groups = store.get(\
                                                  self._faulty_disk_group_file_path)

        if self._previously_faulty_logical_volumes is None:
            self._previously_faulty_logical_volumes = {}
            store.put(self._previously_faulty_logical_volumes,\
                self._faulty_logical_volume_file_path)

        if self._previously_faulty_disk_groups is None:
            self._previously_faulty_disk_groups = {}
            store.put(self._previously_faulty_disk_groups,\
                self._faulty_disk_group_file_path)

        return True

    def read_data(self):
        """This method is part of interface. Currently it is not
        in use.
        """
        return {}

    def run(self):
        """Run the sensor on its own thread"""

        # Do not proceed if module is suspended
        if self._suspended == True:
            self._scheduler.enter(10, self._priority, self.run, ())
            return
        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        disk_groups = None
        logical_volumes = None

        disk_groups = self._get_disk_groups()

        if disk_groups:
            self._get_msgs_for_faulty_disk_groups(disk_groups)
            for disk_group in disk_groups:
                pool_serial_number = disk_group["pool-serial-number"]
                logical_volumes = self._get_logical_volumes(pool_serial_number)
                if logical_volumes:
                    self._get_msgs_for_faulty_logical_volumes(logical_volumes, disk_group)

        # Reset debug mode if persistence is not enabled
        self._disable_debug_if_persist_false()

        # Check for faulty Disk Groups and Logical Volumes again after the configured polling interval
        self._scheduler.enter(self.pollfreq_DG_logical_volume_sensor,
                self._priority, self.run, ())

    def _get_disk_groups(self):
        """Receives list of Disk Groups from API.
           URL: http://<host>/api/show/disk-groups
        """
        url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKGROUPS)

        response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Disk Groups status unavailable as ws request {url} failed")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                                f"to get disk groups failed with err {response.status_code}")
            return

        response_data = json.loads(response.text)
        disk_groups = response_data.get("disk-groups")
        return disk_groups

    def _get_logical_volumes(self, pool_serial_number):
        """Receives list of Logical Volumes from API.
           URL: http://<host>/api/show/volumes/pool/<pool_serial_number>
        """
        url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWVOLUMES)

        url = f"{url}/pool/{pool_serial_number}"

        response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Logical Volume status unavailable as ws request {url}"
                " failed")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                            f"to get logical volumes failed with err {response.status_code}")

        response_data = json.loads(response.text)
        logical_volumes = response_data.get("volumes")
        return logical_volumes

    def _get_msgs_for_faulty_disk_groups(self, disk_groups, send_message=True):
        """Checks for health of disk groups and returns list of messages to be
           sent to handler if there are any.
        """
        faulty_disk_group_messages = []
        internal_json_msg = None
        disk_group_health = None
        serial_number = None
        alert_type = ""
        # Flag to indicate if there is a change in _previously_faulty_disk_groups
        state_changed = False

        if not disk_groups:
            return

        for disk_group in disk_groups:
            disk_group_health = disk_group["health"].lower()
            serial_number = disk_group["serial-number"]
            # Check for fault case
            if disk_group_health == self.rssencl.HEALTH_FAULT:
                # Status change from Degraded ==> Fault or OK ==> Fault
                if (serial_number in self._previously_faulty_disk_groups and \
                        self._previously_faulty_disk_groups[serial_number]['health']=="degraded") or \
                        (serial_number not in self._previously_faulty_disk_groups):
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_disk_groups[serial_number] = {
                        "health": disk_group_health, "alert_type": alert_type}
                    state_changed = True

            # Check for degraded case
            elif disk_group_health == self.rssencl.HEALTH_DEGRADED:
                # Status change from Fault ==> Degraded or OK ==> Degraded
                if (serial_number in self._previously_faulty_disk_groups and \
                        self._previously_faulty_disk_groups[serial_number]['health']=="fault") or \
                        (serial_number not in self._previously_faulty_disk_groups):
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_disk_groups[serial_number] = {
                        "health": disk_group_health, "alert_type": alert_type}
                    state_changed = True

            # Check for healthy case
            elif disk_group_health == self.rssencl.HEALTH_OK:
                # Status change from Fault ==> OK or Degraded ==> OK
                if serial_number in self._previously_faulty_disk_groups:
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    del self._previously_faulty_disk_groups[serial_number]
                    state_changed = True

            # Persist faulty Disk Group list to file only if something is changed
            if state_changed:
                # Generate the alert contents
                internal_json_msg = self._create_internal_msg_dg(alert_type, disk_group)
                faulty_disk_group_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)
                # Wait till msg is sent to message bus or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_disk_groups,\
                        self._faulty_disk_group_file_path)
                else:
                    self._previously_faulty_disk_groups = store.get(self._faulty_disk_group_file_path)
                state_changed = False
            alert_type = ""
        return faulty_disk_group_messages

    def _get_msgs_for_faulty_logical_volumes(self, logical_volumes, disk_group, send_message=True):
        """Checks for health of logical volumes and returns list of messages to be
           sent to handler if there are any.
        """
        faulty_logical_volume_messages = []
        internal_json_msg = None
        logical_volume_health = None
        serial_number = None
        alert_type = ""
        # Flag to indicate if there is a change in _previously_faulty_logical_volumes
        state_changed = False

        if not logical_volumes:
            return

        for logical_volume in logical_volumes:
            logical_volume_health = logical_volume["health"].lower()
            serial_number = logical_volume["serial-number"]

            # Check for fault case
            if logical_volume_health == self.rssencl.HEALTH_FAULT:
                # Status change from Degraded ==> Fault or OK ==> Fault
                if (serial_number in self._previously_faulty_logical_volumes and \
                        self._previously_faulty_logical_volumes[serial_number]['health']=="degraded") or \
                        (serial_number not in self._previously_faulty_logical_volumes):
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_logical_volumes[serial_number] = {
                        "health": logical_volume_health, "alert_type": alert_type}
                    state_changed = True

            # Check for degraded case
            elif logical_volume_health == self.rssencl.HEALTH_DEGRADED:
                # Status change from Fault ==> Degraded or OK ==> Degraded
                if (serial_number in self._previously_faulty_logical_volumes and \
                        self._previously_faulty_logical_volumes[serial_number]['health']=="fault") or \
                        (serial_number not in self._previously_faulty_logical_volumes):
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_logical_volumes[serial_number] = {
                        "health": logical_volume_health, "alert_type": alert_type}
                    state_changed = True

            # Check for healthy case
            elif logical_volume_health == self.rssencl.HEALTH_OK:
                # Status change from Fault ==> OK or Degraded ==> OK
                if serial_number in self._previously_faulty_logical_volumes:
                    # Send message to handler
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    del self._previously_faulty_logical_volumes[serial_number]
                    state_changed = True

            if state_changed:
                # Generate the alert contents
                internal_json_msg = self._create_internal_msg_lvol(
                    logical_volume, alert_type, disk_group)
                faulty_logical_volume_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)
                # Persist faulty Logical Volume list to file only if something is changed
                # Wait till msg is sent to message bus or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_logical_volumes,\
                        self._faulty_logical_volume_file_path)
                else:
                    self._previously_faulty_logical_volumes = store.get(self._faulty_logical_volume_file_path)
                state_changed = False
            alert_type = ""

        return faulty_logical_volume_messages

    def _create_internal_msg_lvol(self, logical_volume_detail, alert_type, disk_group):
        """Forms a dictionary containing info about Logical Volumes to send to
           message handler.
        """
        if not logical_volume_detail:
            return {}

        generic_info = dict.fromkeys(self.volumes_generic, "NA")
        extended_info = dict.fromkeys(self.volumes_extended, "NA")
        disk_groups_info = dict.fromkeys(self.disk_groups_generic, "NA")

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        resource_id = logical_volume_detail.get("volume-name", "")
        host_name = self.os_utils.get_fqdn()

        for key, value in logical_volume_detail.items():
            if key in self.volumes_generic:
                generic_info.update({key : value})
            elif key in self.volumes_extended:
                extended_info.update({key : value})

        for key, value in disk_group.items():
            if key in self.disk_groups_generic:
                disk_groups_info.update({key : value})
        generic_info['disk-group'] = [disk_groups_info]
        generic_info.update(extended_info)

        info = {
                "resource_type": self.RESOURCE_TYPE_LVOL,
                "resource_id": resource_id,
                "event_time": epoch_time
                }

        internal_json_msg = json.dumps(
            {"sensor_request_type": {
                "enclosure_alert": {
                    "host_id": host_name,
                    "severity": severity,
                    "alert_id": alert_id,
                    "alert_type": alert_type,
                    "status": "update",
                    "info": info,
                    "specific_info": generic_info
                }
            }})
        return internal_json_msg

    def _create_internal_msg_dg(self, alert_type, disk_group_detail):
        """Forms a dictionary containing info about Disk Groups to send to
           message handler.
        """
        if not disk_group_detail:
            return {}

        generic_info = dict.fromkeys(self.disk_groups_generic, "NA")
        extended_info = dict.fromkeys(self.disk_groups_extended, "NA")

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        resource_id = disk_group_detail.get("name", "")
        host_name = self.os_utils.get_fqdn()

        for key, value in disk_group_detail.items():
            if key in self.disk_groups_generic:
                generic_info.update({key : value})
            elif key in self.disk_groups_extended:
                extended_info.update({key : value})

        generic_info.update(extended_info)
        cvg_info = {
            "cvg_name": resource_id if resource_id in self.cvg_info_dict else "NA",
            "cvg_id": self.cvg_info_dict.get(resource_id, "NA")
        }
        generic_info.update(cvg_info)

        info = {
                "resource_type": self.RESOURCE_TYPE_DG,
                "resource_id": resource_id,
                "event_time": epoch_time
                }

        internal_json_msg = json.dumps(
            {"sensor_request_type": {
                "enclosure_alert": {
                    "host_id": host_name,
                    "severity": severity,
                    "alert_id": alert_id,
                    "alert_type": alert_type,
                    "status": "update",
                    "info": info,
                    "specific_info": generic_info
                }
            }})
        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _send_json_msg(self, json_msg):
        """Sends JSON message to Handler"""
        if not json_msg:
            return

        self._event.clear()
        self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RealStorLogicalVolumeSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RealStorLogicalVolumeSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RealStorLogicalVolumeSensor, self).shutdown()
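Seen from a distance, `_get_msgs_for_faulty_disk_groups` and `_get_msgs_for_faulty_logical_volumes` implement the same small state machine over the persisted `_previously_faulty_*` cache: a fault or degraded health raises a fault alert only on a transition, and a return to ok drops the cached entry and raises fault_resolved. A standalone sketch of that decision, with plain strings standing in for the `rssencl` constants (the function name is illustrative only):

def decide_alert(health, serial, previously_faulty):
    """Illustrative reduction of the disk-group / logical-volume health
    transition handling; returns (alert_type, state_changed)."""
    health = health.lower()
    prev = previously_faulty.get(serial)
    if health in ("fault", "degraded"):
        # Alert only on a transition: new entry, or a fault <-> degraded flip
        if prev is None or prev["health"] != health:
            previously_faulty[serial] = {"health": health, "alert_type": "fault"}
            return "fault", True
    elif health == "ok" and prev is not None:
        del previously_faulty[serial]
        return "fault_resolved", True
    return None, False

cache = {}
print(decide_alert("Degraded", "SN-1", cache))   # ('fault', True)
print(decide_alert("degraded", "SN-1", cache))   # (None, False) -- no change, no alert
print(decide_alert("OK", "SN-1", cache))         # ('fault_resolved', True)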
Exemplo n.º 19
class RealStorControllerSensor(SensorThread, InternalMsgQ):
    """Monitors Controller data using RealStor API"""


    # Dependency list
    DEPENDENCIES = {
                    "plugins": ["RealStorEnclMsgHandler"],
                    "rpms": []
    }

    SENSOR_NAME = "RealStorControllerSensor"
    SENSOR_RESP_TYPE = "enclosure_controller_alert"
    RESOURCE_CATEGORY = "hw"
    RESOURCE_TYPE = "enclosure:hw:controller"

    PRIORITY          = 1

    # Controllers directory name
    CONTROLLERS_DIR = "controllers"

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RealStorControllerSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Controllers in storage enclosure can not be monitored."

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
           to function.
        """
        return RealStorControllerSensor.DEPENDENCIES

    def __init__(self):
        super(RealStorControllerSensor, self).__init__(
            self.SENSOR_NAME, self.PRIORITY)

        self._faulty_controller_file_path = None

        self.rssencl = singleton_realstorencl

        # controllers persistent cache
        self._controller_prcache = None

        # Holds Controllers with faults. Used for future reference.
        self._previously_faulty_controllers = {}

        self.pollfreq_controllersensor = \
            int(Conf.get(SSPL_CONF,f"{self.rssencl.CONF_REALSTORCONTROLLERSENSOR}>{POLLING_FREQUENCY_OVERRIDE}",
                                0))

        if self.pollfreq_controllersensor == 0:
            self.pollfreq_controllersensor = self.rssencl.pollfreq

        # Flag to indicate suspension of module
        self._suspended = False

        self._event = Event()
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, products):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RealStorControllerSensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RealStorControllerSensor, self).initialize_msgQ(msgQlist)

        self._controller_prcache = os.path.join(self.rssencl.frus,\
             self.CONTROLLERS_DIR)

        # Persistence file location. This file stores faulty Controller data
        self._faulty_controller_file_path = os.path.join(
            self._controller_prcache, "controllerdata.json")

        # Load faulty Controller data from file if available
        self._previously_faulty_controllers = store.get(\
                                                  self._faulty_controller_file_path)

        if self._previously_faulty_controllers is None:
            self._previously_faulty_controllers = {}
            store.put(self._previously_faulty_controllers,\
                self._faulty_controller_file_path)

        return True

    def read_data(self):
        """This method is part of interface. Currently it is not
        in use.
        """
        return {}

    def run(self):
        """Run the sensor on its own thread"""

        # Do not proceed if module is suspended
        if self._suspended == True:
            self._scheduler.enter(10, self._priority, self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        controllers = None

        controllers = self._get_controllers()

        if controllers:
            self._get_msgs_for_faulty_controllers(controllers)

        # Reset debug mode if persistence is not enabled
        self._disable_debug_if_persist_false()

        # Check for faulty Controllers again after the configured polling interval
        self._scheduler.enter(self.pollfreq_controllersensor,
                self._priority, self.run, ())

    def _get_controllers(self):
        """Receives list of Controllers from API.
           URL: http://<host>/api/show/controllers
        """
        url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWCONTROLLERS)

        response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Controllers status unavailable as ws request {url}")
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                                f"to get controllers failed with err {response.status_code}")
            return

        response_data = json.loads(response.text)
        controllers = response_data.get("controllers")
        return controllers

    def _get_msgs_for_faulty_controllers(self, controllers, send_message=True):
        """Checks for health of controllers and returns list of messages to be
           sent to handler if there are any.
        """
        faulty_controller_messages = []
        internal_json_msg = None
        controller_health = None
        durable_id = None
        alert_type = ""
        # Flag to indicate if there is a change in _previously_faulty_controllers
        state_changed = False
        prev_alert_type = None

        if not controllers:
            return
        for controller in controllers:
            controller_health = controller["health"].lower()
            controller_status = controller["status"].lower()
            durable_id = controller["durable-id"]

            # Check for missing and fault case
            if controller_health == self.rssencl.HEALTH_FAULT:
                # Status change from Degraded ==> Fault or OK ==> Fault
                if (durable_id in self._previously_faulty_controllers and \
                        self._previously_faulty_controllers[durable_id]['health']=="degraded") or \
                        (durable_id not in self._previously_faulty_controllers):
                    alert_type = self.rssencl.FRU_FAULT
                    # Check for removal
                    if controller_status == self.rssencl.STATUS_NOTINSTALLED:
                        alert_type = self.rssencl.FRU_MISSING
                    self._previously_faulty_controllers[durable_id] = {
                        "health": controller_health, "alert_type": alert_type}
                    state_changed = True
                    internal_json_msg = self._create_internal_msg(
                        controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)
                    # Send message to handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
            # Check for fault case
            elif controller_health == self.rssencl.HEALTH_DEGRADED:
                # Status change from Fault ==> Degraded or OK ==> Degraded
                # A controller can also enter the degraded state right after installation,
                # so a degraded reading may follow a missing alert.
                if (durable_id in self._previously_faulty_controllers and \
                        self._previously_faulty_controllers[durable_id]['health']=="fault") or \
                        (durable_id not in self._previously_faulty_controllers):
                    if self._previously_faulty_controllers.get(durable_id, {}).get('alert_type'):
                        prev_alert_type = self._previously_faulty_controllers[durable_id]["alert_type"]

                    # If prev_alert_type is missing, then the next alert type will be insertion first
                    if prev_alert_type and prev_alert_type.lower() == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION

                        internal_json_msg = self._create_internal_msg(
                                    controller, alert_type)

                        # send the message to the handler
                        if send_message:
                            self._send_json_msg(internal_json_msg)

                    # And set alert_type as fault
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_controllers[durable_id] = {
                        "health": controller_health, "alert_type": alert_type}

                    internal_json_msg = self._create_internal_msg(controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)

                    state_changed = True

                    # send the message to the handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)

            # Check for healthy case
            elif controller_health == self.rssencl.HEALTH_OK:
                # Status change from Fault ==> OK or Degraded ==> OK
                if durable_id in self._previously_faulty_controllers:
                    previous_alert_type = \
                        self._previously_faulty_controllers[durable_id]["alert_type"]
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(
                        controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)
                    # Send message to handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
                    del self._previously_faulty_controllers[durable_id]
                    state_changed = True
            # Persist faulty Controller list to file only if something is changed
            if state_changed:
                # Wait till msg is sent to message bus or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_controllers,\
                        self._faulty_controller_file_path)
                else:
                    self._previously_faulty_controllers = store.get(self._faulty_controller_file_path)
                state_changed = False
            alert_type = ""
        return faulty_controller_messages

    def _create_internal_msg(self, controller_detail, alert_type):
        """Forms a dictionary containing info about Controllers to send to
           message handler.
        """
        if not controller_detail:
            return {}

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        fru = self.rssencl.is_storage_fru('controller')
        resource_id = controller_detail.get("durable-id", "")
        host_name = self.os_utils.get_fqdn()
        info = {
                "resource_type": self.RESOURCE_TYPE,
                "fru": fru,
                "resource_id": resource_id,
                "event_time": epoch_time
                }

        internal_json_msg = json.dumps(
            {"sensor_request_type": {
                "enclosure_alert": {
                    "host_id": host_name,
                    "severity": severity,
                    "alert_id": alert_id,
                    "alert_type": alert_type,
                    "status": "update",
                    "info": info,
                    "specific_info": controller_detail
                }
            }})

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _send_json_msg(self, json_msg):
        """Sends JSON message to Handler"""
        if not json_msg:
            return
        self._event.clear()
        self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RealStorControllerSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RealStorControllerSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RealStorControllerSensor, self).shutdown()
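The controller sensor layers FRU presence on top of the same pattern: a fault whose status reads "Not Installed" is reported as missing, and once the controller reappears (degraded or ok) an insertion alert is emitted before the usual fault or fault_resolved. A compact sketch of that mapping, ignoring the transition bookkeeping shown above (function and constant names are illustrative only, not the SSPL API):

def controller_alerts(health, status, prev_alert):
    """Illustrative mapping of (health, status, previous alert) to the
    sequence of alert types the controller sensor would emit."""
    alerts = []
    health, status = health.lower(), status.lower()
    if health == "fault":
        alerts.append("missing" if status == "not installed" else "fault")
    elif health == "degraded":
        if prev_alert == "missing":
            alerts.append("insertion")
        alerts.append("fault")
    elif health == "ok" and prev_alert is not None:
        alerts.append("insertion" if prev_alert == "missing" else "fault_resolved")
    return alerts

print(controller_alerts("Fault", "Not Installed", None))   # ['missing']
print(controller_alerts("Degraded", "OK", "missing"))      # ['insertion', 'fault']
print(controller_alerts("OK", "OK", "fault"))              # ['fault_resolved']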
class RAIDIntegritySensor(SensorThread, InternalMsgQ):

    SENSOR_NAME = "RAIDIntegritySensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:raid_integrity"

    # Section and keys in configuration file
    RAIDIntegritySensor = SENSOR_NAME.upper()

    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"

    SCAN_FREQUENCY = "polling_interval"
    RETRY_INTERVAL = "retry_interval"
    TIMESTAMP_FILE_PATH_KEY = "timestamp_file_path"

    # Scan for RAID integrity error every 2 weeks (1209600 seconds)
    DEFAULT_SCAN_FREQUENCY = "1209600"
    # Minimum allowed frequency for RAID integrity scans is 1 day
    # (86400 seconds), as frequent scans affect disk i/o performance
    MIN_SCAN_FREQUENCY = 86400
    DEFAULT_RAID_DATA_PATH = RaidDataConfig.RAID_RESULT_DIR.value
    DEFAULT_TIMESTAMP_FILE_PATH = DEFAULT_RAID_DATA_PATH + "last_execution_time"

    alert_type = None

    # alerts
    FAULT_RESOLVED = "fault_resolved"
    FAULT = "fault"
    MISSING = "missing"
    SUCCESS = "success"
    FAILED = "failed"

    @staticmethod
    def name():
        """@return: name of the monitoring module."""
        return RAIDIntegritySensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Server RAID integrity can not be monitored."

    def __init__(self):
        super(RAIDIntegritySensor, self).__init__(self.SENSOR_NAME,
                                                  self.PRIORITY)
        self._cache_state = None
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(RAIDIntegritySensor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(RAIDIntegritySensor, self).initialize_msgQ(msgQlist)

        self._alert_msg = None
        self._fault_state = None
        self._suspended = False

        self._timestamp_file_path = Conf.get(
            SSPL_CONF,
            f"{self.RAIDIntegritySensor}>{self.TIMESTAMP_FILE_PATH_KEY}",
            self.DEFAULT_TIMESTAMP_FILE_PATH)
        self._scan_frequency = int(Conf.get(
            SSPL_CONF, f"{self.RAIDIntegritySensor}>{self.SCAN_FREQUENCY}",
            self.DEFAULT_SCAN_FREQUENCY))
        self._next_scheduled_time = self._scan_frequency

        if self._scan_frequency < self.MIN_SCAN_FREQUENCY:
            self._scan_frequency = self.MIN_SCAN_FREQUENCY

        sysfs_path = Conf.get(SSPL_CONF, f'{SYSTEM_INFORMATION}>{SYSFS_PATH}')
        self.raid_dir = sysfs_path + BLOCK_DIR

        self.retry_interval = int(
            Conf.get(SSPL_CONF,
                     f'{self.RAIDIntegritySensor}>{self.RETRY_INTERVAL}'))

        # Create DEFAULT_RAID_DATA_PATH if already not exist.
        self._create_file(self.DEFAULT_RAID_DATA_PATH)
        return True

    def read_data(self):
        return self._cache_state

    def run(self):
        """Run the sensor on its own thread"""
        # Do not proceed if module is suspended
        if self._suspended == True:
            if os.path.exists(self._timestamp_file_path):
                with open(self._timestamp_file_path, "r") as timestamp_file:
                    last_processed_log_timestamp = timestamp_file.read().strip(
                    )
                current_time = int(time.time())
                if current_time > int(last_processed_log_timestamp):
                    self._next_scheduled_time = self._scan_frequency - \
                        (current_time - int(last_processed_log_timestamp))
            logger.info("Scheduling RAID validate again after: %s seconds" %
                        self._next_scheduled_time)
            self._scheduler.enter(self._next_scheduled_time, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        #cleanup
        self._cleanup()

        # Log RAIDIntegritySensor execution timestamp
        self._create_file(self._timestamp_file_path)
        self._log_timestamp()

        # Validate the raid data files and notify the node data msg handler
        self._raid_health_monitor()

        with open(self._timestamp_file_path, "r") as timestamp_file:
            last_processed_log_timestamp = timestamp_file.read().strip()
            current_time = int(time.time())
            if current_time > int(last_processed_log_timestamp):
                self._next_scheduled_time = self._scan_frequency - \
                    (current_time - int(last_processed_log_timestamp))
        logger.info("Scheduling RAID validate again after: %s seconds" %
                    self._next_scheduled_time)
        self._scheduler.enter(self._next_scheduled_time, self._priority,
                              self.run, ())

    def _raid_health_monitor(self):
        try:
            devices = self._get_devices()
            if len(devices) == 0:
                return
            logger.debug("Fetched devices:{}".format(devices))

            for device in devices:
                # Update the state as 'check' for RAID device file
                result = self._update_raid_device_file(device)
                if result == "failed":
                    self._retry_execution(self._update_raid_device_file,
                                          device)
                logger.info("RAID device state is changed to 'check'")

                # Check RAID device array state is 'idle' or not
                result = self._check_raid_state(device)
                if result == "failed":
                    logger.warn(
                        "'Idle' state not found for RAID device:{}".format(
                            device))
                    # Retry to check RAID state
                    self._retry_execution(self._check_raid_state, device)
                logger.info(
                    "'idle' state is found in Raid device:{}.".format(device))

                # Check Mismatch count in RAID device files.
                result = self._check_mismatch_count(device)
                if result == "failed":
                    # Persist RAID device fault state and send alert
                    fault_status_file = self.DEFAULT_RAID_DATA_PATH + device + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value
                    if os.path.exists(fault_status_file):
                        with open(fault_status_file, 'r') as fs:
                            data = fs.read().rstrip()
                        if self.FAULT_RESOLVED in data:
                            self.alert_type = self.FAULT
                            self._alert_msg = "RAID disks present in %s RAID array"\
                                ", needs synchronization. If fault persists for "\
                                "more than 2 days, Please contact Seagate support."%device
                            self._send_json_msg(self.alert_type, device,
                                                self._alert_msg)
                            self._update_fault_state_file(
                                device, self.FAULT, fault_status_file)
                            self._scan_frequency = self.MIN_SCAN_FREQUENCY
                    else:
                        self.alert_type = self.FAULT
                        self._alert_msg = "RAID disks present in %s RAID array"\
                                ", needs synchronization. If fault persists for "\
                                "more than 2 days, Please contact Seagate support."%device
                        self._send_json_msg(self.alert_type, device,
                                            self._alert_msg)
                        self._update_fault_state_file(device, self.FAULT,
                                                      fault_status_file)
                        self._scan_frequency = self.MIN_SCAN_FREQUENCY

                    # Retry to check mismatch_cnt
                    self._retry_execution(self._check_mismatch_count, device)
                logger.debug(
                    "No mismatch count is found in Raid device:{}".format(
                        device))

        except Exception as ae:
            raise Exception(f"Failed in monitoring RAID health, {ae}")

    def _get_devices(self):
        try:
            mdstat_file = RaidDataConfig.MDSTAT_FILE.value
            with open(mdstat_file, 'r') as fp:
                content = fp.readlines()
            device_array = []
            for line in content:
                if "active" in line:
                    device = line.split(":")[0].rstrip()
                    device_array.append(device)
            if len(device_array) == 0:
                logger.error("No RAID device found in mdstat file.")
            return device_array
        except Exception as ae:
            raise Exception(f"Failed to get the device array, {ae}")

    def _check_mismatch_count(self, device):
        try:
            status = None
            mismatch_cnt_file = RaidDataConfig.MISMATCH_COUNT_FILE.value
            MISMATCH_COUNT_COMMAND = 'cat ' + self.raid_dir + device +\
                                     mismatch_cnt_file
            logger.debug('Executing MISMATCH_CNT_COMMAND:{}'.format(
                MISMATCH_COUNT_COMMAND))
            response, error = self._run_command(MISMATCH_COUNT_COMMAND)
            if error:
                logger.error("Error in cmd{} in raid health monitor".format(
                    MISMATCH_COUNT_COMMAND))
            if response == RaidDataConfig.MISMATCH_COUNT_RESPONSE.value:
                logger.debug("No mismatch count is found")
                status = "success"
                with open(self.output_file, 'a') as raid_file:
                    raid_file.write(
                        RaidDataConfig.MISMATCH_COUNT_RESPONSE.value)
                fault_status_file = self.DEFAULT_RAID_DATA_PATH + device + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value
                if os.path.exists(fault_status_file):
                    with open(fault_status_file, 'r') as fs:
                        data = fs.read().rstrip()
                    if self.FAULT in data:
                        faulty_device = data.split(":")[0].rstrip()
                        if device == faulty_device:
                            self.alert_type = self.FAULT_RESOLVED
                            self._alert_msg = "RAID disks present in %s RAID array are synchronized." % device
                            self._send_json_msg(self.alert_type, device,
                                                self._alert_msg)
                            self._update_fault_state_file(
                                device, self.FAULT_RESOLVED, fault_status_file)
                            self._scan_frequency = int(Conf.get(
                                SSPL_CONF,
                                f"{self.RAIDIntegritySensor}>{self.SCAN_FREQUENCY}",
                                self.DEFAULT_SCAN_FREQUENCY))
                            self._scan_frequency = max(self._scan_frequency,
                                                       self.MIN_SCAN_FREQUENCY)
            else:
                status = "failed"
                logger.debug(
                    "Mismatch found in {} file in raid_integrity_data!".format(
                        mismatch_cnt_file))
            return status
        except Exception as ae:
            logger.error(
                "Failed in checking mismatch_cnt in RAID file. ERROR:{}".
                format(str(ae)))
            raise

    def _check_raid_state(self, device):
        try:
            status = None
            raid_check = 0
            sync_action_file = RaidDataConfig.SYNC_ACTION_FILE.value
            while raid_check <= RaidDataConfig.MAX_RETRIES.value:
                self.output_file = self._get_unique_filename(
                    RaidDataConfig.RAID_RESULT_FILE_PATH.value, device)
                STATE_COMMAND = 'cat ' + self.raid_dir + device +\
                                sync_action_file
                logger.debug(
                    'Executing STATE_COMMAND:{}'.format(STATE_COMMAND))
                response, error = self._run_command(STATE_COMMAND)
                if error:
                    logger.warn("Error in cmd{} in raid health monitor".format(
                        STATE_COMMAND))
                    raid_check += 1
                else:
                    if response == RaidDataConfig.STATE_COMMAND_RESPONSE.value:
                        status = "success"
                        with open(self.output_file, 'w') as raid_file:
                            raid_file.write(
                                RaidDataConfig.STATE_COMMAND_RESPONSE.value +
                                "\n")
                        break
                    else:
                        status = "failed"
                        raid_check += 1
                        time.sleep(WAIT_BEFORE_RETRY)
            return status
        except Exception as ae:
            logger.error(
                "Failed in checking RAID device state. ERROR:{}".format(
                    str(ae)))
            raise

    def _update_raid_device_file(self, device):
        try:
            status = "failed"
            raid_check = 0
            sync_action_file = RaidDataConfig.SYNC_ACTION_FILE.value
            while raid_check <= RaidDataConfig.MAX_RETRIES.value:
                CHECK_COMMAND = "echo 'check' |sudo tee " + self.raid_dir +\
                                device + sync_action_file + " > /dev/null"
                logger.debug(
                    'Executing CHECK_COMMAND:{}'.format(CHECK_COMMAND))
                response, error = self._run_command(CHECK_COMMAND)
                if error:
                    logger.warn(
                        "Failed in executing command:{}.".format(error))
                    raid_check += 1
                    time.sleep(1)
                else:
                    logger.debug(
                        "RAID device state is changed to 'check' with response : {}"
                        .format(response))
                    status = "success"
                    break
            return status
        except Exception as ae:
            logger.error("Failed to update RAID File. ERROR:{}".format(
                str(ae)))
            raise

    def _retry_execution(self, function_call, device):
        while True:
            logger.debug("Executing function:{} after {} time interval".format(
                function_call, self.retry_interval))
            time.sleep(self.retry_interval)
            result = function_call(device)
            if result == self.SUCCESS:
                return

    def _get_unique_filename(self, filename, device):
        unique_timestamp = datetime.now().strftime("%d-%m-%Y_%I-%M-%S-%p")
        unique_filename = f"{filename}_{device}_{unique_timestamp}.txt"
        return unique_filename

    def _send_json_msg(self, alert_type, resource_id, error_msg):
        """Transmit data to NodeDataMsgHandler to be processed and sent out"""

        epoch_time = str(int(time.time()))
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        self._alert_id = self._get_alert_id(epoch_time)
        host_name = self.os_utils.get_fqdn()

        info = {
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": resource_id,
            "event_time": epoch_time,
            "description": error_msg
        }
        specific_info = {"error": error_msg}

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "sensor_type": "node:os:raid_integrity",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "alert_id": self._alert_id,
                    "severity": severity,
                    "info": info,
                    "specific_info": specific_info
                }
            }
        })
        self.alert_type = None

        # Send the event to node data message handler to generate json message and send out
        self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg)

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
        epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RAIDIntegritySensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RAIDIntegritySensor, self).resume()
        self._suspended = False

    def _run_command(self, command):
        """Run the command and get the response and error returned"""
        logger.debug(f"_run_command: {command}")
        process = subprocess.Popen(command,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        response, error = process.communicate()
        if response:
            logger.debug(f"_run_command, response: {str(response)}")
        if error:
            logger.debug(f"_run_command: error: {str(error)}")

        return response.decode().rstrip('\n'), error.decode().rstrip('\n')

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RAIDIntegritySensor, self).shutdown()

    def _create_file(self, path):
        dir_path = path[:path.rindex("/")]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            logger.debug("{} in creation of dir path : {}".format(
                self.SUCCESS, dir_path))
        if not os.path.exists(path):
            file = open(path, "w+")
            file.close()

    def _log_timestamp(self):
        current_time = str(int(time.time()))
        with open(self._timestamp_file_path, "w") as timestamp_file:
            timestamp_file.write(current_time)

    def _update_fault_state_file(self, device, fstate, fault_state_file):
        self._fault_state = fstate
        data = device + ":" + self._fault_state
        self._create_file(fault_state_file)
        with open(fault_state_file, 'w') as fs:
            fs.write(data)

    def _cleanup(self):
        """Clean up the validate raid result files"""
        if os.path.exists(self._timestamp_file_path):
            os.remove(self._timestamp_file_path)
        path = RaidDataConfig.RAID_RESULT_DIR.value
        if os.path.exists(path):
            current_time = time.time()
            result_files = [
                file for file in os.listdir(path) if file.endswith(".txt")
            ]
            for file in result_files:
                if os.path.getmtime(os.path.join(
                        path, file)) < (current_time - 24 * 60 * 60):
                    if os.path.isfile(os.path.join(path, file)):
                        os.remove(os.path.join(path, file))
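At its core, the integrity check above drives mdraid through sysfs: it writes 'check' into md/sync_action, waits for the array to return to 'idle', then reads md/mismatch_cnt and raises a fault if the count is non-zero. A condensed sketch of that flow using plain file I/O; the paths follow the usual /sys/block/<device>/md layout, but treat the helper, its defaults and timings as assumptions rather than a supported interface:

import time
from pathlib import Path

def check_raid_consistency(device, sysfs_root="/sys/block", poll=5, timeout=600):
    """Sketch of the mdraid consistency check used above: trigger a
    'check' pass, wait for 'idle', then report the mismatch count."""
    md_dir = Path(sysfs_root) / device / "md"
    (md_dir / "sync_action").write_text("check\n")   # requires root privileges
    deadline = time.time() + timeout
    while (md_dir / "sync_action").read_text().strip() != "idle":
        if time.time() > deadline:
            raise TimeoutError(f"{device}: check did not finish in time")
        time.sleep(poll)
    mismatches = int((md_dir / "mismatch_cnt").read_text().strip())
    return mismatches  # 0 means the array is consistent

# e.g. check_raid_consistency("md0")  -- run as root on a node with /dev/md0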
Exemplo n.º 22
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
       is responsible for sensing changes in SAS ports/cables using
       available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"

    DEFAULT_POLLING_INTERVAL = '30'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {"plugins": ["NodeDataMsgHandler"], "rpms": []}

    # Number of SAS Ports
    NUM_SAS_PORTS = 4
    # Number of Phys in a Port
    NUM_PHYS_PER_PORT = 4
    # Current Data Version
    CURRENT_DATA_VERSION = 1

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Server SAS ports can not be monitored."

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0
        self.sas_ports_status = {}
        self.port_phy_list_dict = {}
        self.sas_phy_stored_alert = None
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self.polling_interval = int(
            Conf.get(SSPL_CONF,
                     f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
                     self.DEFAULT_POLLING_INTERVAL))

        self.HOST_ID = SAS().get_host_list()[0].replace('host', '')

        self.RESOURCE_ID = SAS_RESOURCE_ID + self.HOST_ID  # eg. SASHBA-0 if host_id=0

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Call into the sas phy directory, which returns a dictionary
            # mapping phy_name to its negotiated link rate
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over the populated dictionary and restructure it:
            # a phy negotiating 12.0/6.0/3.0 Gbit is considered up,
            # anything else is a fault.
            # Result: {"phy-0:0": ("link_rate", <up/fault>)}
            # (see the standalone sketch after this method)
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'Gbit'.lower() in value.strip().lower():
                    phy_status = 'up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[
                    phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert()

        except KeyError as key_error:
            raise Exception(
                f"Unable to get the instance of {sas_port_utility} "
                f"utility, {key_error}")
        except Exception as e:
            if getattr(e, 'errno', None) == errno.ENOENT:
                raise Exception(
                    "Problem occurred while reading from sas_phy directory. "
                    "The directory path doesn't exist.")
            elif getattr(e, 'errno', None) == errno.EACCES:
                raise Exception(
                    "Problem occurred while reading from sas_phy directory. "
                    "Not enough permission to read from the directory.")
            else:
                raise Exception(
                    f"Problem occurred while reading from sas_phy directory. {e}"
                )

        return True
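
# Hedged sketch (standalone): the restructuring done in initialize() above.
# The raw {phy: negotiated_link_rate} mapping becomes {phy: (link_rate, status)},
# where any rate containing "Gbit" counts as up. Sample values are made up.
raw_mapping = {
    "phy-0:0": "12.0 Gbit",
    "phy-0:1": "<unknown>",
}

phy_link_count = 0
restructured = {}
for phy, rate in raw_mapping.items():
    if "gbit" in rate.strip().lower():
        status = "up"
        phy_link_count += 1   # count phys that are up
    else:
        status = "fault"
    restructured[phy] = (rate, status)

# restructured -> {"phy-0:0": ("12.0 Gbit", "up"), "phy-0:1": ("<unknown>", "fault")}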

    def update_sas_ports_status(self):
        """
        Reads current phy status and updates port connectivity status
        Assumption : phys will be present in multiples of 4
        """
        phy_list = [*self.phy_dir_to_linkrate_mapping]
        phy_list = sort_phy_list(phy_list)

        # Now we have a sorted list of phys
        # Phys 0-3 for the 0th sas port, and so on in groups of 4 phys
        # List containing status of all phys
        hba = []
        for phy in phy_list:
            if self.phy_dir_to_linkrate_mapping[phy][1] == 'up':
                hba.append(1)
            else:
                hba.append(0)

        for i in range(0, self.NUM_SAS_PORTS):
            # Save phy names forming this port for future use
            self.port_phy_list_dict[i] = phy_list[
                self.NUM_PHYS_PER_PORT * i:
                self.NUM_PHYS_PER_PORT * i + self.NUM_PHYS_PER_PORT]
            # Check port status
            s = set(hba[self.NUM_PHYS_PER_PORT * i:self.NUM_PHYS_PER_PORT * i +
                        self.NUM_PHYS_PER_PORT])
            if len(s) == 1 and 0 in s:
                port_status = 'down'
            elif len(s) == 1 and 1 in s:
                port_status = 'up'
            else:
                port_status = 'degraded'
            # Store the data
            self.sas_ports_status[i] = port_status
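
# Hedged sketch (standalone): deriving one port's status from its group of
# 4 phys, as done in update_sas_ports_status() above. Input data is made up.
def port_status_from_phys(phy_up_flags):
    """phy_up_flags: list of 0/1 values for the 4 phys of one port."""
    states = set(phy_up_flags)
    if states == {1}:
        return "up"        # all 4 phys up
    if states == {0}:
        return "down"      # all 4 phys down
    return "degraded"      # mixed

# Example: 4 ports x 4 phys (illustrative values).
hba = [1, 1, 1, 1,   0, 0, 0, 0,   1, 0, 1, 1,   1, 1, 1, 1]
ports = {i: port_status_from_phys(hba[4 * i: 4 * i + 4]) for i in range(4)}
# ports -> {0: 'up', 1: 'down', 2: 'degraded', 3: 'up'}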

    def check_and_send_conn_alert(self):
        """
        Sends conn fault alert if all phys go down
        Sends conn fault_resolved alert once all sas ports (4 phys each) come back up
        """
        # Case 1 : all fault for fault alert
        cur_all_fault = True

        # Case 2 : all fault_resolved for fault_resolved alert
        cur_all_fault_resolved = True

        # Previous conn alert that was sent
        prev_conn_alert = self.sas_phy_stored_alert['conn']

        # Current
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # This is key for conn alert, skip
                continue

            # Case 1 : All faults in current status
            if value != 'fault':
                cur_all_fault = False

            # Case 2 : All fault_resolved in current status
            elif value != 'fault_resolved':
                cur_all_fault_resolved = False

        if prev_conn_alert == 'fault_resolved' and cur_all_fault:
            # Send conn fault alert
            alert_type = 'fault'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type

        elif prev_conn_alert == 'fault' and cur_all_fault_resolved:
            # Send conn fault_resolved alert
            alert_type = 'fault_resolved'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type
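
# Hedged sketch (standalone): the connection-level decision made above. A conn
# 'fault' is raised only when every port is in 'fault' and the last conn alert
# was 'fault_resolved', and vice versa. The state dict shown is illustrative.
def conn_alert_needed(stored):
    """Return 'fault', 'fault_resolved', or None based on per-port states."""
    port_states = [v for k, v in stored.items() if k not in ("version", "conn")]
    all_fault = all(v == "fault" for v in port_states)
    all_resolved = all(v == "fault_resolved" for v in port_states)
    if stored["conn"] == "fault_resolved" and all_fault:
        return "fault"
    if stored["conn"] == "fault" and all_resolved:
        return "fault_resolved"
    return None

# Example:
# conn_alert_needed({"version": 1, "conn": "fault_resolved",
#                    0: "fault", 1: "fault", 2: "fault", 3: "fault"})  # -> 'fault'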

    def handle_current_version_data(self):
        """Contains logic to check and send alert if data has version == 1."""
        # Compare current status of each port with previous alert_type
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # Skip
                continue
            if value == 'fault_resolved' and \
                        self.sas_ports_status[port] == 'down':
                alert_type = 'fault'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
            elif value == 'fault' and \
                        self.sas_ports_status[port] == 'up':
                alert_type = 'fault_resolved'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
        # See if conn failure/conn resolved alert needs to be sent
        self.check_and_send_conn_alert()
        # Save data to store
        store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    def check_and_send_alert(self):
        """Checks whether conditions are met and sends alert if required
        Alerts will be sent if -
        1. All 4 phys of a sas port go up -> down : fault alert
        2. All 4 phys of a sas port come down -> up : fault_resolved alert
        Sensor data stored in persistent storage is a dict of { sas_port_number : alert_type }
        """
        # Update sas ports status
        self.update_sas_ports_status()

        # Check the version of stored alert
        version = None
        try:
            # Try to get the version
            # Exception will be raised if stored alert is None or no Version is available
            version = self.sas_phy_stored_alert['version']
        except Exception:
            logger.warn(f"Found no data or old data format for SASPortSensor, \
                            updating data format to version {self.CURRENT_DATA_VERSION}"
                        )
            # Versioning is not implemented or there is no data, write new data
            # Initialize dummy fault_resolved for all sas ports and conn
            self.sas_phy_stored_alert = {}
            self.sas_phy_stored_alert['version'] = self.CURRENT_DATA_VERSION
            self.sas_phy_stored_alert['conn'] = 'fault_resolved'
            for i in range(0, self.NUM_SAS_PORTS):
                self.sas_phy_stored_alert[i] = 'fault_resolved'
            # Save data to store
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

        if version == self.CURRENT_DATA_VERSION:
            self.handle_current_version_data()
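
# Hedged sketch (standalone): the version-1 persisted data layout initialized
# above when no data (or an old format) is found: a flat dict keyed by
# 'version', 'conn' and the port indices.
NUM_SAS_PORTS = 4
CURRENT_DATA_VERSION = 1

stored_alert = {"version": CURRENT_DATA_VERSION, "conn": "fault_resolved"}
for port in range(NUM_SAS_PORTS):
    stored_alert[port] = "fault_resolved"

# stored_alert -> {'version': 1, 'conn': 'fault_resolved',
#                  0: 'fault_resolved', 1: 'fault_resolved',
#                  2: 'fault_resolved', 3: 'fault_resolved'}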

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare local dict wrt global dictionary for change in the
                    # negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If current link rate has no value like 12/6/3 Gbit
                        # and previously it was up, then it's a fault condition
                        if 'gbit' not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment count for new phy down which were up previously
                            new_phy_down += 1

                            # Make respective phy_status as fault
                            status = 'fault'

                        # Check if 12/6/3 Gbit is present in the current link
                        # rate and the previous alert_type is fault. If so,
                        # the phy is up again.
                        elif 'gbit' in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':

                            # Mark respective phy_status as Up
                            status = 'up'

                            # Increment count for new phy up
                            new_phy_up += 1

                        # Finally update the global dict with the current link
                        # rate and respective phy status
                        self.phy_dir_to_linkrate_mapping[key] = (link_rate,
                                                                 status)

                # Get current phy status i.e number of Up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                self.sas_phy_stored_alert = store.get(
                    self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert()
                # Update current active phy count for next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            raise Exception(ae)

        # Fire every 30 seconds to see if there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority, self.run,
                              ())
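
# Hedged sketch (standalone): the per-phy transition check performed in run()
# above. A phy that previously reported a 'Gbit' rate but no longer does is a
# new fault; the reverse is a recovery. Sample rates are made up.
def detect_transition(prev_rate, prev_status, new_rate):
    """Return (new_status, delta_up, delta_down) for one phy."""
    has_link = "gbit" in new_rate.strip().lower()
    if new_rate.strip().lower() != prev_rate.strip().lower():
        if not has_link and prev_status == "up":
            return "fault", 0, 1        # phy went down
        if has_link and prev_status == "fault":
            return "up", 1, 0           # phy came back up
    return prev_status, 0, 0            # no relevant change

# Example:
# detect_transition("12.0 Gbit", "up", "<unknown>")    # -> ('fault', 0, 1)
# detect_transition("<unknown>", "fault", "6.0 Gbit")  # -> ('up', 1, 0)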

    def _create_json_message(self, alert_type, port):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = self.os_utils.get_fqdn()

        specific_info = {}
        specific_info_list = []
        description = "N/A"

        # specific_info will contain all 16 phys for conn level alert
        # Only 4 phys for port level alert
        for key, val in self.phy_dir_to_linkrate_mapping.items():
            if port != -1:
                # This is a port level alert, skip phys that are not relevant
                if key not in self.port_phy_list_dict[port]:
                    # Skip adding this phy
                    continue
            # Key will be phy-1:0.
            # Here phy-1:0 represents the 0th phy of SASHBA-1.
            specific_info["resource_id"] = key
            specific_info[
                "negotiated_link_rate"] = self.phy_dir_to_linkrate_mapping[
                    key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        if port == -1:
            # This is a SAS HBA level connection alert
            if alert_type == 'fault':
                description = "SAS connection error detected in SAS HBA %s." % self.RESOURCE_ID
            elif alert_type == 'fault_resolved':
                description = "SAS connection re-established in SAS HBA %s." % self.RESOURCE_ID

            info = {
                "resource_type": self.RESOURCE_TYPE,  # node:interface:sas
                "resource_id": self.RESOURCE_ID,  # eg. SASHBA-1
                "event_time": epoch_time,
                "description": description
            }
        else:
            # This is a port level alert
            if alert_type == 'fault':
                description = (
                    "No connectivity detected on the SAS port %s, possible "
                    "causes could be missing SAS cable, bad cable connection, "
                    "faulty cable or SAS port failure." % port)
            elif alert_type == 'fault_resolved':
                description = "Connection established on SAS port."

            info = {
                "resource_type":
                self.RESOURCE_TYPE + ':port',  # node:interface:sas:port
                "resource_id": f'sas_port-{self.HOST_ID}:{port}',
                # eg. sas_port-1:0 represents 0th port of SASHBA-1
                "event_time": epoch_time,
                "description": description
            }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg
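
# Hedged sketch (standalone): the general shape of the message built above.
# All values below are placeholders, not real alert data.
example_message = {
    "sensor_request_type": {
        "node_data": {
            "status": "update",
            "host_id": "example-host.local",
            "alert_type": "fault",
            "severity": "<mapped severity>",
            "alert_id": "<epoch_time><uuid4_hex>",
            "info": {
                "resource_type": "node:interface:sas:port",
                "resource_id": "sas_port-1:0",
                "event_time": "<epoch_time>",
                "description": "No connectivity detected on the SAS port 0, ..."
            },
            "specific_info": [
                {"resource_id": "phy-1:0", "negotiated_link_rate": "12.0 Gbit"}
            ]
        }
    }
}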

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type, port):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type, port)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()