def initialize(self, conf_reader, msgQlist, products):
    """Initialize the base thread, internal queues and persisted alert state.

    Loads the enclosure alert record from the persistent store. When a record
    exists, ``fault_alert`` and ``previous_alert_type`` are restored from it;
    otherwise a record built from the current attribute values is written.

    Returns:
        True, always.
    """
    parent = super(RealStorEnclosureSensor, self)
    # Base-class setup: configuration reader first, then internal msg queues
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    self.ENCL_SENSOR_DATA_PATH = os.path.join(
        self.rssencl.encl_cache, 'enclosure_data.json')

    # Get the stored previous alert info
    self.persistent_encl_data = store.get(self.ENCL_SENSOR_DATA_PATH)

    if not self.persistent_encl_data:
        # Nothing persisted yet: seed the store from the current attributes
        self.persistent_encl_data = {
            'fault_alert': str(self.fault_alert),
            'previous_alert_type': str(self.previous_alert_type),
        }
        store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)
        return True

    # The flag is persisted as text; only "true" (any case) means set
    self.fault_alert = (
        self.persistent_encl_data['fault_alert'].lower() == "true")
    self.previous_alert_type = \
        self.persistent_encl_data['previous_alert_type']
    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize the base thread, internal queues and faulty-controller cache.

    The faulty-controller record is read from ``controllerdata.json`` under
    the controllers cache directory; if the store has no such record an
    empty dict is created and written back.

    Returns:
        True, always.
    """
    parent = super(RealStorControllerSensor, self)
    # Base-class setup: configuration reader first, then internal msg queues
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    self._controller_prcache = os.path.join(
        self.rssencl.frus, self.CONTROLLERS_DIR)

    # Persistence file location. This file stores faulty Controller data
    self._faulty_controller_file_path = os.path.join(
        self._controller_prcache, "controllerdata.json")

    # Load faulty Controller data from file if available
    self._previously_faulty_controllers = store.get(
        self._faulty_controller_file_path)

    if self._previously_faulty_controllers is None:
        self._previously_faulty_controllers = {}
        store.put(
            self._previously_faulty_controllers,
            self._faulty_controller_file_path)

    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize the base thread, internal queues and faulty-PSU cache.

    The faulty-PSU record is read from ``psudata.json`` under the PSU cache
    directory; if the store has no such record an empty dict is created and
    written back.

    Returns:
        True, always.
    """
    parent = super(RealStorPSUSensor, self)
    # Base-class setup: configuration reader first, then internal msg queues
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    self.psu_prcache = os.path.join(self.rssencl.frus, self.PSUS_DIR)

    # Persistence file location. This file stores faulty PSU data
    self._faulty_psu_file_path = os.path.join(
        self.psu_prcache, "psudata.json")
    self._log_debug(
        f"_faulty_psu_file_path: {self._faulty_psu_file_path}")

    # Load faulty PSU data from file if available
    self._previously_faulty_psus = store.get(self._faulty_psu_file_path)

    if self._previously_faulty_psus is None:
        self._previously_faulty_psus = {}
        store.put(self._previously_faulty_psus, self._faulty_psu_file_path)

    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize the base thread, internal queues and faulty disk-group cache.

    The record is read from ``logicalvolumedata.json`` under the logical
    volumes cache directory; if the store has no such record an empty dict
    is created and written back.

    Returns:
        True, always.
    """
    parent = super(RealStorLogicalVolumeSensor, self)
    # Base-class setup: configuration reader first, then internal msg queues
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    self._logical_volume_prcache = os.path.join(
        self.rssencl.frus, self.LOGICAL_VOLUMES_DIR)

    # Persistence file location. This file stores faulty Logical Volume data
    self._faulty_disk_group_file_path = os.path.join(
        self._logical_volume_prcache, "logicalvolumedata.json")

    # Load faulty Logical Volume data from file if available
    self._previously_faulty_disk_groups = store.get(
        self._faulty_disk_group_file_path)

    if self._previously_faulty_disk_groups is None:
        self._previously_faulty_disk_groups = {}
        store.put(
            self._previously_faulty_disk_groups,
            self._faulty_disk_group_file_path)

    return True
def update_memcache_faults(self):
    """Replace the in-memory fault cache with the latest faults and persist it."""
    self.memcache_faults = self.latest_faults

    # Update faults in persistent cache
    logger.info("Updating faults persistent cache!!")
    store.put(
        self.memcache_faults,
        self.faults_persistent_cache,
    )
def initialize(self, conf_reader, msgQlist, products):
    """Initialize the base thread, internal queues and faulty-expander cache.

    The record is read from ``sideplane_expanders_data.json`` under the
    sideplane expanders cache directory; if the store has no such record an
    empty dict is created and written back.

    Returns:
        True, always.
    """
    parent = super(RealStorSideplaneExpanderSensor, self)
    # Base-class setup: configuration reader first, then internal msg queues
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    self._sideplane_exp_prcache = os.path.join(
        self.rssencl.frus, self.SIDEPLANE_EXPANDERS_DIR)

    # Persistence file location.
    # This file stores faulty sideplane expander data
    self._faulty_sideplane_expander_file_path = os.path.join(
        self._sideplane_exp_prcache, "sideplane_expanders_data.json")

    # Load faulty sideplane expander data from file if available
    self._faulty_sideplane_expander_dict = store.get(
        self._faulty_sideplane_expander_file_path)

    if self._faulty_sideplane_expander_dict is None:
        self._faulty_sideplane_expander_dict = {}
        store.put(
            self._faulty_sideplane_expander_dict,
            self._faulty_sideplane_expander_file_path)

    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize the base thread, internal queues and faulty fan-module cache.

    The record is read from ``fanmodule_data.json`` under the fan modules
    cache directory; if the store has no such record an empty dict is
    created and written back. Despite its ``_list`` suffix, the attribute
    holds a dict.

    Returns:
        True, always.
    """
    parent = super(RealStorFanSensor, self)
    # Base-class setup: configuration reader first, then internal msg queues
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    self._fanmodule_prcache = os.path.join(
        self.rssencl.frus, self.FAN_MODULES_DIR)

    # Persistence file location. This file stores faulty FanModule data
    self._faulty_fan_file_path = os.path.join(
        self._fanmodule_prcache, "fanmodule_data.json")

    # Load faulty Fan Module data from file if available
    self._faulty_fan_modules_list = store.get(self._faulty_fan_file_path)

    if self._faulty_fan_modules_list is None:
        self._faulty_fan_modules_list = {}
        store.put(self._faulty_fan_modules_list, self._faulty_fan_file_path)

    return True
def put(self, item):
    """Store *item* under the next tail index of the unsent-message area.

    The item's size is measured with ``sys.getsizeof`` (a shallow size: it
    does not follow references into contained objects). If ``is_full``
    reports that the item does not fit, ``_create_space`` is asked to free
    room first. Afterwards ``tail`` is advanced by one and ``current_size``
    grows by the measured size.

    Args:
        item: The message to store; written with ``pickled=False``.
    """
    size_of_item = sys.getsizeof(item)

    if self.is_full(size_of_item):
        # Adjacent literals instead of a backslash continuation inside the
        # string, which leaked the source indentation into the log message.
        logger.debug("StoreQueue, put, consul memory usage exceeded limit, "
                     "removing old message")
        self._create_space(size_of_item)

    store.put(item, f"SSPL_UNSENT_MESSAGES/{self.tail}", pickled=False)
    self.tail += 1
    self.current_size += size_of_item
    logger.debug("StoreQueue, put, current memory usage %s" % self.current_size)
def dump_to_cache(self):
    """Write service status to cache.

    The record holds both state attributes and both state-entry timestamps
    and is stored under ``{CACHE_PATH}/{self.name}``.
    """
    snapshot = dict(
        service_state=self.state,
        service_monitor_state=self._service_state,
        nonactive_enter_timestamp=self.nonactive_enter_timestamp,
        active_enter_timestamp=self.active_enter_timestamp,
    )
    cache_key = f"{CACHE_PATH}/{self.name}"
    store.put(snapshot, cache_key)
def send_json_msg(self, alert_type, encl_status):
    """Build an enclosure alert, queue it for the handler and persist state.

    Args:
        alert_type: Alert type; mapped to a severity through SeverityReader
            and remembered as ``previous_alert_type``.
        encl_status: Text used both as the alert description and as the
            ``event`` value in ``specific_info``.
    """
    severity = SeverityReader().map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    fru = self.rssencl.is_storage_fru('enclosure')
    host_name = self.os_utils.get_fqdn()

    alert_body = {
        "host_id": host_name,
        "severity": severity,
        "alert_id": alert_id,
        "alert_type": alert_type,
        "status": "update",
        "info": {
            "resource_type": self.RESOURCE_TYPE,
            "fru": fru,
            "resource_id": "0",
            "event_time": epoch_time,
            "description": encl_status,
        },
        "specific_info": {
            "event": encl_status,
        },
    }
    internal_json_msg = json.dumps(
        {"sensor_request_type": {"enclosure_alert": alert_body}})

    self.previous_alert_type = alert_type
    self._write_internal_msgQ(RealStorEnclMsgHandler.name(), internal_json_msg)

    # Persist the alert state alongside the message just queued
    self.persistent_encl_data = {
        'fault_alert': str(self.fault_alert),
        'previous_alert_type': str(self.previous_alert_type),
    }
    store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)
def __init__(self):
    """Read the size limit from config and seed missing store entries with 0.

    For each of the memory-usage, head-index and tail-index paths the stored
    value is read into the matching private attribute; when the store has no
    value, 0 is written to the store (the private attribute keeps the None
    that was read).
    """
    limit_key = f"{self.RABBITMQPROCESSOR}>{self.LIMIT_CONSUL_MEMORY}"
    self._max_size = int(Conf.get(SSPL_CONF, limit_key, 50000000))

    cache_dir = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.cache_dir_path = cache_dir

    self.SSPL_MEMORY_USAGE = os.path.join(
        self.cache_dir_path, 'SSPL_MEMORY_USAGE')
    self._current_size = store.get(self.SSPL_MEMORY_USAGE)
    if self._current_size is None:
        store.put(0, self.SSPL_MEMORY_USAGE)

    self.SSPL_MESSAGE_HEAD_INDEX = os.path.join(
        self.cache_dir_path, 'SSPL_MESSAGE_HEAD_INDEX')
    self._head = store.get(self.SSPL_MESSAGE_HEAD_INDEX)
    if self._head is None:
        store.put(0, self.SSPL_MESSAGE_HEAD_INDEX)

    self.SSPL_MESSAGE_TAIL_INDEX = os.path.join(
        self.cache_dir_path, 'SSPL_MESSAGE_TAIL_INDEX')
    self._tail = store.get(self.SSPL_MESSAGE_TAIL_INDEX)
    if self._tail is None:
        store.put(0, self.SSPL_MESSAGE_TAIL_INDEX)

    self.SSPL_UNSENT_MESSAGES = os.path.join(self.cache_dir_path, 'MESSAGES')
def _check_module_recovered(module):
    """Raise a fault_resolved alert for a module that is running again.

    Sleeps for the configured sensor polling cycle time, then, if the module
    is running and its persisted ``prev_state`` is set and differs from
    "fault_resolved", records the new state and sends a recovery alert to
    the egress processor.

    Args:
        module: Module object exposing ``name()``, ``is_running()`` and
            ``_write_internal_msgQ()``.
    """
    module_name = module.name()

    # Wait till sensor module completes few run cycle. Then
    # raise module recovery fault_resolved alert.
    polling_cycle_time = Conf.get(
        SSPL_CONF, f"{SSPL_LL_SETTING}>sensor_polling_cycle_time", 60)
    time.sleep(polling_cycle_time)

    if not module.is_running():
        return

    curr_state = "fault_resolved"
    per_data_path = os.path.join(
        module_cache_dir, f'{module_name.upper()}_{node_id}')

    # Make sure a persistent record exists before reading it back
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)

    # Check previous state before sending fault resolved alert
    module_persistent_data[module_name] = store.get(per_data_path)
    prev_state = module_persistent_data[module_name].get('prev_state')
    if not prev_state or curr_state == prev_state:
        return

    module_persistent_data[module_name] = {"prev_state": curr_state}
    store.put(module_persistent_data[module_name], per_data_path)

    specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
    info = {
        "module_name": module_name,
        "alert_type": curr_state,
        "description": f"{module_name} is recovered",
        "impact": "",
        "recommendation": "",
        "severity": "info",
        "specific_info": specific_info,
    }
    jsonMsg = ThreadMonitorMsg(info).getJson()
    module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
def __init__(self):
    """Read the size limit from config and seed missing store entries with 0.

    For each of the memory-usage, head-index and tail-index keys the stored
    value is read into the matching private attribute; when the store has no
    value, 0 is written to the store (the private attribute keeps the None
    that was read).
    """
    self._conf_reader = ConfigReader()
    configured_limit = self._conf_reader._get_value_with_default(
        self.RABBITMQPROCESSOR, self.LIMIT_CONSUL_MEMORY, 50000000)
    self._max_size = int(configured_limit)

    self._current_size = store.get("SSPL_MEMORY_USAGE")
    if self._current_size is None:
        store.put(0, "SSPL_MEMORY_USAGE")

    self._head = store.get("SSPL_MESSAGE_HEAD_INDEX")
    if self._head is None:
        store.put(0, "SSPL_MESSAGE_HEAD_INDEX")

    self._tail = store.get("SSPL_MESSAGE_TAIL_INDEX")
    if self._tail is None:
        store.put(0, "SSPL_MESSAGE_TAIL_INDEX")
def _get_msgs_for_faulty_logical_volumes(self, logical_volumes, disk_group, send_message=True):
    """Checks for health of logical volumes and returns list of messages to be
       sent to handler if there are any.

    Each volume's current health is compared with the entry recorded for its
    serial number in ``_previously_faulty_logical_volumes``. On a state
    change an alert message is created, optionally sent, and the cache is
    persisted once the message is confirmed (see the wait below).

    Args:
        logical_volumes: Iterable of dicts providing "health" and
            "serial-number" keys.
        disk_group: Passed through to ``_create_internal_msg_lvol``.
        send_message: When true, each generated message is also sent through
            ``_send_json_msg``.

    Returns:
        List of generated messages, or None when ``logical_volumes`` is
        empty/falsy (bare ``return`` below).
    """
    faulty_logical_volume_messages = []
    internal_json_msg = None
    logical_volume_health = None
    serial_number = None
    alert_type = ""
    # Flag to indicate if there is a change in _previously_faulty_logical_volumes
    state_changed = False

    # NOTE(review): returns None rather than the empty list in this case
    if not logical_volumes:
        return

    for logical_volume in logical_volumes:
        logical_volume_health = logical_volume["health"].lower()
        serial_number = logical_volume["serial-number"]

        # Check for missing and fault case
        if logical_volume_health == self.rssencl.HEALTH_FAULT:
            # Status change from Degraded ==> Fault or OK ==> Fault
            if (serial_number in self._previously_faulty_logical_volumes and \
                    self._previously_faulty_logical_volumes[serial_number]['health']=="degraded") or \
                    (serial_number not in self._previously_faulty_logical_volumes):
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_logical_volumes[serial_number] = {
                    "health": logical_volume_health, "alert_type": alert_type
                }
                state_changed = True

        # Check for degraded case
        elif logical_volume_health == self.rssencl.HEALTH_DEGRADED:
            # Status change from Fault ==> Degraded or OK ==> Degraded
            if (serial_number in self._previously_faulty_logical_volumes and \
                    self._previously_faulty_logical_volumes[serial_number]['health']=="fault") or \
                    (serial_number not in self._previously_faulty_logical_volumes):
                # Degraded is also reported with the FRU_FAULT alert type
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_logical_volumes[serial_number] = {
                    "health": logical_volume_health, "alert_type": alert_type
                }
                state_changed = True

        # Check for healthy case
        elif logical_volume_health == self.rssencl.HEALTH_OK:
            # Status change from Fault ==> OK or Degraded ==> OK
            if serial_number in self._previously_faulty_logical_volumes:
                # Send message to handler
                alert_type = self.rssencl.FRU_FAULT_RESOLVED
                del self._previously_faulty_logical_volumes[serial_number]
                state_changed = True

        if state_changed:
            # Generate the alert contents
            internal_json_msg = self._create_internal_msg_lvol(
                logical_volume, alert_type, disk_group)
            faulty_logical_volume_messages.append(internal_json_msg)

            # Send message to handler
            if send_message:
                self._send_json_msg(internal_json_msg)

            # Persist faulty Logical Volume list to file only if something is changed
            # Wait till msg is sent to rabbitmq or added in consul for resending.
            # If timed out, do not update cache and revert in-memory cache.
            # So, in next iteration change can be detected
            if self._event.wait(
                    self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.put(self._previously_faulty_logical_volumes,\
                    self._faulty_logical_volume_file_path)
            else:
                # Timed out: reload the last persisted view of the cache
                self._previously_faulty_logical_volumes = store.get(
                    self._faulty_logical_volume_file_path)

            # Reset per-volume state before examining the next volume
            state_changed = False
            alert_type = ""

    return faulty_logical_volume_messages
def tail(self, index):
    """Write *index* to the store under ``SSPL_MESSAGE_TAIL_INDEX``."""
    store.put(
        index,
        self.SSPL_MESSAGE_TAIL_INDEX,
    )
def head(self, index):
    """Write *index* to the store under ``SSPL_MESSAGE_HEAD_INDEX``."""
    store.put(
        index,
        self.SSPL_MESSAGE_HEAD_INDEX,
    )
def current_size(self, size):
    """Write *size* to the store under ``SSPL_MEMORY_USAGE``."""
    store.put(
        size,
        self.SSPL_MEMORY_USAGE,
    )
def rss_cliapi_poll_disks(self, disk):
    """Retrieve RealStor disk info using the CLI API /show/disks.

    Builds the request URL (optionally narrowed to one disk), fetches and
    parses the response, rebuilds ``self.latest_disks`` keyed by slot and
    mirrors each drive record into the persistent store. When a stored
    record's serial number or health differs from the polled drive, the old
    record is copied to ``<path>.prev`` before being replaced.

    Args:
        disk: ``self.RSS_DISK_GET_ALL`` to poll every disk; otherwise a disk
            identifier, appended to the URL only when the text after its
            first "0." consists of digits.

    Returns:
        None. Results are published via ``self.latest_disks`` and
        ``self.memcache_disks``. The method returns early when the request
        fails, the HTTP status is not OK, or the body is not valid JSON.
    """
    # make ws request
    url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKS)

    if disk != self.RSS_DISK_GET_ALL:
        diskId = disk.partition("0.")[2]
        if diskId.isdigit():
            url = f"{url}/{disk}"
    url = f"{url}/detail"

    response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

    if not response:
        logger.warn(
            f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable as ws request {url} failed"
        )
        return

    if response.status_code != self.rssencl.ws.HTTP_OK:
        # Failures against the loopback address are not logged
        if url.find(self.rssencl.ws.LOOPBACK) == -1:
            # Adjacent literals instead of a backslash continuation inside
            # the f-string, which leaked source indentation into the message.
            logger.error(
                f"{self.rssencl.LDR_R1_ENCL}:: http request {url} to poll disks failed with "
                f"err {response.status_code}")
        return

    try:
        jresponse = json.loads(response.content)
    except ValueError as badjson:
        logger.error(f"{url} returned mal-formed json:\n{badjson}")
        # Nothing to process without a parsed body. Returning here also
        # avoids the UnboundLocalError the old fall-through caused when it
        # read the never-assigned jresponse.
        return

    if jresponse:
        api_resp = self.rssencl.get_api_status(jresponse['status'])
        #logger.debug("%s api response:%d" % (url.format(),api_resp))

        if ((api_resp == -1) and
                (response.status_code == self.rssencl.ws.HTTP_OK)):
            logger.warn("/show/disks api response unavailable, "
                        "marking success as http code is 200")
            api_resp = 0

        if api_resp == 0:
            drives = jresponse['drives']

            # reset latest drive cache to build new
            self.latest_disks = {}
            self.invalidate_latest_disks_info = False

            for drive in drives:
                slot = drive.get("slot", -1)
                sn = drive.get("serial-number", "NA")
                health = drive.get("health", "NA")

                if slot != -1:
                    self.latest_disks[slot] = {
                        "serial-number": sn,
                        "health": health
                    }

                    #dump drive data to persistent cache
                    dcache_path = f"{self.disks_prcache}disk_{slot}.json"

                    # If drive is replaced, previous drive info needs
                    # to be retained in disk_<slot>.json.prev file and
                    # then only dump new data to disk_<slot>.json
                    path_exists, ret_val = store.exists(dcache_path)
                    if path_exists and ret_val == "Success":
                        prevdrive = store.get(dcache_path)

                        if prevdrive is not None:
                            prevsn = prevdrive.get("serial-number", "NA")
                            prevhealth = prevdrive.get("health", "NA")

                            if prevsn != sn or prevhealth != health:
                                # Rename path
                                store.put(store.get(dcache_path),
                                          dcache_path + ".prev")
                                store.delete(dcache_path)
                                store.put(drive, dcache_path)

                    elif not path_exists and ret_val == "Success":
                        store.put(drive, dcache_path)
                    else:
                        # Invalidate latest disks info if persistence store error encountered
                        logger.warn(
                            f"store.exists {dcache_path} return value {ret_val}"
                        )
                        self.invalidate_latest_disks_info = True
                        break

            if self.invalidate_latest_disks_info is True:
                # Reset latest disks info
                self.latest_disks = {}

        #If no in-memory cache, build from persistent cache
        if not self.memcache_disks:
            self._rss_build_disk_cache_from_persistent_cache()

        # if no memory cache still
        if not self.memcache_disks:
            self.memcache_disks = self.latest_disks
def _get_msgs_for_faulty_psus(self, psus, send_message = True):
    """Checks for health of psus and returns list of messages to be
       sent to handler if there are any.

    Each PSU's current health is compared with the entry recorded for its
    durable id in ``_previously_faulty_psus``. On a state change an alert
    message is created, optionally sent, and the cache is persisted once the
    message is confirmed (see the wait below).

    Args:
        psus: Iterable of dicts providing "health", "durable-id" and
            "health-reason" keys.
        send_message: When true, each generated message is also sent through
            ``_send_json_msg``.

    Returns:
        List of generated messages, or None when ``psus`` is empty/falsy
        (bare ``return`` below).
    """
    self._log_debug(
        f"RealStorPSUSensor._get_msgs_for_faulty_psus -> {psus} {send_message}")
    faulty_psu_messages = []
    internal_json_msg = None
    psu_health = None
    durable_id = None
    alert_type = ""
    # Flag to indicate if there is a change in _previously_faulty_psus
    state_changed = False

    # NOTE(review): returns None rather than the empty list in this case
    if not psus:
        return

    for psu in psus:
        psu_health = psu["health"].lower()
        durable_id = psu["durable-id"]
        psu_health_reason = psu["health-reason"]

        # Check for missing and fault case
        if psu_health == self.rssencl.HEALTH_FAULT:
            self._log_debug("Found fault in PSU {0}".format(durable_id))
            alert_type = self.rssencl.FRU_FAULT
            # Check for removal
            if self._check_if_psu_not_installed(psu_health_reason):
                alert_type = self.rssencl.FRU_MISSING
            # Changed unless the same alert type is already recorded for this PSU
            state_changed = not (durable_id in self._previously_faulty_psus and
                    self._previously_faulty_psus[durable_id]["alert_type"] == alert_type)
            if state_changed:
                self._previously_faulty_psus[durable_id] = {
                    "health": psu_health, "alert_type": alert_type}
                internal_json_msg = self._create_internal_msg(
                    psu, alert_type)
                faulty_psu_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)

        # Check for degraded case
        elif psu_health == self.rssencl.HEALTH_DEGRADED:
            self._log_debug("Found degraded in PSU {0}".format(durable_id))
            # Only a PSU with no recorded entry counts as a change here
            state_changed = durable_id not in self._previously_faulty_psus
            if state_changed:
                # Degraded is also reported with the FRU_FAULT alert type
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_psus[durable_id] = {
                    "health": psu_health, "alert_type": alert_type}
                internal_json_msg = self._create_internal_msg(
                    psu, alert_type)
                faulty_psu_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)

        # Check for healthy case
        elif psu_health == self.rssencl.HEALTH_OK:
            self._log_debug("Found ok in PSU {0}".format(durable_id))
            state_changed = durable_id in self._previously_faulty_psus
            if state_changed:
                # Send message to handler
                # NOTE(review): the recovery message is only built (and added
                # to the returned list) when send_message is true - confirm
                # this is intended; the other branches build it regardless.
                if send_message:
                    previous_alert_type = \
                        self._previously_faulty_psus[durable_id]["alert_type"]
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    # A previously missing PSU is reported as an insertion
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(
                        psu, alert_type)
                    faulty_psu_messages.append(internal_json_msg)
                    if send_message:
                        self._send_json_msg(internal_json_msg)
                del self._previously_faulty_psus[durable_id]

        # Persist faulty PSU list to file only if something is changed
        if state_changed:
            # Wait till msg is sent to message bus or added in consul for resending.
            # If timed out, do not update cache and revert in-memory cache.
            # So, in next iteration change can be detected
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.put(self._previously_faulty_psus,\
                    self._faulty_psu_file_path)
            else:
                # Timed out: reload the last persisted view of the cache
                self._previously_faulty_psus = store.get(self._faulty_psu_file_path)
            # Reset per-PSU state before examining the next PSU
            state_changed = False
            alert_type = ""

    return faulty_psu_messages
def execute_thread(module, msgQlist, conf_reader, product, resume=True):
    """
    Run module as a thread. Recover the module if any error during
    initialization and run time of the module.

    If recovery count>0,
        module will be recovered from failure until the maximum recovery
        attempt. If not recoverable, corresponding module will be shutdown
        and failure alert will be raised due to its impact.
    If recovery count=0,
        no recovery attempt will be made.

    Args:
        module: Module object exposing ``name()``, ``suspend()``,
            ``start_thread()``, ``impact()``, ``shutdown()``,
            ``is_running()`` and ``_write_internal_msgQ()``.
        msgQlist: Passed to ``module.start_thread``.
        conf_reader: Passed to ``module.start_thread``.
        product: Passed to ``module.start_thread``.
        resume: When False the module is suspended before it is started.
    """
    module_name = module.name()

    # Suspend module threads
    if resume == False:
        module.suspend()

    # Initialize persistent cache for sensor status
    per_data_path = os.path.join(
        module_cache_dir, f"{module_name.upper()}_{node_id}")
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)

    # Only SensorThread modules get a recovery configuration; every other
    # module keeps recovery_count == 0 and therefore gets a single attempt.
    is_sensor_thread = False
    recovery_count = recovery_interval = 0
    if isinstance(module, SensorThread):
        recovery_count, recovery_interval = _get_recovery_config(module_name)
        is_sensor_thread = True

    attempt = 0

    while attempt <= recovery_count:
        attempt += 1
        try:
            # Each module is passed a reference list to message queues so it
            # can transmit internal messages to other modules as desired
            module.start_thread(conf_reader, msgQlist, product)
        except Exception as err:
            curr_state = "fault"
            err_msg = f"{module_name}, {err}"
            logger.error(err_msg)
            if attempt > recovery_count:
                # Last attempt failed: report the module as unrecoverable
                logger.debug(traceback.format_exc())
                description = f"{module_name} is stopped and unrecoverable. {err_msg}"
                impact = module.impact()
                recommendation = "Restart SSPL service"
                logger.critical(
                    f"{description}. Impact: {impact} Recommendation: {recommendation}")
                # Check previous state of the module and send fault alert
                if os.path.isfile(per_data_path):
                    module_persistent_data[module_name] = store.get(per_data_path)
                prev_state = module_persistent_data[module_name].get('prev_state')
                # Alert only for sensor threads, and only on a state change
                if is_sensor_thread and curr_state != prev_state:
                    module_persistent_data[module_name] = {"prev_state": curr_state}
                    store.put(module_persistent_data[module_name], per_data_path)
                    specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
                    info = {
                        "module_name": module_name,
                        "alert_type": curr_state,
                        "description": description,
                        "impact": impact,
                        "recommendation": recommendation,
                        "severity": "critical",
                        "specific_info": specific_info
                    }
                    jsonMsg = ThreadMonitorMsg(info).getJson()
                    module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
            else:
                logger.debug(f"Recovering {module_name} from failure, "
                             f"attempt: {attempt}")
                time.sleep(recovery_interval)

            # Shutdown if no recovery attempt
            logger.info(f"Terminating monitoring thread {module_name}")
            module.shutdown()
            # Re-issue shutdown up to 5 more times, 2 seconds apart, while
            # the module still reports that it is running
            retry = 5
            while module.is_running():
                module.shutdown()
                retry -= 1
                if not retry:
                    break
                time.sleep(2)
def _check_for_fan_module_fault(self):
    """Iterates over fan modules list. maintains a dictionary in order to
       keep track of previous health of the FRU in order to set
       alert_type

    For every fan module whose state differs from the entry recorded in
    ``_faulty_fan_modules_list`` (a dict keyed by lower-cased durable id,
    holding the last alert type), an alert message is created and sent, and
    the dict is persisted once the message is confirmed. Any exception
    raised while processing is logged and ends the pass.
    """

    self._fan_modules_list = self._get_fan_modules_list()
    alert_type = None

    if not self._fan_modules_list:
        return

    try:
        for fan_module in self._fan_modules_list:
            fru_status = fan_module.get("health").lower()
            durable_id = fan_module.get("durable-id").lower()
            health_reason = fan_module.get("health-reason").lower()

            # NOTE(review): a truthy _check_if_fan_module_is_installed()
            # result leads to FRU_MISSING here, so the helper presumably
            # detects the "not installed" reason - confirm against the helper.
            if fru_status == self.rssencl.HEALTH_FAULT and \
                self._check_if_fan_module_is_installed(health_reason):
                if durable_id not in self._faulty_fan_modules_list:
                    alert_type = self.rssencl.FRU_MISSING
                    self._faulty_fan_modules_list[durable_id] = alert_type
                else:
                    prev_alert_type = self._faulty_fan_modules_list[
                        durable_id]
                    # Re-alert only when the recorded alert type differs
                    if prev_alert_type != self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_MISSING
                        self._faulty_fan_modules_list[
                            durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_FAULT or \
                     fru_status == self.rssencl.HEALTH_DEGRADED:
                if durable_id not in self._faulty_fan_modules_list:
                    alert_type = self.rssencl.FRU_FAULT
                    self._faulty_fan_modules_list[durable_id] = alert_type
                else:
                    prev_alert_type = self._faulty_fan_modules_list[
                        durable_id]
                    # Re-alert only when the recorded alert type differs
                    if prev_alert_type != self.rssencl.FRU_FAULT:
                        alert_type = self.rssencl.FRU_FAULT
                        self._faulty_fan_modules_list[
                            durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_OK:
                if durable_id in self._faulty_fan_modules_list:
                    prev_alert_type = \
                        self._faulty_fan_modules_list[durable_id]
                    # A previously missing module is reported as an insertion
                    if prev_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    else:
                        alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    del self._faulty_fan_modules_list[durable_id]

            # Persist faulty Fan Module list to file only if there is any
            # type of alert generated
            if alert_type:
                internal_json_message = \
                    self._create_internal_json_msg(fan_module, alert_type)
                self._send_json_message(internal_json_message)
                # Wait till msg is sent to message bus or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(
                        self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._faulty_fan_modules_list,\
                        self._faulty_fan_file_path)
                else:
                    # Timed out: reload the last persisted view of the cache
                    self._faulty_fan_modules_list = store.get(
                        self._faulty_fan_file_path)
                # Reset before examining the next fan module
                alert_type = None
    except Exception as e:
        logger.exception(e)
def get_system_status(self):
    """Retrieve RealStor system state info using the CLI API /show/system.

    Skips the request when the last successful poll was less than
    ``self.pollfreq`` seconds ago. On success the first system record is
    kept in ``self.memcache_system`` and its faults (under
    ``self.FAULT_KEY``) in ``self.latest_faults``. When no in-memory fault
    cache exists it is rebuilt from the persistent cache, falling back to
    the latest faults, which are then persisted.

    Returns:
        None. The method returns early when polling too frequently, when
        the request fails, when the HTTP status is not OK, when the body is
        not valid JSON, or when the system record carries no fault key.
    """
    # poll system would get invoked through multiple realstor sensors
    # with less frequency compared to configured polling frequency
    # adding check to comply with polling frequency
    elapsed = time.time() - self.poll_system_ts

    if elapsed < self.pollfreq:
        logger.warn("/show/system request came in {0} seconds,"
                    "while configured polling frequency is {1} seconds,"
                    "ignoring".format(elapsed, self.pollfreq))
        return

    system = None

    # make ws request
    url = self.build_url(self.URI_CLIAPI_SHOWSYSTEM)
    #logger.info("show system url: %s" % url)

    response = self.ws_request(url, self.ws.HTTP_GET)

    if not response:
        logger.warn("System status unavailable as ws request failed")
        return

    if response.status_code != self.ws.HTTP_OK:
        logger.info("{0}:: http request {1} polling system status failed"
                    " with http err {2}".format(self.LDR_R1_ENCL, url,
                                                response.status_code))
        return

    self.poll_system_ts = time.time()

    try:
        jresponse = json.loads(response.content)
    except ValueError as badjson:
        logger.error("%s returned mal-formed json:\n%s" % (url, badjson))
        # Nothing to process without a parsed body. Returning here also
        # avoids the UnboundLocalError the old fall-through caused when it
        # read the never-assigned jresponse.
        return

    if jresponse:
        api_resp = self.get_api_status(jresponse['status'])

        if ((api_resp == -1) and
                (response.status_code == self.ws.HTTP_OK)):
            logger.warn("/show/system api response unavailable, "
                        "marking success as http code is 200")
            api_resp = 0

        if api_resp == 0:
            system = jresponse['system'][0]
            self.memcache_system = system

        if system:
            # Check if fault exists
            if self.FAULT_KEY not in system:
                logger.debug("{0} Healthy, no faults seen".format(
                    self.LDR_R1_ENCL))
                self.latest_faults = {}
                return

            # Extract system faults
            self.latest_faults = system[self.FAULT_KEY]

            #If no in-memory fault cache built yet!
            if not self.memcache_faults:
                # build from persistent cache if available
                logger.info(
                    "No cached faults, building from persistent cache {0}"
                    .format(self.faults_persistent_cache))

                self.memcache_faults = store.get(
                    self.faults_persistent_cache)

                # still if none, build from latest faults & persist
                if not self.memcache_faults:
                    logger.info("No persistent faults cache, building "
                                "cache from latest faults")

                    self.memcache_faults = self.latest_faults

                    # On SSPL boot, run through existing faults as no cache to
                    # verify with for new faults
                    self.existing_faults = True

                    #logger.debug("existing_faults {0}".\
                    #    format(self.existing_faults))
                    store.put(self.memcache_faults,
                              self.faults_persistent_cache)
            else:
                # Reset flag as existing faults processed by now
                # and cached faults are built already
                self.existing_faults = False
        else:
            logger.error("poll system failed with err %d" % api_resp)
def _get_msgs_for_faulty_controllers(self, controllers, send_message=True):
    """Checks for health of controllers and returns list of messages to be
       sent to handler if there are any.

    Each controller's current health is compared with the entry recorded for
    its durable id in ``_previously_faulty_controllers``. On a state change
    an alert message is created, optionally sent, and the cache is persisted
    once the message is confirmed (see the wait below).

    Args:
        controllers: Iterable of dicts providing "health", "status" and
            "durable-id" keys.
        send_message: When true, each generated message is also sent through
            ``_send_json_msg``.

    Returns:
        List of generated messages, or None when ``controllers`` is
        empty/falsy (bare ``return`` below, kept for existing callers).
    """
    faulty_controller_messages = []
    internal_json_msg = None
    controller_health = None
    durable_id = None
    alert_type = ""
    # Flag to indicate if there is a change in _previously_faulty_controllers
    state_changed = False

    if not controllers:
        return

    for controller in controllers:
        controller_health = controller["health"].lower()
        controller_status = controller["status"].lower()
        durable_id = controller["durable-id"]

        # Check for missing and fault case
        if controller_health == self.rssencl.HEALTH_FAULT:
            # Status change from Degraded ==> Fault or OK ==> Fault
            if (durable_id in self._previously_faulty_controllers and \
                    self._previously_faulty_controllers[durable_id]['health']=="degraded") or \
                    (durable_id not in self._previously_faulty_controllers):
                alert_type = self.rssencl.FRU_FAULT
                # Check for removal
                if controller_status == self.rssencl.STATUS_NOTINSTALLED:
                    alert_type = self.rssencl.FRU_MISSING
                self._previously_faulty_controllers[durable_id] = {
                    "health": controller_health, "alert_type": alert_type}
                state_changed = True
                internal_json_msg = self._create_internal_msg(
                    controller, alert_type)
                faulty_controller_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)

        # Check for degraded case
        elif controller_health == self.rssencl.HEALTH_DEGRADED:
            # Status change from Fault ==> Degraded or OK ==> Degraded
            # Controller can also go into degraded state after installation as well
            # So, Degrade state can be after missing alert as well.
            if (durable_id in self._previously_faulty_controllers and \
                    self._previously_faulty_controllers[durable_id]['health']=="fault") or \
                    (durable_id not in self._previously_faulty_controllers):
                # Look up THIS controller's previous alert. The default of {}
                # keeps OK ==> Degraded from raising AttributeError when other
                # controllers are cached but this one is not, and computing it
                # per iteration stops a value left over from a previous
                # controller from triggering a bogus insertion alert.
                prev_alert_type = self._previously_faulty_controllers.get(
                    durable_id, {}).get('alert_type')

                # If prev_alert_type is missing, then the next alert type will be insertion first
                if prev_alert_type and prev_alert_type.lower() == self.rssencl.FRU_MISSING:
                    alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(
                        controller, alert_type)
                    # send the message to the handler
                    # (this insertion message is not added to the returned list)
                    if send_message:
                        self._send_json_msg(internal_json_msg)

                # And set alert_type as fault
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_controllers[durable_id] = {
                    "health": controller_health, "alert_type": alert_type}
                internal_json_msg = self._create_internal_msg(controller, alert_type)
                faulty_controller_messages.append(internal_json_msg)
                state_changed = True
                # send the message to the handler
                if send_message:
                    self._send_json_msg(internal_json_msg)

        # Check for healthy case
        elif controller_health == self.rssencl.HEALTH_OK:
            # Status change from Fault ==> OK or Degraded ==> OK
            if durable_id in self._previously_faulty_controllers:
                # Send message to handler
                # NOTE(review): the recovery message is only built (and added
                # to the returned list) when send_message is true - confirm
                # this is intended; the other branches build it regardless.
                if send_message:
                    previous_alert_type = \
                        self._previously_faulty_controllers[durable_id]["alert_type"]
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    # A previously missing controller is reported as an insertion
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(
                        controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)
                    self._send_json_msg(internal_json_msg)
                del self._previously_faulty_controllers[durable_id]
                state_changed = True

        # Persist faulty Controller list to file only if something is changed
        if state_changed:
            # Wait till msg is sent to message bus or added in consul for resending.
            # If timed out, do not update cache and revert in-memory cache.
            # So, in next iteration change can be detected
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.put(self._previously_faulty_controllers,
                          self._faulty_controller_file_path)
            else:
                # Timed out: reload the last persisted view of the cache
                self._previously_faulty_controllers = store.get(
                    self._faulty_controller_file_path)
            # Reset per-controller state before examining the next controller
            state_changed = False
            alert_type = ""

    return faulty_controller_messages
def _check_for_sideplane_expander_fault(self):
    """Iterates over sideplane expander list which has some fault.
       maintains a dictionary in order to keep track of previous
       health of the FRU, so that, alert_type can be set accordingly

    For every expander whose state differs from the entry recorded in
    ``_faulty_sideplane_expander_dict`` (keyed by lower-cased durable id,
    holding the last alert type), an alert message is created and sent, and
    the dict is persisted once the message is confirmed. An exception while
    processing one expander is logged and processing moves on to the next.
    """
    self.unhealthy_components = {}
    self._sideplane_expander_list = \
        self._get_sideplane_expander_list()

    # Recommendation text that identifies a missing (unseated) FRU. This is
    # the whitespace-normalized form of the previous split/join expression.
    missing_health = (
        "Check that all I/O modules and power supplies in the enclosure are "
        "fully seated in their slots and that their latches are locked")

    if not self._sideplane_expander_list:
        return

    for sideplane_expander in self._sideplane_expander_list:
        # Start every expander from a clean slate: an alert type left over
        # from an iteration that raised, or a recommendation read from a
        # different expander, must not influence this one.
        alert_type = None
        health_recommendation = None
        try:
            self.unhealthy_components = \
                sideplane_expander.get("unhealthy-component", [])
            fru_status = sideplane_expander.get("health").lower()
            durable_id = sideplane_expander.get("durable-id").lower()

            if self.unhealthy_components:
                health_recommendation = \
                    str(self.unhealthy_components[0]
                        ["health-recommendation"])

            # A fault that comes with a recommendation is only alerted on
            # when that recommendation is the missing-FRU text.
            # NOTE(review): a fault whose recommendation is any other text
            # raises no alert at all - confirm this is intended.
            if fru_status == self.rssencl.HEALTH_FAULT and health_recommendation:
                if missing_health in health_recommendation:
                    if durable_id not in self._faulty_sideplane_expander_dict:
                        alert_type = self.rssencl.FRU_MISSING
                        self._faulty_sideplane_expander_dict[
                            durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_FAULT:
                if durable_id not in self._faulty_sideplane_expander_dict:
                    alert_type = self.rssencl.FRU_FAULT
                    self._faulty_sideplane_expander_dict[
                        durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_OK:
                if durable_id in self._faulty_sideplane_expander_dict:
                    previous_alert_type = \
                        self._faulty_sideplane_expander_dict.get(durable_id)
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    # A previously missing expander is reported as an insertion
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    del self._faulty_sideplane_expander_dict[durable_id]

            if alert_type:
                internal_json_message = \
                    self._create_internal_json_message(
                        sideplane_expander, self.unhealthy_components,
                        alert_type)
                self._send_json_message(internal_json_message)
                # Wait till msg is sent to rabbitmq or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(
                        self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(
                        self._faulty_sideplane_expander_dict,
                        self._faulty_sideplane_expander_file_path)
                else:
                    # Timed out: reload the last persisted view of the cache
                    self._faulty_sideplane_expander_dict = store.get(
                        self._faulty_sideplane_expander_file_path)
        except Exception as ae:
            logger.exception(ae)