示例#1
0
文件: control.py 项目: ppjsand/pyteal
 def read_xml(self, xml_constants_element, trace_dict):
     '''Read the GEAR control XML.

     Walks the child elements of xml_constants_element and stores the
     recognised control values in this mapping.  Problems are reported
     through self.ruleset.parse_error() using the trace id that trace_dict
     holds for the element.

     Raises ImportError if the configured create-alert init class cannot
     be imported (after the problem has been logged and reported).
     '''
     self.trace_id = trace_dict[xml_constants_element]
     for xml_entry in xml_constants_element:
         # Keep only the part of the tag after any '}' (namespace prefix)
         entry_name = xml_entry.tag.split('}')[-1]
         get_logger().debug('Processing  {0}'.format(entry_name))

         ## Dual support
         if entry_name == GCTL_DEFAULT_CREATE_ALERT_INIT_CLASS:
             # supported by both, so don't need to check
             if 'class' not in xml_entry.attrib:
                 self.ruleset.parse_error(self.trace_id[0], 'default_create_alert_init_class element requires a \'class\' attribute')
             tmp_class = xml_entry.attrib['class']
             try:
                 module_name, class_name = tmp_class.rsplit('.', 1)
                 module = __import__(module_name, globals(), locals(), [class_name])
             except ImportError as ie:
                 get_logger().error(ie)
                 self.ruleset.parse_error(self.trace_id[0], 'gear control unable to load specified create alert init class: {0}'.format(tmp_class))
                 # Bare raise keeps the original traceback
                 raise
             self[GCTL_DEFAULT_CREATE_ALERT_INIT_CLASS] = getattr(module, class_name)

         ## Event analysis support
         elif entry_name == GCTL_DEFAULT_EVENT_COMP:
             if not self.ruleset.event_input:
                 self.ruleset.parse_error(self.trace_id[0], 'gear control element \'{0}\' is not supported for this analyzer'.format(GCTL_DEFAULT_EVENT_COMP))
             # Use get() so that an absent attribute reaches the check below
             # instead of failing with a KeyError first
             raw_value = xml_entry.attrib.get('value')
             event_comp = raw_value.strip() if raw_value is not None else None
             self[GCTL_DEFAULT_EVENT_COMP] = event_comp
             if event_comp is None or len(event_comp) == 0:
                 self.ruleset.parse_error(self.trace_id[0], 'gear_control element is missing \'value\' attribute')
示例#2
0
文件: pnsd.py 项目: ppjsand/pyteal
def parse_event(errm_env):
    ''' Log a PNSD event that was directly monitored by the node.

    errm_env is queried with get() for ERRM_RSRC_NAME, ERRM_TIME,
    ERRM_NODE_NAME and ERRM_VALUE.  Events whose resource name is not the
    PNSD statistics sensor are reported with a warning and ignored.
    '''
    rsrc_name = errm_env.get('ERRM_RSRC_NAME')

    # Only events fired by our own sensor are of interest
    if rsrc_name != PNSD_STAT_SENSOR:
        get_logger().warn('Unknown resource name: {0}'.format(rsrc_name))
        return

    # RMC supplies the time in 'sec,usec' format; only the seconds are used.
    # Without ERRM_TIME, None is handed to time.localtime().
    errm_time = errm_env.get('ERRM_TIME', None)
    sec = long(errm_time.split(',')[0]) if errm_time is not None else None
    time_occurred = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(sec))

    # Location codes: the source is built from node and sensor name,
    # the reporting location is left empty
    node_name = errm_env.get('ERRM_NODE_NAME')
    src = ('PNSD', 'A', '{0}##{1}'.format(node_name, rsrc_name))
    rpt = (None, None, None)

    # The retransmit percentage is saved as the error data;
    # raw data format 0 means no special formatting is required
    log_event(PNSD_RETRANSMIT_THRESHOLD, time_occurred, src, rpt, 0, errm_env.get('ERRM_VALUE'))
示例#3
0
文件: event.py 项目: ppjsand/pyteal
 def match(self, event_id, src_comp, src_loc, rpt_loc, scope):
     '''Check if the event matches the passed in values.

     A None passed in means wildcard ... i.e. assume it matches.

     event_id and src_comp are compared for equality with the event's own
     values.  src_loc and rpt_loc are asked, through their match() method,
     whether they match the event's locations at the given scope.  An
     event without a reporting location never matches a non-None rpt_loc.

     Returns True when every supplied criterion matches, otherwise False.
     An error raised while comparing is logged and treated as no match.
     '''
     try:
         if event_id is not None and event_id != self.get_event_id():
             return False
         if src_comp is not None and src_comp != self.get_src_comp():
             return False
         if src_loc is not None and not src_loc.match(self.get_src_loc(), scope):
             return False
         if rpt_loc is not None:
             event_rpt_loc = self.get_rpt_loc()
             if event_rpt_loc is None:
                 return False
             if not rpt_loc.match(event_rpt_loc, scope):
                 return False
     except Exception:
         # Exception rather than BaseException, so that KeyboardInterrupt
         # and SystemExit are not turned into a silent "no match"
         get_logger().exception('Event {0}: Match failed'.format(self.brief_str()))
         return False
     return True
示例#4
0
 def __init__(self, name, inQueue, outQueue, config_dict=None, number=0):
     '''Create a GEAR event analyzer.

     The engine is created first because its checkpoint is handed to the
     EventAnalyzer initializer.
     '''
     get_logger().debug('Creating GEAR event analyzer named {0}'.format(name))

     self.engine = engine_factory(name,
                                  config_dict,
                                  event_input=True,
                                  number=number,
                                  send_alert=self.send_alert)
     EventAnalyzer.__init__(self,
                            name,
                            inQueue,
                            outQueue,
                            config_dict,
                            number,
                            checkpoint=self.engine.checkpoint)
示例#5
0
    def __init__(self,
                 name,
                 inEventQueue,
                 outQueue,
                 config_dict=None,
                 number=0,
                 checkpoint=None):
        '''Initialize the analyzer: alert attributes, ring buffer and event list.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue,
                               config_dict, number, checkpoint)

        self.severity = "F"
        self.recommendation = "Diagnose the fatal RAS event that ended the job. "

        # Fixed-size ring buffer to remember jobs ending due to fatal errors;
        # every slot starts out empty
        ring_size = 1024
        self.ring = [None] * ring_size

        # The list of ras events with END_JOB control action
        self.msgIDs, self.msgidService = get_eventList()

        for listed_id in self.msgIDs:
            registry.get_logger().debug('msgId = ' + listed_id)
    def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
        '''Initialize the analyzer: alert attributes, event list and count queries.

        The query strings keep the LOC, SN, MYTIME and PERIOD place holders
        as literal text; nothing in this method substitutes them.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)

        self.severity = "W"
        self.recommendation = "Diagnose the problem that caused the threshold to be reached or exceeded. "
        self.alert_id = 'THRESH01'

        # The list of ras events that have threshold counts
        (self.msgIDs,
         self.msgidService,
         self.msgidCount,
         self.msgidPeriod) = get_eventList()

        for listed_id in self.msgIDs:
            registry.get_logger().debug('msgId = ' + listed_id)

        # Building blocks shared by the four queries
        eventTable = self.appendSchema('tbgqeventlog')
        select_rows = "select count(*) from " + eventTable
        select_sum = "select sum(bigint(count)) from " + eventTable
        up_to_time = " where msg_id = ? and location LOC and serialnumber SN and event_time <= ?"
        within_period = " and event_time > (timestamp('MYTIME') - PERIOD)"

        # Queries for count with no period specified
        self.count_query = select_rows + up_to_time
        self.count_query2 = select_sum + up_to_time

        # Queries for count exceeded with period specified
        self.period_query = select_rows + up_to_time + within_period
        self.period_query2 = select_sum + up_to_time + within_period

        # The number of consecutive periods that the threshold has to exceed
        self.msgidConsecutivePeriods = {'0008002F': 7,
                                        '00080030': 3}
示例#7
0
    def process_alert(self, alert):
        ''' Send an alert via email to the requested recipients.

        Subject and body come from the listener's templates, filled in from
        the alert's dictionary form.  Delivery is best effort: any failure
        is logged as a warning and not raised to the caller.
        '''
        alert_dict = alert.write_to_dictionary()

        msg = MIMEText(self.body_template.safe_substitute(alert_dict))
        msg['Subject'] = self.subj_template.safe_substitute(alert_dict)
        msg['Date'] = alert_dict[ALERT_ATTR_CREATION_TIME].strftime('%a, %d %b %Y %H:%M:%S -0000')
        msg['From'] = self.sender
        msg['To'] = self.receivers

        try:
            # Create the proper SMTP instance and connect to the server
            server = smtplib.SMTP()
            server.set_debuglevel(self.debug_level)
            server.connect(self.server)

            try:
                # Login, if required
                if self.uid:
                    server.login(self.uid, self.password)

                # Send the alert to the intended recipients
                server.sendmail(self.sender, self.receivers.split(','), msg.as_string())
            except Exception:
                # Release the connection; the original error is reported below
                server.close()
                raise

            # Disconnect from the server
            server.quit()
        except Exception as e:
            get_logger().warn("Failed to send alert({0}) via SMTP: {1}".format(alert_dict[ALERT_ATTR_REC_ID], e))
示例#8
0
 def notifyFromEventQ(self, item):
     '''Let an incoming queue item process itself against this object.

     Returns whatever item.process(self, None) returns.
     '''
     get_logger().debug('Notify called with {0}'.format(str(item)))
     # NOTE: taking self.notify_lock around this call is currently disabled
     return item.process(self, None)
示例#9
0
文件: alert.py 项目: ppjsand/pyteal
 def _read_from_dictionary(self, in_dict):
     '''Set the attributes of Alert from information from a dictionary'''
     try:
         self.supresses.clear()
         self.condition_events.clear()
         for key in in_dict:
             value = in_dict[key]
             if key == ALERT_ATTR_ALERT_ID:
                 # Alert id is set on the allocate call only
                 # If in dictionary ignore it
                 pass
             elif key == ALERT_ATTR_CREATION_TIME:
                 self.creation_time = value
             elif key == ALERT_ATTR_SEVERITY:
                 self.severity = value
             elif key == ALERT_ATTR_URGENCY:
                 self.urgency = value
             elif key == ALERT_ATTR_EVENT_LOC:
                 try:
                     if value is not None:
                         self.event_loc = Location(in_dict[ALERT_ATTR_EVENT_LOC_TYPE],value)
                 except BaseException, e:
                     get_logger().warning('Error processing location: value is %s', (value,))
                     raise
             elif key == ALERT_ATTR_EVENT_LOC_TYPE:
                 pass # Used to build Location above
             elif key == ALERT_ATTR_EVENT_LOC_OBJECT:
                 self.event_loc = value
示例#10
0
文件: alert.py 项目: ppjsand/pyteal
def create_teal_alert(alert_id, reason, raw_data, src_name='TEAL', severity='I',
                      urgency='N', loc_instance=None, recommendation='Contact next level of support',
                      disable_dup=False):
    ''' Create a TEAL alert.

          The parameters are used to build the alert initialization
          dictionary, which is then handed together with alert_id and
          disable_dup to the alert manager's create_and_deliver_alert().
          The event location object is obtained from the location service
          for loc_instance.
    '''
    get_logger().debug('Creating {0} alert'.format(src_name))

    # Everything the alert is initialized with comes straight from the arguments
    alert_dict = {
        ALERT_ATTR_SEVERITY: severity,
        ALERT_ATTR_URGENCY: urgency,
        ALERT_ATTR_RECOMMENDATION: recommendation,
        ALERT_ATTR_REASON: reason,
        ALERT_ATTR_RAW_DATA: raw_data,
        ALERT_ATTR_SRC_NAME: src_name,
    }

    location_service = registry.get_service(SERVICE_LOCATION)
    alert_dict[ALERT_ATTR_EVENT_LOC_OBJECT] = location_service.get_teal_location(loc_instance)

    alert_manager = registry.get_service(SERVICE_ALERT_MGR)
    alert_manager.create_and_deliver_alert(alert_id, alert_dict, disable_dup=disable_dup)
示例#11
0
    def __init__(self, location_id, data):
        ''' Constructor.

        Looks up the location description for location_id in the location
        service, splits data on that description's separator and validates
        the resulting location code.

        When that fails, the behaviour depends on the environment variable
        named by TEAL_LOCATION_VALIDATION (default 'LOG'): 'LOG' logs the
        failure, 'IMMEDIATE' re-raises it.  Unless re-raised, the instance
        is kept in an unprocessable state: the failure is remembered and
        the public operations are rebound to their *_UNPROCESSABLE
        counterparts.

        Raises TypeError when data is not a string.
        '''
        if not(isinstance(data, str) or isinstance(data, unicode)):
            raise TypeError("Invalid type of Location data: {0}.".format(type(data)))

        loc_service = registry.get_service(SERVICE_LOCATION)
        try:
            self.location_info = loc_service[location_id]
            self.location_code = data.split(self.location_info.separator)

            # Location code is initialized, now validate it
            self._validate_location_code()
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not converted into an unprocessable location
            tmp_env = os.environ.get(TEAL_LOCATION_VALIDATION, 'LOG').upper()
            if tmp_env == 'LOG':
                get_logger().exception('LOGGING Location creation failure and continuing processing')
            elif tmp_env == 'IMMEDIATE':
                raise

            # Remember what was asked for and why it failed
            self.loc_id = location_id
            self.data = data

            self.ex_type, self.ex_value = sys.exc_info()[:2]

            # Rebind the operations to their unprocessable variants
            self.is_unprocessable = self.is_unprocessable_UNPROCESSABLE
            self.new_location_by_scope = self._UNPROCESSABLE
            self.get_comp_value = self._UNPROCESSABLE
            self.get_substitution_dict = self._UNPROCESSABLE
            self.get_location = self.get_location_UNPROCESSABLE
            self.str_impl = self.str_impl_UNPROCESSABLE
            self.match = self.match_UNPROCESSABLE
            self.get_id = self.get_id_UNPROCESSABLE
示例#12
0
 def delete(self):
     ''' Mark the checkpoint as deleted.

     Unregisters the checkpoint from its manager, resets the in-memory
     state to deleted and, when the manager uses the database, deletes
     the checkpoint from the checkpoint table.

     A failure of the database delete is logged and then re-raised.
     '''
     if self.checkpoint_mgr.use_db == True:
         with self.lock:
             self.changed = False    # Stop any updates
             self.checkpoint_mgr.unregister_event_checkpoint(self)
     else:
         self.checkpoint_mgr.unregister_event_checkpoint(self)

     # Change in memory values
     self.status = CHECKPOINT_STATUS_DELETED
     self.start_rec_id = None
     self.data = None
     self.starting_cb = None
     self.set_status = self.set_status_DELETED
     self.set_checkpoint = self.set_checkpoint_DELETED

     # Delete from DB
     if self.checkpoint_mgr.use_db == True:
         try:
             dbi = get_service(SERVICE_DB_INTERFACE)
             cnxn = dbi.get_connection()
             try:
                 cursor = cnxn.cursor()
                 # parms must be a one-element tuple; '(self.name)' without
                 # the trailing comma is just the string itself
                 dbi.delete(cursor, db_interface.TABLE_CHECKPOINT,
                            where='${0} = ?'.format(EVENT_CPF_NAME),
                            where_fields=[EVENT_CPF_NAME],
                            parms=(self.name,))
                 cnxn.commit()
             finally:
                 # Release the connection even when the delete or commit fails
                 cnxn.close()
         except:
             get_logger().exception('Unable to delete event checkpoint named {0}'.format(self.name))
             raise
示例#13
0
 def perform_sparing(self, rec_id, location, rawdata):
     '''Log a RAS event to have the BQL lane spared.

     The mask and register are taken from the message details in rawdata:
     up to 5 characters after 'Mask=' and up to 3 characters after
     'Register='.  When a key is absent, '0x000' and 'C23' are used.
     The event is logged by running the mc_server_log_ras command.
     '''
     mask_key = 'Mask='
     register_key = 'Register='

     # Extract the mask from the message details
     mask_pos = rawdata.find(mask_key)
     if mask_pos < 0:
         mask = '0x000'
     else:
         mask_start = mask_pos + len(mask_key)
         mask = rawdata[mask_start:mask_start + 5]

     # Extract the register from the message details
     register_pos = rawdata.find(register_key)
     if register_pos < 0:
         register = 'C23'
     else:
         register_start = register_pos + len(register_key)
         register = rawdata[register_start:register_start + 3]

     # Log a ras event to have the BQL lane spared
     command = ['/bgsys/drivers/ppcfloor/sbin/mc_server_log_ras',
                '--location', location,
                '--message-id', '0x0009020D',
                '--action', 'BQL_SPARE',
                '--detail', 'Mask=' + mask,
                '--detail', 'Register=' + register,
                '--detail', 'Submitter=TEAL',
                '--detail', 'Associated_Rec_Id=' + str(rec_id)]
     registry.get_logger().debug(command)
     subprocess.call(command)
示例#14
0
    def get_generator(self, config_dict):
        """ Return the appropriate SQL generator
        based on the configuration information retrieved

        The visible part switches the db_interface table names to the xCAT
        ones and opens the configuration file '<prefix>cfgloc' in the
        'xcat' directory below the TEAL configuration directory.  The
        prefix comes from the environment variable named by
        TEAL_TEST_XCAT_CFGLOG_PREFIX and defaults to an empty string.

        Raises IOError (after logging it) when the file cannot be opened.
        """
        DB_CONF_PATH = "{0}/xcat".format(get_service(TEAL_CONF_DIR))
        prefix = os.environ.get(TEAL_TEST_XCAT_CFGLOG_PREFIX, "")
        DB_CONF_FILE = "{0}cfgloc".format(prefix)

        # Set xCAT table names
        db_interface.TABLE_EVENT_LOG = "x_tealeventlog"
        db_interface.TABLE_CHECKPOINT = "x_tealcheckpoint"
        db_interface.TABLE_ALERT_LOG = "x_tealalertlog"
        db_interface.TABLE_ALERT2ALERT = "x_tealalert2alert"
        db_interface.TABLE_ALERT2EVENT = "x_tealalert2event"
        db_interface.TABLE_TEMPLATE = "x_{0}"

        # Well-known path to the information.
        ds_file = "{0}/{1}".format(DB_CONF_PATH, DB_CONF_FILE)
        get_logger().debug("DB Configuration: {0}".format(ds_file))

        try:
            conf_file = open(ds_file, "r")
        except IOError as e:
            get_logger().error("Unable to open DB configuration file. {0}".format(e))
            raise
示例#15
0
    def __init__(self, config_dict, ruleset):
        '''
        Constructor
        Get initial values from the config_dict

        The initial and maximum pool durations are read from config_dict
        as integers, or set to None when not configured.  The arrival rate
        extension is taken from the ruleset specific environment variable
        when that is set, otherwise from config_dict, otherwise it is None.
        Both sources supply it as four comma separated integers: window
        minimum, window maximum, arrival rate and extension.
        '''
        dict.__init__(self)
        self.ruleset = ruleset
        self.trace_id = (0, 'poolcontrol')

        def _configured(key):
            # True when the configuration supplies a value for key
            return config_dict is not None and key in config_dict

        def _arrival_check(spec):
            # Turn the comma separated integers into an ArrivalCheckCtl
            numbers = [int(v) for v in spec.split(',')]
            return ArrivalCheckCtl(window_min=numbers[0],
                                   window_max=numbers[1],
                                   arrival_rate=numbers[2],
                                   extension=numbers[3])

        if _configured('initial_pool_duration'):
            self[GPCL_INIT_DURATION] = int(config_dict['initial_pool_duration'])
        else:
            self[GPCL_INIT_DURATION] = None

        if _configured('max_pool_duration'):
            self[GPCL_MAX_DURATION] = int(config_dict['max_pool_duration'])
        else:
            self[GPCL_MAX_DURATION] = None

        # The environment variable, when set, wins over the configuration
        env_to_check = ENV_ARRIVAL_RATE_EXTENSION.format(self.ruleset.name.upper())
        env_value = os.environ.get(env_to_check, None)
        if env_value is not None:
            self[GPCL_ARRIVAL_RATE_EXTENSION] = _arrival_check(env_value)
            get_logger().warning('Arrival Rate extension overridden using environment variable {0} with value {1}'.format(env_to_check, str(self[GPCL_ARRIVAL_RATE_EXTENSION])))
        elif _configured(CFG_ARRIVAL_RATE_EXTENSION):
            self[GPCL_ARRIVAL_RATE_EXTENSION] = _arrival_check(config_dict[CFG_ARRIVAL_RATE_EXTENSION])
            get_logger().debug('Arrival Rate extension overridden in config with value {0}'.format(str(self[GPCL_ARRIVAL_RATE_EXTENSION])))
        else:
            self[GPCL_ARRIVAL_RATE_EXTENSION] = None
示例#16
0
文件: sfp.py 项目: ppjsand/pyteal
def handle_batch_event(errm_env, remote):
    ''' Process the batch events.

    Nothing is done unless remote is true.  For a remote event the file
    name is taken from the middle field of the bracketed, comma separated
    ERRM_VALUE, the file is copied with scp from the node named by
    ERRM_NODE_NAME into /tmp, parsed with parse_batch_file() and removed.

    A failed copy, an OS error, or an ERRM_VALUE that does not have the
    expected form is logged; none of these is raised to the caller.
    '''
    if not remote:
        return

    try:
        remote_details = errm_env['ERRM_VALUE'].strip()
        # Raw string: the pattern contains backslash escapes
        m = re.match(r'\[.*,(.*),.*\]', remote_details)
        if m is None:
            get_logger().error("Unable to find batch file name in: {0}".format(remote_details))
            return
        rmt_filename = m.group(1).strip()

        # Need to copy over the event log so it can be handled
        rmt_host = errm_env['ERRM_NODE_NAME']
        rmt_file = 'hscroot@{0}:{1}'.format(rmt_host, rmt_filename)
        lcl_file = '/tmp/{0}_{1}'.format(rmt_host, os.path.basename(rmt_filename))

        subprocess.check_call(['/usr/bin/scp', rmt_file, lcl_file])

        # Parse the events that were saved on the remote system
        parse_batch_file(lcl_file)

        # All done processing events so remove the file
        os.remove(lcl_file)

    except subprocess.CalledProcessError as cpe:
        get_logger().error("Failed to copy batch file: {0}".format(cpe))
    except OSError as ose:
        get_logger().error("Failed to process batch file: {0}".format(ose))
示例#17
0
 def perform_sparing(self, rec_id, location, rawdata):
     '''Log a RAS event to have the BQL lane spared.

     The mask (up to 5 characters after 'Mask=') and the register (up to
     3 characters after 'Register=') are extracted from the message
     details in rawdata, falling back to '0x000' and 'C23' when the key
     is absent.  The event is logged by running mc_server_log_ras.
     '''
     def _detail_value(key, width, default):
         # Text of at most 'width' characters that follows key in rawdata,
         # or default when rawdata does not contain key
         found_at = rawdata.find(key)
         if found_at < 0:
             return default
         value_at = found_at + len(key)
         return rawdata[value_at:value_at + width]

     mask = _detail_value('Mask=', 5, '0x000')
     register = _detail_value('Register=', 3, 'C23')

     # Log a ras event to have the BQL lane spared
     command = ['/bgsys/drivers/ppcfloor/sbin/mc_server_log_ras']
     command.extend(['--location', location])
     command.extend(['--message-id', '0x0009020D'])
     command.extend(['--action', 'BQL_SPARE'])
     for detail in ('Mask=' + mask,
                    'Register=' + register,
                    'Submitter=TEAL',
                    'Associated_Rec_Id=' + str(rec_id)):
         command.extend(['--detail', detail])
     registry.get_logger().debug(command)
     subprocess.call(command)
示例#18
0
文件: sfp.py 项目: ppjsand/pyteal
def close_event(errm_env, event_data):
    ''' Close the alerts found for a closed event.

    The event is looked up in the event log with find_logged_event() and
    the alerts for it with find_logged_alerts(); each alert found is
    closed through the alert manager.  A failure to close one alert is
    logged as a warning and does not stop the remaining ones.
    '''
    event_rec_id = find_logged_event(errm_env, event_data)
    if event_rec_id is None:
        # The event was never logged, so there is nothing to close
        return

    # No alerts may mean they were already closed out, or never logged in
    # the first place based on when this connector started listening
    alert_recids = find_logged_alerts(event_rec_id)
    if len(alert_recids) == 0:
        return

    a_mgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    for alert_recid in alert_recids:
        try:
            a_mgr.close(alert_recid)
        except alert_mgr.AlertMgrError as ame:
            get_logger().warn('Failed to close alert({0}) associated to event ({1}): {2}'.format(alert_recid, event_rec_id, ame))
示例#19
0
    def __init__(self, config_dict):
        '''Constructor.

        Validates that the monitor is enabled for realtime use, reads the
        checkpoint update frequency from the environment variable named by
        TEAL_UPDATE_CHECKPOINT_FREQUENCY (falling back to the default when
        it is unset or not a number) and imports the module of the
        notifier class.  The notifier comes from the environment variable
        named by TEAL_TEST_NOTIFIER_CONFIG or, failing that, from
        config_dict.

        Raises ConfigurationError for an unsupported 'enabled' value or a
        missing notifier, and ImportError when the notifier module cannot
        be imported.
        '''
        # Validate configuration parameters
        if config_dict['enabled'] != 'realtime':
            raise ConfigurationError('Realtime monitor can only enabled for realtime use.  Unsupported value specified: {0}'.format(config_dict['enabled']))

        temp_frequency = os.environ.get(TEAL_UPDATE_CHECKPOINT_FREQUENCY, None)
        if temp_frequency is None:
            self.update_checkpoint_frequency = DEFAULT_UPDATE_CHECKPOINT_FREQUENCY
        else:
            try:
                # int() handles arbitrarily large values; only a value that
                # is not a number is treated as invalid
                self.update_checkpoint_frequency = int(temp_frequency)
            except ValueError:
                get_logger().warning('Environment variable \'{0}\' was invalid: \'{1}\'. Default value used'.format(TEAL_UPDATE_CHECKPOINT_FREQUENCY, str(temp_frequency)))
                self.update_checkpoint_frequency = DEFAULT_UPDATE_CHECKPOINT_FREQUENCY

        cfg_notifier = os.environ.get(TEAL_TEST_NOTIFIER_CONFIG, None)
        if cfg_notifier is None:
            if CFG_KEY_NOTIFIER not in config_dict:
                raise ConfigurationError('RealtimeMonitor requires notifier be specified in the configuration file or as an environment variable')
            else:
                cfg_notifier = config_dict[CFG_KEY_NOTIFIER]
        # create notifier class
        try:
            module_name, class_name = cfg_notifier.rsplit('.', 1)
            module = __import__(module_name, globals(), locals(), [class_name])
        except ImportError as ie:
            get_logger().error(ie)
            raise # throw the ImportError up the chain
示例#20
0
 def _log_event(self, db, cursor, ll_event):
     ''' Log an LL event into the TEAL event log.

     The LL event is a combination of common event data and LL specific
     data: the common part is translated and inserted into the event log,
     the LL specific part is inserted as dependent extended data.
     '''
     # Common part, translated into the TEAL format
     teal_event = self._translate_event(ll_event)
     db.insert(cursor, LL_TEAL_COLS, db_interface.TABLE_EVENT_LOG, teal_event)

     # Rules assume detail is provided, so a missing one becomes an empty string
     detail = ll_event[LL_EVENT_COL_DETAIL]

     # LL extended data
     extended_columns = (LL_EVENT_COL_TIME_OCCURRED,
                         LL_EVENT_COL_TIME_LOGGED,
                         LL_EVENT_COL_MSG_TYPE,
                         LL_EVENT_COL_MESSAGE)
     ll_extended_data = [ll_event[column] for column in extended_columns]
     ll_extended_data.append('' if detail is None else detail)

     db.insert_dependent(cursor,
                         LL_TEAL_EXTENDED_PK,
                         LL_TEAL_EXTENDED_COLS,
                         LL_TEAL_EXTDATA_TABLE,
                         ll_extended_data)

     registry.get_logger().debug("Logged event [{0},{1},{2}]".format(
         ll_event[LL_EVENT_COL_TIME_OCCURRED],
         ll_event[LL_EVENT_COL_EVENT_ID],
         ll_event[LL_EVENT_COL_NODE].strip()))
示例#21
0
 
 def resolve_and_validate(self):
     '''Resolve and validate the evaluatable'''
     self.src_name = self.ruleset.name
     try:
         # OK to set Rule to None, since not using any of the support the requires
         resolve_and_validate_rule_values(self, self.ruleset, None, GEHD_HANDLER_ALERT, rule_part=GRUL_PART_ACTION)
     except XMLParsingError as e:
         self.ruleset.parse_error(self.trace_id[0], '\'on_error\' element {0}'.format(e.msg))
         
     self.trace_id = (self.trace_id[0], self.trace_id[1] + '-' + self.type.get_value())
     
     # Process init class
     if self.init_class.is_set() == False:
         self.init_class_callable = self.ruleset['gear_control'][GCTL_DEFAULT_CREATE_ALERT_INIT_CLASS]
     else:
         try:
             module_name, class_name = self.init_class.get_value().rsplit('.', 1)
             module = __import__(module_name, globals(), locals(), [class_name])
         except ImportError, ie:
             get_logger().error(ie)
             self.ruleset.parse_error(self.trace_id[0], 'gear create alert unable to load specified init class: {0}'.format(self.init_class))
             raise ie
         self.init_class_callable = getattr(module, class_name)
         
         tmp_instance = self.init_class_callable()
         if isinstance(tmp_instance, ExtInitAlert) == False:
示例#22
0
文件: ruleset.py 项目: ppjsand/pyteal
 def flush(self, flush_time):
     ''' Flush the ruleset's event pool with the given flush time.

     An IncidentPoolStateTransitionError from the pool is not an error
     here; it is only noted in the log.
     '''
     try:
         self.event_pool.flush(flush_time)
     except IncidentPoolStateTransitionError:
         get_logger().info('Tried to Flush a closed pool')
示例#23
0
 def get_active_sections(self, area, runmode=None, name_required=True, singleton=False):
     '''Get a sorted list of (section, name) tuples of the active sections of an area.

     Section names have the form 'area' or 'area.name'.  When runmode is
     given and a section has an 'enabled' option, the section is skipped
     if that option is 'false' or names the other run mode; any value
     other than 'all', 'false', 'realtime' or 'historic' raises a
     ConfigurationError.

     A section without a name part gets None as its name, or raises a
     ConfigurationError when name_required is set.  With singleton set, a
     second section for the area raises a ConfigurationError.
     '''
     active = []
     for section in self.sections():
         # Split into config area and entry name on the first '.'
         section_area, dot, entry_name = section.partition('.')
         if section_area != area:
             continue

         if runmode is not None and self.has_option(section, 'enabled'):
             enabled_val = self.get(section, 'enabled')
             if enabled_val != 'all':
                 disabled = (enabled_val == 'false'
                             or (enabled_val == 'realtime' and runmode != RUN_MODE_REALTIME)
                             or (enabled_val == 'historic' and runmode != RUN_MODE_HISTORIC))
                 if disabled:
                     get_logger().debug('Skipping section \'{0}\' with enabled set to \'{1}\''.format(section, enabled_val))
                     continue
                 if enabled_val not in ['realtime', 'historic']:
                     raise ConfigurationError('Configuration section \'{0}\' has an unrecognized value for enabled keyword: \'{1}\''.format(section, enabled_val))

         if not dot:
             # The section is just 'area', so it has no entry name
             if name_required == True:
                 raise ConfigurationError('Configuration sections for \'{0}\' must have a name, but none was specified'.format(area))
             entry_name = None

         if singleton == True and len(active) == 1:
             raise ConfigurationError('There can only be one section called \'{0}\''.format(area))

         active.append((section, entry_name))
     active.sort()
     return active
示例#24
0
 def _connect_to_hmc(self):
     ''' Connect to the HMC.

     The address currently in use is tried first and the other address
     second.  self.hmc_connected is set on success; self.hmc_using_addr
     is changed only when the second attempt is the one that succeeds.
     When neither works, the primary address is selected for next time.
     '''
     primary = (self.hmc_primary_addr, 'Connected to primary HMC')
     backup = (self.hmc_backup_addr, 'Connected to secondary HMC')

     if self.hmc_using_addr == self.hmc_primary_addr:
         attempts = (primary, backup)
     else:
         attempts = (backup, primary)

     for attempt_no, (addr, connected_msg) in enumerate(attempts):
         if self._try_to_connect(addr) == True:
             if attempt_no == 1:
                 # The fallback address worked, so switch over to it
                 self.hmc_using_addr = addr
             self.hmc_connected = True
             get_logger().info(connected_msg)
             return

     # Neither worked
     get_logger().info('Unable to connect to HMC')
     self.hmc_using_addr = self.hmc_primary_addr   # Start with primary next time
示例#25
0
 def process_alert(self, alert):
     ''' Put the alert on this listener's queue.

     The put is made with the second argument True.
     NOTE(review): converting the alert to a service focal point log and
     sending it is presumably done by whatever consumes self.queue; that
     is not visible here.
     '''
     get_logger().debug('In cnm_alert_listener')
     self.queue.put(alert, True)
示例#26
0
 def read_xml(self, xml_element, trace_dict):
     '''Read the pool control XML.

     Each sub-element of xml_element is handled by name: the initial and
     maximum durations are processed by _process_duration_xml(), and the
     arrival rate extension only when no value is set for it yet.  Any
     other sub-element, and an element without a recognised sub-element,
     is reported through self.ruleset.parse_error().
     '''
     self.trace_id = trace_dict[xml_element]

     # Duration entries and the text used to report a problem with each
     duration_errors = {GPCL_INIT_DURATION: 'pool control initial duration error: {0}',
                        GPCL_MAX_DURATION: 'pool control max duration error: {0}'}

     entry_found = False
     for xml_entry in xml_element:
         entry_name = xml_entry.tag.split('}')[-1]
         get_logger().debug('Processing  {0}'.format(entry_name))

         if entry_name in duration_errors:
             entry_found = True
             try:
                 self[entry_name] = self._process_duration_xml(xml_entry, self[entry_name])
             except XMLParsingError as e:
                 self.ruleset.parse_error(self.trace_id[0], duration_errors[entry_name].format(e.msg))
             continue

         if entry_name != GPCL_ARRIVAL_RATE_EXTENSION:
             self.ruleset.parse_error(self.trace_id[0], 'pool control encountered an unexpected element \'{0}\''.format(entry_name))
             continue

         entry_found = True
         # A value that is already set is not replaced by the XML
         if self[GPCL_ARRIVAL_RATE_EXTENSION] is None:
             self._process_arrival_rate_extension(xml_entry)

     if not entry_found:
         self.ruleset.parse_error(self.trace_id[0], 'pool control element must have at least one sub-element specified')
示例#27
0
 def read_from_xml(self, xml_templates_element, trace_dict):
     '''Add template info defined in an XML templates element.

     Every sub-element is expected to be a condition template with a
     'name' attribute and a contained element.  The first contained
     element is stored in the condition template dictionary under that
     name.

     NOTE(review): errors are reported through self.context except for an
     unexpected attribute, which uses self.ruleset -- confirm that both
     are intended.
     '''
     self.trace_id = trace_dict[xml_templates_element]
     for template_entry in xml_templates_element:
         # Condition template -- only one currently supported
         template_type = template_entry.tag.split('}')[-1]
         if template_type != GTPL_CONDITION:
             self.context.parse_error(self.trace_id[0], '\'templates\' element does not support the sub-element \'{0}\''.format(template_type))

         # Process attributes: only 'name' is expected
         name = None
         for att_key, att_value in template_entry.attrib.items():
             if att_key != 'name':
                 self.ruleset.parse_error(self.trace_id[0], '\'condition_template\' element encountered an unexpected attribute \'{0}\''.format(att_key))
                 continue
             name = att_value.strip()

         # Name was required
         if name is None:
             self.context.parse_error(self.trace_id[0], '\'condition_template\' element requires \'name\' attribute')

         get_logger().debug('Condition template defined with name {0}'.format(name))

         # Must have a contained element
         # NOTE(review): only "at least one" is checked, although the
         # message says "one and only one"
         if len(template_entry) < 1:
             self.context.parse_error(self.trace_id[0], 'template must contain one and only one sub-element')

         # Put in the template dictionary
         self[GTPL_CONDITION][name] = template_entry[0]
示例#28
0
文件: teal.py 项目: ppjsand/pyteal
 def alert_not_analyzed_callback(self, alert):
     ''' Pass an alert that was not handled in the alert analyzer queue on to the delivery queue.

     Anything that is not an Alert is only noted in the log as a
     processed command.
     '''
     if not isinstance(alert, Alert):
         get_logger().debug('Command {0} was processed by the Alert Analysis Queue'.format(alert.brief_str()))
         return

     get_logger().debug('Alert {0} was not analyzed in Alert Analysis Queue -- put in Delivery Queue'.format(alert.brief_str()))
     delivery_queue = registry.get_service(SERVICE_ALERT_DELIVERY_Q)
     delivery_queue.put_nowait(alert)
示例#29
0
 
 def location_error(self, event, location):
     ''' Handle a location error.

     With a location error handler configured, the handler is asked to
     create an alert for the event and location.  Without one,
     location._UNPROCESSABLE() is called first.
     '''
     no_handler = self.loc_handler is None
     if no_handler:
         get_logger().debug('No location error handler.  Raise the exception')
         # NOTE(review): presumably this raises, so the call below is not
         # reached without a handler -- confirm
         location._UNPROCESSABLE()
     self.loc_handler.create_alert(event, location)
示例#30
0
文件: sfp.py 项目: ppjsand/pyteal
def handle_event(errm_env):
    '''Parse the event carried in errm_env['ERRM_VALUE'] and act on its type.

    Filtered events are ignored.  Open events are logged, closed events are
    reported and closed, changed events are logged only when they carry a
    problem number and have not been logged before.  Any other type is
    reported with a warning and otherwise left alone.
    '''
    event_data = parse_event(errm_env['ERRM_VALUE'])
    if event_filtered(errm_env, event_data):
        return

    event_type = event_data[EVENT_TYPE]

    if event_type == EVENT_TYPE_OPEN:
        # New event -- tell TEAL about it
        log_event(errm_env, event_data)
        return

    if event_type == EVENT_TYPE_CLOSED:
        get_logger().info('{0}:{1} - {2}'.format(event_data[EVENT_PROB_NUM],
                                                 event_data[EVENT_REFCODE],
                                                 event_type))
        close_event(errm_env, event_data)
        return

    if event_type == EVENT_TYPE_CHANGED:
        # Only events with a problem number are considered.  A change for an
        # event that was never logged means the open event was missed (or
        # never sent), so it is logged now.
        if EVENT_PROB_NUM in event_data and find_logged_event(errm_env, event_data) is None:
            log_event(errm_env, event_data)
        return

    # Other event status changes are not operated on
    get_logger().warn('{0}:{1} - {2}'.format(event_data[EVENT_PROB_NUM],
                                             event_data[EVENT_REFCODE],
                                             event_type))
示例#31
0
    def get_generator(self, config_dict):
        '''Build the DB2 SQL generator described by the bg.properties file.

        The properties file is taken from config_dict['bgproperties'], then
        from the PROPERTIES_FILE and BG_PROPERTIES_FILE environment variables,
        and finally from the BG_PROPERTIES_FILE constant.  As side effects the
        RAS filter path and a ConfigService are registered with the registry
        and the db_interface table names are prefixed with the configured
        schema.  Returns an SQLGeneratorDB2 built with the DSN and, when both
        are configured, the user id and password.
        '''
        # First non-empty source wins
        properties_file = (config_dict.get('bgproperties', None)
                           or os.environ.get('PROPERTIES_FILE', None)
                           or os.environ.get('BG_PROPERTIES_FILE', None)
                           or BG_PROPERTIES_FILE)

        cfg = ConfigParser.ConfigParser()
        cfg.readfp(BgqPropertiesFile(properties_file))

        # RAS filter: fall back to the driver default when the option is absent
        try:
            ras_filter = cfg.get('ras', 'filter')
        except ConfigParser.NoOptionError:
            ras_filter = '/bgsys/drivers/ppcfloor/ras/etc/ras_environment_filter.xml'
        registry.get_logger().debug('RAS Environment filter file: ' + ras_filter)
        registry.register_service('BGQ_RAS_FILTER', ras_filter)
        registry.register_service('BGQ_CONFIG_SERVICE', ConfigService())

        # Database name is mandatory; schema, user and password are optional
        connection_args = {'dsn': cfg.get('database', 'name')}
        try:
            schema_prefix = cfg.get('database', 'schema_name') + '.'
        except ConfigParser.NoOptionError:
            schema_prefix = ''

        try:
            user = cfg.get('database', 'user')
            password = cfg.get('database', 'password')
        except ConfigParser.NoOptionError:
            registry.get_logger().debug('Database user and/or password is not specified.')
        else:
            connection_args['uid'] = user
            connection_args['pwd'] = password

        # Set the table names
        table_names = (('TABLE_EVENT_LOG', 'x_tealeventlog'),
                       ('TABLE_EVENT_LOG_EXT', 'x_tealeventlogext'),
                       ('TABLE_BG_EVENT_LOG', 'tbgqeventlog'),
                       ('TABLE_CHECKPOINT', 'x_tealcheckpoint'),
                       ('TABLE_ALERT_LOG', 'x_tealalertlog'),
                       ('TABLE_ALERT2ALERT', 'x_tealalert2alert'),
                       ('TABLE_ALERT2EVENT', 'x_tealalert2event'),
                       ('TABLE_TEMPLATE', 'x_{0}'))
        for attribute, table in table_names:
            setattr(db_interface, attribute, schema_prefix + table)

        return SQLGeneratorDB2(connection_args)
示例#32
0
 def cancel(self):
     '''Stop the underlying timer, if there is one, and drop the callback.'''
     get_logger().debug('Canceling timer')
     has_timer = self.my_timer is not None
     if has_timer:
         self.my_timer.cancel()
     # Clear the callback reference whether or not a timer existed
     self.callback = None
示例#33
0
    def _periodic_monitor(self):
        '''Run one polling iteration: sleep, refresh the position, query.

        Sleeps for self.poll_interval, re-reads the last processed event and
        then queries for every event newer than it.
        '''
        log = registry.get_logger()
        log.debug("in periodic monitor " + str(self.last_processed_event))

        # Wait for the next polling iteration
        time.sleep(self.poll_interval)

        self._get_last_processed_event()
        resume_from = self.last_processed_event
        self._query_and_log_event(">", resume_from)
示例#34
0
 def will_analyze_event(self, event):
     '''Return True when the event's id is one of self.msgIDs.

     The decision is logged at debug level together with the record id.
     '''
     event_id = event.get_event_id()
     handled = event_id in self.msgIDs
     prefix = 'matched event id ' if handled else 'not matched '
     registry.get_logger().debug(prefix + event_id + ' ' + str(event.get_rec_id()))
     return handled
def get_excludeMsgIDs():
    '''Return the configured exclude list of message IDs as a string.

    The value is read from the BGQ_TEAL_THRESHOLD_EXCLUDE_IDS option of the
    BGQ_TEAL_THRESHOLD_ANALYZER section.  Any failure to read it is logged
    at debug level and an empty string is returned, so callers can always
    do substring checks on the result.
    '''
    cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    excludeMsgIDs = ''
    try:
        excludeMsgIDs = cfg.get(BGQ_TEAL_THRESHOLD_ANALYZER,
                                BGQ_TEAL_THRESHOLD_EXCLUDE_IDS)
        registry.get_logger().debug('Exclude List = ' + excludeMsgIDs)
    except Exception as e:
        # Best effort: a missing option simply means nothing is excluded
        registry.get_logger().debug(e)
    # The value was previously computed but never handed back to the caller
    return excludeMsgIDs
示例#36
0
 def _configure(self):
     '''Set self.poll_interval from the BGQ connector configuration.

     The configured value must be an integer greater than zero.  If it is
     missing, not an integer or not positive, a warning is logged and
     BGQ_DEFAULT_POLL_INTERVAL is used instead.
     '''
     cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
     try:
         value = cfg.get(BGQ_TEAL_CONFIG, BGQ_TEAL_CONFIG_POLL_INTERVAL)
         poll_interval = int(value)
         if poll_interval <= 0:
             registry.get_logger().error('The value ' + str(poll_interval) + ' specified in the poll interval is not valid. The value must be greater than zero.')
             # Raise a real exception; a bare 'raise' has nothing to re-raise here
             raise ValueError('poll interval must be greater than zero')
         self.poll_interval = poll_interval
     except Exception:
         # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate
         registry.get_logger().warn('Configuring poll interval to default {0} seconds'.format(BGQ_DEFAULT_POLL_INTERVAL))
         self.poll_interval = BGQ_DEFAULT_POLL_INTERVAL
示例#37
0
    def rt_callback(self, recid, msgid):
        '''Handle one event reported by the real-time client.

        On the first real-time event, events newer than the last processed
        one and older than recid are queried first, to pick up anything that
        occurred before the real-time server was started.  The reported
        record itself is then queried by exact id.
        '''
        catching_up = self.first_realtime_event
        if catching_up:
            # last_processed_event is expected to be current from the
            # previous query, so it is not re-read here.
            backlog_start = self.last_processed_event
            self._query_and_log_event(">", backlog_start, recid)
            self.first_realtime_event = False

        trace = "in rt_callback " + str(recid) + " " + msgid
        registry.get_logger().debug(trace)
        self._query_and_log_event("=", recid)
示例#38
0
    def _query_and_log_event(self, query_sign, recid, max_recid=0):
        '''Query the BG event log for events and log the interesting ones into TEAL.

        query_sign -- SQL comparison operator applied to RECID; it is spliced
                      into the statement, so only pass trusted literals
                      (the visible callers use ">" and "=")
        recid      -- record id the comparison is made against
        max_recid  -- when nonzero, stop at the first row whose RECID is
                      greater than or equal to this value

        Rows whose MSG_ID is in self.msgIDs are logged via self._log_event;
        self.last_processed_event is advanced for every row read.  Work is
        committed every COMMIT_LIMIT logged events and once more at the end,
        after which the TEAL notifier (if configured) is posted.  The
        connection is always closed, even when a query or insert fails.
        '''
        registry.get_logger().debug("in _query_and_log_event")
        event_logged = False
        db = registry.get_service(registry.SERVICE_DB_INTERFACE)
        cnxn = db.get_connection()
        try:
            bgq_cursor = cnxn.cursor()
            teal_cursor = cnxn.cursor()

            # Query the BG event log for new events
            bgEvent_query = "SELECT RECID, CATEGORY, COMPONENT, JOBID, BLOCK, LOCATION, MSG_ID FROM " + db_interface.TABLE_BG_EVENT_LOG + " WHERE RECID " + query_sign + " ? ORDER BY RECID ASC"
            bgq_cursor.execute(bgEvent_query, recid)
            commit_count = 0
            for bg_event in next_row(bgq_cursor):

                # Don't process events with recids >= max_recid, if it is nonzero
                if max_recid > 0 and bg_event[0] >= max_recid:
                    break

                # Log only events we are interested in
                if bg_event[6] in self.msgIDs:
                    event_logged = True

                    # Log the event into TEAL
                    self._log_event(bg_event, teal_cursor)

                    # Commit every so often to limit the transaction size
                    commit_count += 1
                    if commit_count == COMMIT_LIMIT:
                        cnxn.commit()
                        commit_count = 0
                else:
                    registry.get_logger().debug('ignore msgid ' + bg_event[6])

                # Update the 'cursor' into the BGQ database
                self.last_processed_event = bg_event[0]

            # Notify TEAL that events have been inserted
            if event_logged:
                registry.get_logger().debug("event to log " + str(event_logged))
                cnxn.commit()

                if self.notifier:
                    self.notifier.post()
                else:
                    registry.get_logger().warn('TEAL notifier not configured.')
        finally:
            # Release the connection on every path; previously an exception
            # from the query or from logging left it open.
            cnxn.close()
        registry.get_logger().debug("exit _query_and_log_event")
    def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
        '''Initialise the base analyzer, then load the message ids to handle.

        Alerts use severity "F" and a fixed diagnose-the-hardware
        recommendation.  The message ids and their service-action texts come
        from get_eventList(); each id is logged at debug level.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)

        self.severity = "F"
        self.recommendation = "Diagnose hardware that has been placed in an error state. "

        # RAS events with 'hardware in error' control actions
        event_ids, service_actions = get_eventList()
        self.msgIDs = event_ids
        self.msgidService = service_actions

        for known_id in self.msgIDs:
            registry.get_logger().debug('msgId = ' + known_id)
示例#40
0
    def has_duplicate(self, alert_time, query, cursor):
        '''Return True when the duplicate query yields at least one row.

        The ALERT_TIME and WINDOW placeholders in query are replaced with
        alert_time and self.window_time before the query is executed on
        cursor.
        '''
        resolved = query.replace('ALERT_TIME', alert_time).replace('WINDOW', self.window_time)
        registry.get_logger().info('Duplicate query: %s', resolved)
        cursor.execute(resolved)

        # Any (truthy) first row means a matching alert exists
        return bool(cursor.fetchone())
示例#41
0
    def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
        '''Initialise the base analyzer and the BQL alert defaults.

        Sets the severity ("W"), the recommendation text and the alert id
        ('BQL01'), then reads the exclude list of message ids from the
        configuration; a failure to read it is only logged at debug level.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)

        self.alert_id = 'BQL01'
        self.severity = "W"
        self.recommendation = '''Schedule service to isolate the BQL issue.  Possible causes are environmental, cable, or a board.  Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly.  Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable.   The cable and board can be cleaned with an optics cleaning tool.  Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''

        # Exclude list of message IDs (not used further in the visible code)
        configuration = registry.get_service(registry.SERVICE_CONFIGURATION)
        excluded_ids = ''
        try:
            excluded_ids = configuration.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
            registry.get_logger().debug('Exclude List = ' + excluded_ids)
        except Exception as e:
            registry.get_logger().debug(e)
def get_eventList():
    '''Get the RAS events that have a threshold count.

    Reads the tbgqmsgtypes table and returns a 4-tuple:
      msgIDs       -- list of message ids with a threshold count, minus the
                      ids found in the configured exclude list
      msgidService -- message id -> service action / diagnostics text
      msgidCount   -- message id -> threshold count
      msgidPeriod  -- message id -> threshold period, for every row that
                      has one (the exclude list is not applied here)

    RAS events whose control action contains END_JOB, or _IN_ERROR other
    than SOFTWARE_IN_ERROR, are left out of the count query because other
    analyzers (HardwareInError, JobFatal) handle them.
    '''
    # Exclude list from the configuration file; treat "no list" as empty so
    # the substring check below cannot fail.
    excludeMsgList = get_excludeMsgIDs() or ''

    # TABLE_TEMPLATE is '<schema>.x_{0}' when a schema is configured and a
    # bare 'x_{0}' otherwise; only reuse the prefix when a schema is present.
    template = str(db_interface.TABLE_TEMPLATE)
    if '.' in template:
        msgtypesTable = template.split('.')[0] + '.' + 'tbgqmsgtypes'
    else:
        msgtypesTable = 'tbgqmsgtypes'

    count_query = "select msg_id, thresholdcount, svcaction, relevantdiags from " + msgtypesTable + " where thresholdcount is not NULL and (ctlaction is NULL or (ctlaction not like '%END_JOB%' and (ctlaction not like '%_IN_ERROR%' or ctlaction like '%SOFTWARE_IN_ERROR%')))"
    period_query = "select msg_id, thresholdperiod from " + msgtypesTable + " where thresholdperiod is not NULL"

    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    try:
        cursor = dbConn.cursor()
        cursor.execute(count_query)
        count_rows = cursor.fetchall()
        cursor.execute(period_query)
        period_rows = cursor.fetchall()
    finally:
        # All rows are fetched, so the connection is no longer needed
        dbConn.close()

    msgIDs = list()
    msgidCount = dict()
    msgidService = dict()
    for r in count_rows:
        msgid = r[0].strip()
        if msgid in excludeMsgList:
            registry.get_logger().debug(' excluding ' + msgid)
            continue
        msgIDs.append(msgid)
        msgidCount[msgid] = r[1]
        sa = 'Service action: '
        if r[2]:
            sa += r[2].strip()
        else:
            sa += "None."
        if r[3]:
            sa += ' Relevant diagnostic bucket(s): ' + r[3].strip()
        msgidService[msgid] = sa

    # Key the periods by the stripped id, like msgIDs and msgidCount
    msgidPeriod = dict()
    for r in period_rows:
        msgidPeriod[r[0].strip()] = r[1]

    return msgIDs, msgidService, msgidCount, msgidPeriod
示例#43
0
    def has_matching_blockId(self, block, alert_time, cursor):
        '''Return True when the same-block query reports a positive count.

        self.same_block_query_str is formatted with alert_time,
        self.window_time and block, executed on cursor, and the first column
        of the first row is compared against zero.
        '''
        sql = self.same_block_query_str.format(alert_time, self.window_time, block)

        registry.get_logger().info('Same block query: %s', sql)
        cursor.execute(sql)
        first = cursor.fetchone()

        # No row, or an empty row, means nothing matched
        if first is None or not len(first) > 0:
            return False

        # A nonzero count means some records have a matching block id
        return bool(first[0] > 0)
示例#44
0
    def _get_last_processed_event(self):
        '''Set self.last_processed_event to the newest record id known to TEAL.

        Reads MAX(REC_ID) from the TEAL extended event log table; when the
        table is empty the position is reset to 0.  The connection is always
        closed, even when the query fails.
        '''
        dbi = registry.get_service(registry.SERVICE_DB_INTERFACE)
        cnxn = dbi.get_connection()
        try:
            cursor = cnxn.cursor()

            # Find the last event injected into TEAL; callers then inject
            # everything that has occurred since
            maxEvent_query = "SELECT MAX(REC_ID) FROM " + db_interface.TABLE_EVENT_LOG_EXT
            cursor.execute(maxEvent_query)

            max_id = cursor.fetchone()[0]
            if max_id is None:
                self.last_processed_event = 0
            else:
                self.last_processed_event = max_id

            registry.get_logger().info('Last Processed Event = ' + str(self.last_processed_event))
        finally:
            # Previously a failing query left the connection open
            cnxn.close()
 def delta_period(self, period):
     '''Convert a period such as "5 DAYS" into a datetime.timedelta.

     period -- "<integer> <unit>" where unit is MONTH, DAY, HOUR, MINUTE,
               SECOND or MICROSECOND, singular or plural, in any case

     Returns the timedelta, or None (after logging an error) when the period
     cannot be interpreted.  timedelta has no notion of months, so a month
     is approximated as 30 days.
     '''
     unit_keywords = {'DAY': 'days',
                      'HOUR': 'hours',
                      'MINUTE': 'minutes',
                      'SECOND': 'seconds',
                      'MICROSECOND': 'microseconds'}

     parts = period.split()
     if len(parts) == 2:
         pdigit, punits = parts
         unit = punits.upper()
         if unit.endswith('S'):
             # Accept the plural form
             unit = unit[:-1]
         try:
             amount = int(pdigit)
         except ValueError:
             amount = None
         if amount is not None:
             if unit == 'MONTH':
                 # timedelta(months=...) is not a valid keyword and raised
                 # TypeError; use a 30-day approximation instead
                 return timedelta(days=30 * amount)
             keyword = unit_keywords.get(unit)
             if keyword is not None:
                 return timedelta(**{keyword: amount})

     registry.get_logger().error("No timedelta possible for " + period)
     return None
示例#46
0
    def send_common_alert(self, loc, cur_alert_recid, event, alert_time, dup_query, cursor):
        '''Close the current alert and raise a common-mode alert on the parent location.

        The new alert is allocated and committed through the alert manager
        and then placed on the alert analyzer queue, so that it is analyzed
        rather than reported through the pipeline right away.  alert_time,
        dup_query and cursor are not used here; the duplicate check has
        already been made by the time this is called.
        '''
        # The current alert is superseded by the common one
        registry.get_logger().info('Closing current alert recid %d prior to creating a common mode alert', cur_alert_recid)
        registry.get_service(SERVICE_ALERT_MGR).close(cur_alert_recid)

        # Resolve the location and its parent
        own_name = self.get_loc_name(loc)
        kind = loc.get_id()
        parent_name, _parent_parts = self.get_loc_parent(loc)
        parent_location = Location(kind, parent_name)

        # Fill in the placeholders of the alert texts
        reason = self.reason.replace('LOC_NAME', own_name).replace('LOC_PARENT', parent_name)
        recommendation = self.recommendation.replace('LOC_PARENT', parent_name)

        attributes = {
            alert.ALERT_ATTR_SEVERITY: self.severity,
            alert.ALERT_ATTR_URGENCY: 'I',
            alert.ALERT_ATTR_EVENT_LOC_OBJECT: parent_location,
            alert.ALERT_ATTR_RECOMMENDATION: recommendation,
            alert.ALERT_ATTR_REASON: reason,
            alert.ALERT_ATTR_RAW_DATA: 'No raw data',
            alert.ALERT_ATTR_SRC_NAME: self.get_name(),
            alert.ALERT_ATTR_CONDITION_EVENTS: set([event]),
        }

        # Let the alert manager create/allocate/commit the alert
        manager = registry.get_service(registry.SERVICE_ALERT_MGR)
        common_alert = manager.allocate(self.alertId, in_dict=attributes)
        manager.commit(common_alert, disable_dup=False)

        # Queue the alert for the alert analyzer
        registry.get_logger().info("Put alertId = %s with event recid = %d on the alert analyzer queue", self.alertId, event.get_rec_id())
        analyzer_queue = registry.get_service(SERVICE_ALERT_ANALYZER_Q)
        analyzer_queue.put(common_alert)
示例#47
0
    def has_common_location(self, loc, alert_time, query, cursor):
        '''Return True when enough alerts share the parent location of loc.

        The LOCATION, PLOC, ALERT_TIME and WINDOW placeholders in query are
        filled in and the query is run on cursor.  Counting starts at one
        (the current alert); every returned location with the same parent
        adds one, and True is returned as soon as the count reaches
        self.threshold.
        '''
        parent, _parent_parts = self.get_loc_parent(loc)

        resolved = (query.replace('LOCATION', loc.get_location())
                         .replace('PLOC', parent)
                         .replace('ALERT_TIME', alert_time)
                         .replace('WINDOW', self.window_time))
        cursor.execute(resolved)
        registry.get_logger().info("Trying to match parent: %s from query: %s", parent, resolved)

        kind = loc.get_id()
        matches = 1  # the current alert counts too
        for record in cursor.fetchall():
            candidate = Location(kind, record[0].strip())
            candidate_parent, _candidate_parts = self.get_loc_parent(candidate)
            if candidate_parent != parent:
                registry.get_logger().info("No match: %s with rec: %s", parent, candidate_parent)
                continue

            registry.get_logger().info("Match: %s with rec: %s", parent, candidate_parent)
            matches += 1
            if matches >= self.threshold:
                return True

        return False
示例#48
0
    def will_analyze_alert(self, alert):
        '''Return True when the alert should be analyzed.

        Duplicate alerts are skipped: a positive dup_alert_recid marks a
        duplicate, and when that field is None the alert manager is asked
        instead.  Alerts whose location type is neither 'C' (compute) nor
        'I' (I/O) are skipped as well.
        '''
        alertId = alert.get_rec_id()
        dup_recid = alert.dup_alert_recid

        if dup_recid is None:
            # Field not set: determine duplicates through the alert manager
            if self.alertMgr.is_duplicate(alertId):
                registry.get_logger().debug('Duplicate alert id %d is not analyzed.', alert.get_rec_id())
                return False
        elif dup_recid > 0:
            registry.get_logger().debug('Duplicate alert rec id %d is not analyzed', alert.get_rec_id())
            return False

        # Only hardware locations are analyzed
        loc_type = alert.event_loc.get_id()
        if loc_type not in ('C', 'I'):
            registry.get_logger().debug('Alert id %d with location type %s is not analyzed.', alert.get_rec_id(), loc_type)
            return False

        return True
    def __init__(self,
                 name,
                 inEventQueue,
                 outQueue,
                 config_dict=None,
                 number=0,
                 checkpoint=None):
        '''Initialise the base analyzer and the threshold bookkeeping.

        Loads the message ids with their service text, threshold counts and
        threshold periods from get_eventList(), prepares the count queries
        (with and without a period) against the tbgqeventlog table, and
        records how many consecutive periods selected message ids must
        exceed their threshold.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict,
                               number, checkpoint)

        self.severity = "W"
        self.recommendation = "Diagnose the problem that caused the threshold to be reached or exceeded. "
        self.alert_id = 'THRESH01'

        # RAS events that have threshold counts
        (self.msgIDs, self.msgidService,
         self.msgidCount, self.msgidPeriod) = get_eventList()

        for known_id in self.msgIDs:
            registry.get_logger().debug('msgId = ' + known_id)

        # The four queries share their select prefix and where clause; the
        # period variants append a lower time bound.
        eventTable = self.appendSchema('tbgqeventlog')
        count_select = "select count(*) from " + eventTable
        sum_select = "select sum(bigint(count)) from " + eventTable
        where_clause = " where msg_id = ? and location LOC and serialnumber SN and event_time <= ?"
        period_clause = " and event_time > (timestamp('MYTIME') - PERIOD)"

        # count with no period specified
        self.count_query = count_select + where_clause
        self.count_query2 = sum_select + where_clause

        # count exceeded with period specified
        self.period_query = count_select + where_clause + period_clause
        self.period_query2 = sum_select + where_clause + period_clause

        # number of consecutive periods in which the threshold has to be exceeded
        self.msgidConsecutivePeriods = {'0008002F': 7, '00080030': 3}
示例#50
0
    def analyze_event(self, event):
        '''Analyze a RAS event and determine whether the BQL threshold of errors has been reached or exceeded.

        The visible part of this method gathers the event details, builds the
        period-limited count query for the event's message id and runs it,
        reconnecting to the database and retrying when the query fails.

        Raises Exception when the query still fails on the fifth attempt.
        Raises KeyError when the message id has no entry in self.msgidCount
        or self.msgidPeriod, or when a raw_data field is missing.
        '''
        msg_id = event.get_event_id()
        rec_id = event.get_rec_id()
        registry.get_logger().info("Analyzing msgid = " + msg_id +
                                   " recid = " + str(rec_id))

        # The first three characters of the location string are dropped
        # (presumably a location-type prefix -- TODO confirm)
        location = str(event.get_src_loc())
        location = location[3:].strip()
        # NOTE(review): severity, serialnumber, ecid, block, jobid, msgText,
        # rawdata and count are not used in the visible part of this method
        severity = event.raw_data['severity'].strip()
        serialnumber = event.raw_data['serialnumber']
        ecid = event.raw_data['ecid']
        event_time = event.get_time_logged()
        block = event.raw_data['block'].strip()
        jobid = event.raw_data['jobid']
        msgText = event.raw_data['message'].strip()
        rawdata = event.raw_data['rawdata'].strip()
        count = event.get_event_cnt()

        # Set threshold value
        threshold = self.msgidCount[msg_id]
        tmsg = "BQL error threshold of " + str(
            threshold) + " has been reached or exceeded, total count is "

        # check if thresholds have been reached or exceeded for events
        xmsg = ""
        xmsg = " in a period of " + self.msgidPeriod[msg_id].strip()
        # Fill the PERIOD and MYTIME placeholders of the query template
        query = self.period_query.replace('PERIOD',
                                          self.msgidPeriod[msg_id].strip())
        query = query.replace('MYTIME', str(event_time))

        # search for events associated with this location's midplane or I/O board
        # (first six characters of the location followed by a SQL wildcard)
        qryloc = location.strip()[0:6] + '%'
        registry.get_logger().debug(query + " xmsgId=" + msg_id + " loc=" +
                                    qryloc + " ev_time=" + str(event_time))

        # Up to five attempts; after each of the first four failures a new
        # connection and cursor are obtained before trying again
        msgCount = 0
        for x in range(5):
            try:
                # NOTE(review): qryloc is the only parameter bound here --
                # confirm it matches the placeholders in self.period_query
                self.cursor.execute(query, qryloc)
                row = self.cursor.fetchone()
                msgCount = row[0]
                break
            except Exception, e:
                registry.get_logger().debug(e)
                if x < 4:
                    dbi = registry.get_service(SERVICE_DB_INTERFACE)
                    self.dbConn = dbi.get_connection()
                    self.cursor = self.dbConn.cursor()
                else:
                    # Fifth failure: give up
                    raise Exception(
                        'Error: bgq_BqlEventAnalyzer could not connect to the database'
                    )
示例#51
0
    def get_threshold(self):
        '''Read the alert analyzer threshold from the configuration.

        The configured value must be an integer greater than zero.  If it is
        missing, not an integer or not positive, a warning is logged and
        BGQ_DEFAULT_THRESHOLD (as a string) is used instead.
        '''
        # NOTE(review): the resulting value is neither returned nor stored in
        # the visible code -- confirm against the full source.
        try:
            threshold = self.cfg.get(BGQ_TEAL_ALERT_ANALYZER, BGQ_TEAL_ALERT_ANALYZER_THRESHOLD)
            if int(threshold) <= 0:
                registry.get_logger().error('The value %s specified in the threshold is not valid. The value must be greater than zero.', threshold)
                # Raise a real exception; a bare 'raise' has nothing to
                # re-raise here and only reached the handler by accident
                raise ValueError('threshold must be greater than zero')
            registry.get_logger().debug('threshold = %s', threshold)

        except Exception as e:
            threshold = str(BGQ_DEFAULT_THRESHOLD)
            registry.get_logger().warn('Configuring the threshold to default %s due to exception: %s', threshold, e)
示例#52
0
    def get_window_time(self):
        '''Read the alert analyzer window time (in seconds) from the configuration.

        The configured value must be an integer greater than zero; it is
        turned into a '<n> SECONDS' duration string.  If it is missing, not
        an integer or not positive, a warning is logged and
        BGQ_DEFAULT_WINDOW_TIME is used instead.
        '''
        # NOTE(review): the resulting value is neither returned nor stored in
        # the visible code -- confirm against the full source.
        windowTime = 1
        try:
            windowTime = self.cfg.get(BGQ_TEAL_ALERT_ANALYZER, BGQ_TEAL_ALERT_ANALYZER_WINDOW_TIME)
            if int(windowTime) <= 0:
                registry.get_logger().error('The value %s specified in the window time is not valid. The value must be greater than zero.', windowTime)
                # Raise a real exception; a bare 'raise' has nothing to re-raise here
                raise ValueError('window time must be greater than zero')
            window_time = windowTime + ' SECONDS'
            registry.get_logger().debug('windowTime = %s', window_time)

        except Exception as e:
            # Assign the default before logging: window_time is not bound yet
            # when the lookup or the validation failed, so referencing it in
            # the log call raised UnboundLocalError.
            window_time = str(BGQ_DEFAULT_WINDOW_TIME) + ' SECONDS'
            registry.get_logger().warn('Configuring window time to default %s seconds due to exception: %s', BGQ_DEFAULT_WINDOW_TIME, e)
    def analyze_event(self, event):
        '''Create and send a 'HWERR01' alert for a RAS event.

        Events whose raw data marks them as generated by diagnostics
        (diags == 'T') are skipped.  Otherwise the alert is allocated and
        committed through the alert manager and sent down the pipeline.
        '''
        msg_id = event.get_event_id()
        rec_id = event.get_rec_id()
        registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))

        # Exclude events logged from a DIAG run
        if event.raw_data['diags'] == 'T':
            registry.get_logger().debug('RAS Event generated by Diagnostics, skip creating an alert')
            return

        # Assemble the alert texts
        reason = ''.join([
            "The hardware been put in an error state.  \nRAS event details:",
            " message id = ", msg_id,
            ", recid = ", str(rec_id),
            ", timestamp = ", str(event.get_time_occurred()),
            ", serial number = ", str(event.raw_data['serialnumber']),
            ", ecid = ", self.ecidString(event.raw_data['ecid']),
            ", jobid = ", str(event.raw_data['jobid']),
            ", block = ", str(event.raw_data['block']),
        ])
        raw_data = "RAS Message: " + event.raw_data['message']
        recommendation = self.recommendation + " " + self.msgidService[msg_id]

        attributes = {
            alert.ALERT_ATTR_SEVERITY: self.severity,
            alert.ALERT_ATTR_URGENCY: 'I',
            alert.ALERT_ATTR_EVENT_LOC_OBJECT: event.get_src_loc(),
            alert.ALERT_ATTR_RECOMMENDATION: recommendation,
            alert.ALERT_ATTR_REASON: reason,
            alert.ALERT_ATTR_RAW_DATA: raw_data,
            alert.ALERT_ATTR_SRC_NAME: self.get_name(),
            alert.ALERT_ATTR_CONDITION_EVENTS: set([event]),
        }

        # Let the alert manager create/allocate/commit the alert
        manager = registry.get_service(registry.SERVICE_ALERT_MGR)
        hw_alert = manager.allocate('HWERR01', in_dict=attributes)
        manager.commit(hw_alert, disable_dup=False)

        # The alert exists now and can be reported through the pipeline
        registry.get_logger().info("Sending alert for msgid = " + msg_id + " recid = " + str(rec_id))
        self.send_alert(hw_alert)
示例#54
0
    def run(self):
        '''Monitor loop: catch up, then alternate real-time and periodic monitoring.

        Events newer than the last processed one are logged first.  While
        self.running is set, a real-time client thread is started and joined;
        whenever that thread ends, one periodic polling iteration is run
        before the real-time client is started again.
        '''
        self._get_last_processed_event()
        self._query_and_log_event(">", self.last_processed_event)

        while self.running:
            # Start the real-time client
            registry.get_logger().info("starting real-time monitor")
            registry.get_logger().debug("RAS event ids to filter: " + self.filter)
            self.first_realtime_event = True
            self.t = Thread(group=None,
                            target=pyrealtime.ras_init,
                            name='pyrealtime',
                            args=(self.filter, self.rt_callback, self.rt_term_callback))
            self.t.start()

            # Block until the real-time client thread terminates, for
            # whatever reason, then fall back to the periodic monitor
            self.t.join()
            registry.get_logger().info("real_time server is ended, starting periodic monitor")
            self._periodic_monitor()
示例#55
0
class bgqBqlEventAnalyzer(bgqBaseAnalyzer):
    '''The BqlEventAnalyzer class determines what action to take
    for BQL RAS events of interest.
    '''
    def __init__(self,
                 name,
                 inEventQueue,
                 outQueue,
                 config_dict=None,
                 number=0,
                 checkpoint=None):
        '''Set up alert defaults, per-message thresholds/windows, the SQL
        query templates and a database cursor.

        All six arguments (name, inEventQueue, outQueue, config_dict,
        number, checkpoint) are forwarded unchanged to
        EventAnalyzer.__init__.
        '''
        # NOTE(review): this calls EventAnalyzer.__init__ directly rather
        # than bgqBaseAnalyzer.__init__ -- confirm that skipping the direct
        # base class initializer is intended.
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict,
                               number, checkpoint)

        self.severity = "W"
        self.recommendation = '''Schedule service to isolate the BQL issue.  Possible causes are environmental, cable, or a board.  Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly.  Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable.   The cable and board can be cleaned with an optics cleaning tool.  Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''
        self.alert_id = 'BQL01'

        # Get the exclude list of message IDs.  The lookup is best effort:
        # any failure is logged at debug level and the list stays empty.
        # NOTE(review): excludeMsgIDs is a bare name, so it has to exist at
        # module level; if it does not, the resulting NameError is swallowed
        # by the handler below -- confirm it is defined.
        cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
        excludeList = ''
        try:
            excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
            registry.get_logger().debug('Exclude List = ' + excludeList)
        except Exception as e:
            registry.get_logger().debug(e)

        # ras events that have BQL_SPARE detail data
        self.msgIDs = get_eventList()
        for msgid in self.msgIDs:
            registry.get_logger().debug('msgId = ' + msgid)

        # set the threshold for each message id
        self.msgidCount = {
            '00090200': 2,
            '00090201': 1,
            '00090202': 1,
            '00090210': 4,
            '00090211': 4,
        }

        # set the window = 2 X the period
        self.msgidPeriod = {
            '00090200': '11 seconds',
            '00090201': '11 seconds',
            '00090202': '11 seconds',
            '00090210': '11 seconds',
            '00090211': '11 seconds',
        }

        # BQL related ras events
        self.bqlIDs = list()

        # define query for count of recent events at this location
        # within a window (plus and minus the event time)
        #    bound parameter (?) = location pattern
        #    MYTIME and PERIOD are literal tokens in the text, not bind
        #    parameters; they have to be substituted before execution
        eventTable = self.appendSchema('tbgqeventlog')
        self.period_query = "select count(*) from " + eventTable + " where location like ? and category='BQL' and event_time <=  (timestamp('MYTIME') + PERIOD) and event_time > (timestamp('MYTIME') - PERIOD)"

        # define query for count of open alerts at this location
        # within alert_period before the event time
        #    bound parameter (?) = location
        #    MYTIME and PERIOD are literal tokens in the text, not bind
        #    parameters; they have to be substituted before execution
        alertTable = self.appendSchema('x_tealalertlog')
        self.alert_period = '1 day'
        self.alert_query = "select count(*) from " + alertTable + " where \"alert_id\"='BQL01' and \"event_loc\"= ? and \"creation_time\" >= (timestamp('MYTIME') - PERIOD) and \"state\"=1"

        # database connection and cursor
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        self.dbConn = dbi.get_connection()
        self.cursor = self.dbConn.cursor()
示例#56
0
 def handle_control_msg(self, control_msg):
     '''Log a received control message at debug level.

     Nothing else is done with the message.
     '''
     logger = registry.get_logger()
     logger.debug('...Control message received: {0}'.format(control_msg))
示例#57
0
    def analyze_alert(self, alert):
        '''Analyze an alert and deliver it, close it, or raise a common alert.

        The alert is put on the alert delivery queue unchanged when:
          * it has no condition events (an error is logged),
          * its incident id is BQL01,
          * its location name (from get_loc_name) is 'rack',
          * it is ENDJOB01/THRES01 and has no usable block id, or no prior
            alert matches that block id,
          * no duplicate and no common location is found.

        The alert is closed through the alert manager when:
          * it is ENDJOB01/THRES01 and has_matching_blockId() is true,
          * has_duplicate() is true for its own or a parent location.

        Otherwise, when has_common_location() is true, send_common_alert()
        is called.
        '''
        alert_recId = alert.get_rec_id()
        alert_id = alert.get_incident_id()
        loc_type = alert.event_loc.get_id()
        location = alert.event_loc.get_location()
        registry.get_logger().info('Analyzing alert id %d loc_type: %s: %s', alert_recId, loc_type, location)

        # There should only be one condition event associated with the alert.
        events = alert.condition_events
        if len(events) == 0:
            registry.get_logger().error('No event associated with the alert recid %d', alert_recId)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return
        # NOTE(review): pop() removes the event from the collection returned
        # by alert.condition_events -- confirm that this is a copy and not
        # the alert's own state.
        event = events.pop()

        if alert_id == 'BQL01':
            # No need to analyze BQL01 alerts, just pass it to the delivery queue
            registry.get_logger().info('Nothing to analyze for alert id %s ', alert_id)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return

        # Get the location
        loc = Location(loc_type, location)
        locName = self.get_loc_name(loc)

        # No need to analyze alert with rack location
        alert_time = str(alert.get_time_occurred())
        if locName == 'rack':
            registry.get_logger().info('Nothing to analyze for alert recid %d with rack location', alert_recId)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return

        # Find out if there are other alerts with the same block id (for ENDJOB01 and THRES01)
        if alert_id == 'ENDJOB01' or alert_id == 'THRES01':
            event_block = event.raw_data['block']
            if event_block is not None:
                event_block = event_block.strip()

            if event_block is None or event_block == BGQ_EVENT_NULL_BLOCK:
                # No block id to compare against, pass current alert to the delivery queue
                registry.get_logger().info('No block id for alert id %d, no common alert generated for block: %s', alert_recId, event_block)
                registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
                return

            # Get db connection needed for query
            dbi = registry.get_service(SERVICE_DB_INTERFACE)
            dbConn = dbi.get_connection()
            cursor = dbConn.cursor()

            # Both alert ids use the same lookup.
            # NOTE(review): the previous comments described different sets of
            # prior alert ids for ENDJOB01 and THRES01, but the call was
            # identical for both and is not given the alert id -- confirm
            # that has_matching_blockId covers both cases.
            if self.has_matching_blockId(event_block, alert_time, cursor):
                # Found prior alert with the same block id, close current alert
                registry.get_logger().info('Closing current alert recid %d due to prior alert with the same block id', alert_recId)
                registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
            else:
                # Found no prior alert with the same block id, pass current alert to the delivery queue
                registry.get_logger().info('No common block id found for alert id %d within the last %s', alert_recId, self.window_time)
                registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)

            return

        # The following will handle the rest of the alert ids (HWERR01 or COMMON01).
        # Find out if a common mode alert already exists for the same location
        # or higher in the hierarchy.
        _unused_parent, loc_parent_list = self.get_loc_parent(loc)
        parent_clause = " or ".join(
            " \"event_loc\" like '" + pLoc + "'" for pLoc in loc_parent_list)

        # dup_qry2 matches the parent locations only; dup_qry also matches
        # this alert's own location.
        dup_qry2 = self.dup_query + "(" + parent_clause + ")"
        if parent_clause:
            loc_qry = "(" + parent_clause + " or"
        else:
            # Without parents there is nothing to "or" with; emitting a
            # leading " or" would produce invalid SQL.
            loc_qry = "("
        dup_qry = self.dup_query + loc_qry + " \"event_loc\" like '" + location + "')"

        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        dbConn = dbi.get_connection()
        cursor = dbConn.cursor()

        if self.has_duplicate(alert_time, dup_qry, cursor):
            # Found prior alert for the same or a parent location, close current alert
            registry.get_logger().info('Closing current alert recid %d due to prior alert with same common location', alert_recId)
            registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
            return

        # Look for a common hardware problem if there are multiple alerts for different location
        # on the same hardware.
        if self.has_common_location(loc, alert_time, self.query, cursor):
            # Send common alert
            self.send_common_alert(loc, alert_recId, event, alert_time, dup_qry2, cursor)
        else:
            # Pass current alert to the delivery queue
            registry.get_logger().info('No common location for %s found for alert id: %d within the last %s ', location, alert_recId, self.window_time)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
示例#58
0
                break
            except Exception, e:
                registry.get_logger().debug(e)
                if x < 4:
                    dbi = registry.get_service(SERVICE_DB_INTERFACE)
                    self.dbConn = dbi.get_connection()
                    self.cursor = self.dbConn.cursor()
                else:
                    raise Exception(
                        'Error: bgq_BqlEventAnalyzer could not connect to the database'
                    )

        if msgCount < threshold:
            if msg_id == '00090200':
                registry.get_logger().info(
                    "The optical lane will be spared since only " +
                    str(msgCount) +
                    " BQL event(s) were logged during the window.")
                # perform the BQL sparing action
                self.perform_sparing(rec_id, location, rawdata)
            if msg_id == '00090210' or msg_id == '00090211':
                registry.get_logger().info(
                    "The failing optical lane is spared automatically by the control system for message id "
                    + msg_id +
                    ".  No administrator action is required.  Retry booting the block."
                )
            return

        aquery = self.alert_query.replace('PERIOD', self.alert_period)
        aquery = aquery.replace('MYTIME', str(event_time))
        registry.get_logger().debug(aquery + " xmsgId=" + msg_id + " loc=" +
                                    location.strip() + " ev_time=" +
示例#59
0
                self.cursor.execute(query, qryloc)
                row = self.cursor.fetchone()
                msgCount = row[0]
                break
            except Exception, e:
                registry.get_logger().debug(e)
                if x < 4:
                    dbi = registry.get_service(SERVICE_DB_INTERFACE)
                    self.dbConn = dbi.get_connection()
                    self.cursor = self.dbConn.cursor()
                else:
                    raise Exception('Error: bgq_BqlEventAnalyzer could not connect to the database')

        if msgCount < threshold:
            if msg_id == '00090200':
                registry.get_logger().info("The optical lane will be spared since only " + str(msgCount) + " BQL event(s) were logged during the window.")
                # perform the BQL sparing action
                self.perform_sparing(rec_id, location, rawdata)
                return

        aquery = self.alert_query.replace('PERIOD',self.alert_period)
        aquery = aquery.replace('MYTIME', str(event_time))
        registry.get_logger().debug(aquery + " xmsgId=" + msg_id + " loc=" + location.strip() + " ev_time=" + str(event_time))

        msgCount = 0
        for x in range(5):
            try:
                self.cursor.execute(aquery,location.strip())
                row = self.cursor.fetchone()
                msgCount = row[0]
                break