Пример #1
0
 def testGeneralFilters(self):
     """test alert delivery with global and local filtering

     Injects a journal of alerts into the alert delivery queue and
     verifies that each configured listener received exactly the
     filtered subset recorded in its expected-output journal.
     """
     j_in_dq = Journal("j_in_DQ", "data/alert_delivery_test/data_sample_inject_DQ.json")
     dq_q = get_service(SERVICE_ALERT_DELIVERY_Q)
     # Get the AlertListenerJournal journals; listener names are unique,
     # so elif avoids re-testing once a match is found
     listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
     for listener in listeners:
         name = listener.get_name()
         if name == "AllAlerts":
             j_out_all = listener.journal
         elif name == "OnlyAlertId":
             j_out_alert_id = listener.journal
         elif name == "OnlyAlertIdUrgent":
             j_out_ai_urgent = listener.journal
     # inject
     j_in_dq.inject_queue(dq_q)
     # wait for stuff to come out
     self.assertTrue(j_out_all.wait_for_entries(5))
     self.assertTrue(j_out_alert_id.wait_for_entries(3))
     self.assertTrue(j_out_ai_urgent.wait_for_entries(2))
     # Compare each received journal against its expected journal,
     # ignoring timing fields that vary from run to run
     j_out_all_exp = Journal("j_out_all_exp", "data/alert_delivery_test/data_sample_out_all_alerts.json")
     self.assertTrue(j_out_all.deep_match(j_out_all_exp, ignore_delay=True, ignore_times=True))
     j_out_alert_id_exp = Journal("j_out_alert_id_exp", "data/alert_delivery_test/data_sample_out_alert_id.json")
     self.assertTrue(j_out_alert_id.deep_match(j_out_alert_id_exp, ignore_delay=True, ignore_times=True))
     j_out_ai_urgent_exp = Journal("j_out_ai_urgent_exp", "data/alert_delivery_test/data_sample_out_ai_urgent.json")
     self.assertTrue(j_out_ai_urgent.deep_match(j_out_ai_urgent_exp, ignore_delay=True, ignore_times=True))
     return
Пример #2
0
 def testDBtruncate(self):
     ''' Test that the DB tables correctly truncate '''

     def verify_table_counts(expected_checkpoints):
         ''' Re-acquire the DB connection and verify the table row counts '''
         self.dbi = registry.get_service(SERVICE_DB_INTERFACE)
         self.cnxn = self.dbi.get_connection()
         self.cursor = self.cnxn.cursor()
         self._check_rows(db_interface.TABLE_EVENT_LOG, 0)
         self._check_rows(db_interface.TABLE_ALERT_LOG, 0)
         self._check_rows(db_interface.TABLE_ALERT2ALERT, 0)
         self._check_rows(db_interface.TABLE_CHECKPOINT, expected_checkpoints)
         self.cnxn.close()

     # Truncate the tables, then bring up a data-only instance
     self.prepare_db()
     teal = Teal('data/aaaa_assumptions_test/minimal.conf', 'stderr', msgLevel=self.msglevel, data_only=True, commit_alerts=False, commit_checkpoints=False)
     verify_table_counts(0)
     teal.shutdown()

     # Second (full) instance: one checkpoint row is expected
     teal = Teal('data/aaaa_assumptions_test/minimal.conf', 'stderr', msgLevel=self.msglevel)
     verify_table_counts(1)
     teal.shutdown()
Пример #3
0
    def init_db_interface(self, daemon_mode, run_mode):
        ''' Setup the underlying data store connection

        Registers the configured DB interface plugin and, in daemon
        mode, blocks until a connection can be made (up to ~180s).

        daemon_mode -- True when running as a daemon; enables the
                       wait-for-database retry loop below
        run_mode    -- run mode used to select the active plugin section
        Raises TealError if the database cannot be reached in time.
        '''
        cf_reg = registry.get_service(SERVICE_CONFIGURATION)

        # Instantiate the (singleton) DB interface plugin with the items
        # from its config section and register it for the framework
        for data_store in self.load_plugins(CONFIG_DB_INTERFACE, run_mode, singleton=True):
            registry.register_service(SERVICE_DB_INTERFACE, data_store[0](dict(cf_reg.items(data_store[2]))))
            
        if daemon_mode:
            # Make sure the DB is up and running before we continue, since this might be 
            # being invoked during IPL and order of startup is not guaranteed
            timeout = 180
            db_exception = None
            dbi = registry.get_service(SERVICE_DB_INTERFACE)
            while (timeout > 0):
                try:
                    # A successful open/close proves the DB accepts connections
                    cnxn = dbi.get_connection()
                    cnxn.close()
                    break
                except Exception, e:
                    # Remember the last failure for the error message, then
                    # retry every 3 seconds until the budget is exhausted
                    db_exception = e
                    time.sleep(3)
                    timeout -= 3
                    
            if timeout <= 0:
                raise TealError("Cannot connect to database: {0}".format(db_exception))
Пример #4
0
    def __init__(self, name, inEventQueue, inAlertQueue, outQueue, config_dict=None, number=0):
        ''' The constructor

        Initializes the common-mode alert attributes and pre-builds the
        SQL strings used to correlate alerts/events by location and time.

        name         -- analyzer name
        inEventQueue -- incoming event queue
        inAlertQueue -- incoming alert queue
        outQueue     -- output queue for generated alerts
        config_dict  -- optional configuration dictionary
        number       -- analyzer instance number
        '''
        AlertAnalyzer.__init__(self, name, inEventQueue, inAlertQueue, outQueue, config_dict, number)

        # Common mode alert info
        # (LOC_PARENT / LOC_NAME look like placeholders substituted at
        # message-formatting time -- TODO confirm)
        self.alertId = 'COMMON01'
        self.severity = 'W'
        self.recommendation = "Check the environmental monitor data for LOC_PARENT."
        self.reason = "Multiple alerts have been logged against LOC_NAMEs on LOC_PARENT. The cause may be due to a common mode failure.  Analyze LOC_PARENT environmental data for abnormalities prior to replacing individual LOC_NAMEs."

        # Get alert manager
        self.alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)

        # Get the configuration info for alert analyzer
        self.cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
        self.window_time = self.get_window_time()
        self.threshold = self.get_threshold()

        # Alert table query
        # Derive the schema prefix from the table-name template, then
        # build the xCAT-specific table names
        schema = str(db_interface.TABLE_TEMPLATE).split('.')
        alertTable = schema[0] + '.x_tealalertlog'
        alert2eventTable = schema[0] + '.x_tealalert2event'
        eventTable = schema[0] + '.tbgqeventlog'
        # ALERT_TIME and WINDOW appear to be textual placeholders replaced
        # before execution -- TODO confirm where the substitution occurs
        query_time_window = "\"creation_time\" >= (timestamp('ALERT_TIME') - WINDOW) and \"creation_time\" < timestamp('ALERT_TIME')"
        self.query = "select \"event_loc\" from " + alertTable + " where \"state\" = 1 and \"event_loc\" like 'PLOC%' and \"event_loc\" not like 'LOCATION' and " + query_time_window
        self.dup_query = "select \"event_loc\" from " + alertTable + " where \"state\" = 1 and \"alert_id\" = 'COMMON01' and " + query_time_window + " and "
        self.alert_recid_query1 = "select \"rec_id\" from " + alertTable + " where \"state\" = 1 and (\"alert_id\" = 'COMMON01' or \"alert_id\" = 'HWERR01') and " + query_time_window
        self.alert_recid_query2 = "select \"rec_id\" from " + alertTable + " where \"state\" = 1 and (\"alert_id\" = 'COMMON01' or \"alert_id\" = 'HWERR01' or \"alert_id\" = 'ENDJOB01') and " + query_time_window
        self.event_recid_query = "select \"t_event_recid\" from " + alert2eventTable + " where \"alert_recid\" = ?"
        self.event_block_id_query = "select block from " + eventTable + " where recid = ?"

        # This variant uses str.format placeholders ({0} time, {1} window,
        # {2} block id) instead of the textual ALERT_TIME/WINDOW markers
        query_time_window_str = "\"creation_time\" >= (timestamp('{0}') - {1}) and \"creation_time\" < timestamp('{0}')"
        self.same_block_query_str = "select count(*) from " + alertTable + " where \"state\" = 1 and (\"alert_id\" = 'COMMON01' or \"alert_id\" = 'HWERR01' or \"alert_id\" = 'ENDJOB01') and " + query_time_window_str + " and \"rec_id\" in (select \"alert_recid\" from " + alert2eventTable + " where \"t_event_recid\" in (select recid from " + eventTable + " where block = '{2}'))"

        return
Пример #5
0
    def testGeneralFilters(self):
        """test alert delivery with global and local filtering

        Injects DQ alerts plus one directly-created TEAL alert and
        verifies the listener journals received the expected entries.
        """
        j_in_dq = Journal("j_in_DQ", "data/alert_delivery_test/listener_failure/inject_DQ_alerts.json")
        dq_q = get_service(SERVICE_ALERT_DELIVERY_Q)
        # Get the AlertListenerJournal journals
        # (assumes listeners named AllAlerts / OnlyAnalyzer1 are configured;
        # the journal variables below would be unbound otherwise)
        listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
        for listener in listeners:
            name = listener.get_name()
            if name == "AllAlerts":
                j_out_all = listener.journal
            if name == "OnlyAnalyzer1":
                j_out_analyzer1 = listener.journal
        # inject
        j_in_dq.inject_queue(dq_q)
        # Create a TEAL alert
        create_teal_alert("XXXXXXXX", "no reason at all", "medium well", loc_instance="YYY")

        # Get expected values
        j_out_all_exp = Journal("all_exp", "data/alert_delivery_test/analyzer_filter/alerts_out_all.json")
        j_out_analyzer1_exp = Journal("analyzer1", "data/alert_delivery_test/analyzer_filter/alerts_out_analyzer1.json")
        # wait for stuff to come out (3 extra entries expected on the
        # all-alerts journal -- see the length check below)
        self.assertTrue(j_out_all.wait_for_entries(len(j_out_all_exp) + 3))
        self.assertTrue(j_out_analyzer1.wait_for_entries(len(j_out_analyzer1_exp)))
        # Check that it was what was expected

        # Can't really check this because the location is unique for each machine and run
        #  Make sure only 3 extra
        self.assertEqual(len(j_out_all) - len(j_out_all_exp), 3)
        # self.assertTrue(j_out_all.deep_match(j_out_all_exp, ignore_delay=True, ignore_times=True))
        self.assertTrue(j_out_analyzer1.deep_match(j_out_analyzer1_exp, ignore_delay=True, ignore_times=True))
        return
Пример #6
0
 def alert_not_analyzed_callback(self, alert):
     ''' When an alert is not handled in the alert analyzer queue pass it to the filter queue'''
     if not isinstance(alert, Alert):
         # Non-alert (command) objects need no forwarding; just trace them
         get_logger().debug('Command {0} was processed by the Alert Analysis Queue'.format(alert.brief_str()))
         return
     # Unanalyzed alerts flow straight on to the delivery queue
     get_logger().debug('Alert {0} was not analyzed in Alert Analysis Queue -- put in Delivery Queue'.format(alert.brief_str()))
     registry.get_service(SERVICE_ALERT_DELIVERY_Q).put_nowait(alert)
Пример #7
0
 def testDemo1EventQ(self):
     '''Test that the first demo flow works -- Inject Event Q

     Injects the demo events into the event queue and verifies the
     analyzer queue, delivery queue and listener journal contents
     against the expected-result journals.
     '''
     self.teal = Teal('data/teal_test/configurationtest_05_auto.conf', 'stderr', msgLevel=self.msglevel, 
                      commit_alerts=False, commit_checkpoints=False, run_mode=TEAL_RUN_MODE_HISTORIC)
     j_in = Journal('j_in', file='data/demo/data_sample_demo_NEW_001.json')
     j_out_aaq = Journal('j_out_aaq')
     j_out_dq = Journal('j_out_dq')
     j_out_lis = Journal('j_out_lis')
     q_in = registry.get_service(SERVICE_EVENT_Q)
     q_out_aaq = registry.get_service(SERVICE_ALERT_ANALYZER_Q)
     q_out_dq = registry.get_service(SERVICE_ALERT_DELIVERY_Q)
     # Tap the analyzer and delivery queues with journal listeners
     q_out_dq.register_listener(j_out_dq)
     q_out_aaq.register_listener(j_out_aaq)
     listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
     for listener in listeners:
         if listener.get_name() == 'outputJournal':
             j_out_lis = listener.journal
     j_in.inject_queue(q_in)
     # Three entries are expected at the output journal listener
     self.assertTrue(j_out_lis.wait_for_entries(3))
     j_exp_aaq = Journal('j_exp_aaq', 'data/teal_test/data_sample_demo_NEW_001_AAQ_Result.json')
     self.assertTrue(j_out_aaq.deep_match(j_exp_aaq, ignore_delay=True, ignore_times=True))
     j_exp_dq = Journal('j_exp_dq', 'data/teal_test/data_sample_demo_NEW_001_DQ_Result.json')
     self.assertTrue(j_out_dq.deep_match(j_exp_dq, ignore_delay=True, ignore_times=True))
     j_exp_lis = Journal('j_exp_lis', 'data/teal_test/data_sample_demo_NEW_001_LIS_Result.json')
     self.assertTrue(j_out_lis.deep_match(j_exp_lis, ignore_delay=True, ignore_times=True))
     
     # Detach the journal listeners before shutdown
     q_out_aaq.unregister_listener(j_out_aaq)        
     q_out_dq.unregister_listener(j_out_dq)
     self.teal.shutdown()
Пример #8
0
    def install(self, connector_info, connector_comment):
        ''' Install the SNMP connectors into the xCAT monsetting table

        connector_info    -- iterable of (script_path, run_command) pairs
        connector_comment -- comment string tagging the rows owned by
                             this connector (used by uninstall)
        '''
        # First clean up any remnants that still may exist
        self.uninstall(connector_comment)

        db = registry.get_service(registry.SERVICE_DB_INTERFACE)
        conn = db.get_connection()
        try:
            cursor = conn.cursor()
            try:
                # Get all the cmds entries to determine the starting number to use
                db.select(cursor, ['*'], 'monsetting',
                            where="$name = 'snmpmon' AND $key LIKE 'cmds%'",
                            where_fields=['name', 'key'])

                # Determine the max number for the new commands
                cmd_num = [self._get_cmd_num(row[1]) for row in cursor.fetchall()]
                if cmd_num:
                    next_cmd_num = max(cmd_num) + 1
                else:
                    next_cmd_num = 1

                root_dir = registry.get_service(registry.TEAL_ROOT_DIR)

                # Insert the entries for each of the SNMP connectors
                for connector in connector_info:
                    cursor.executemany("INSERT INTO monsetting VALUES(?, ?, ?, ?, ?)",
                                       [('snmpmon', 'cmds{0:02d}'.format(next_cmd_num), os.path.join(root_dir, connector[0]), connector_comment, None),
                                        ('snmpmon', 'runcmd{0:02d}'.format(next_cmd_num), connector[1], connector_comment, None)])
                    next_cmd_num += 1

                # Commit all the added entries to the monsetting table
                conn.commit()
            finally:
                # Close the cursor even if the select/insert fails
                cursor.close()
        finally:
            # Always release the connection (previously leaked on error)
            conn.close()
Пример #9
0
    def is_installed(self, connector_info, connector_comment):
        ''' Check whether the connector is installed or not by checking to see if the command is configured 
        in the monsetting table

        Returns True only when every command in connector_info has a
        matching monsetting row; False otherwise (including when
        connector_info is empty).
        '''
        root_dir = registry.get_service(registry.TEAL_ROOT_DIR)

        db = registry.get_service(registry.SERVICE_DB_INTERFACE)
        conn = db.get_connection()
        try:
            cursor = conn.cursor()
            try:
                installed = False
                for cmd, snmp_filter in connector_info:
                    # Look for a row whose value is this connector's full path
                    db.select(cursor, ['*'], 'monsetting',
                              where="$name='snmpmon' AND $key LIKE 'cmd%' and $value = ?",
                              where_fields = ['name','key','value'],
                              parms=(os.path.join(root_dir, cmd),))

                    row = cursor.fetchone()
                    if row is None:
                        # Any missing command means not (fully) installed
                        installed = False
                        break
                    else:
                        installed = True
            finally:
                # Close the cursor and connection (previously never closed)
                cursor.close()
        finally:
            conn.close()

        return installed
Пример #10
0
def create_teal_alert(alert_id, reason, raw_data, src_name='TEAL', severity='I', 
                      urgency='N', loc_instance=None, recommendation='Contact next level of support',
                      disable_dup=False):
    ''' create a TEAL alert
          This will use the parameters to:
            (1) Create the alert initialization dictionary
            (2) Allocate the alert
            (3) Commit the alert
            (4) Put the alert in the delivery queue

          alert_id       -- id of the alert to create
          reason         -- reason text for the alert
          raw_data       -- raw data to attach to the alert
          src_name       -- reporting source name (default 'TEAL')
          severity       -- alert severity code (default 'I')
          urgency        -- alert urgency code (default 'N')
          loc_instance   -- location instance handed to the location service
          recommendation -- recommended-action text
          disable_dup    -- disable duplicate checking for this alert
    '''
    get_logger().debug('Creating {0} alert'.format(src_name))
    
    # Build the Alert directly from the event information
    alert_dict = {ALERT_ATTR_SEVERITY:severity,
                  ALERT_ATTR_URGENCY:urgency,
                  ALERT_ATTR_RECOMMENDATION:recommendation,
                  ALERT_ATTR_REASON:reason,
                  ALERT_ATTR_RAW_DATA:raw_data,
                  ALERT_ATTR_SRC_NAME: src_name
                  }
        
    # Resolve the event location, then create and deliver in one step
    alert_dict[ALERT_ATTR_EVENT_LOC_OBJECT] = registry.get_service(SERVICE_LOCATION).get_teal_location(loc_instance)
    registry.get_service(SERVICE_ALERT_MGR).create_and_deliver_alert(alert_id, alert_dict, disable_dup=disable_dup)

    return
Пример #11
0
 def start(self):
     '''Start the notifier-based event monitor running.

     Loops until self.running is cleared: each time the notifier is
     posted (rc == 0) it reads new event rows starting at
     self.start_recid, converts them to Events and puts them on the
     event queue.  Failures are logged at most once every 10 minutes
     to avoid flooding the log while the database is unavailable.
     '''
     try:
         event_queue =  registry.get_service(SERVICE_EVENT_Q)
         dbi = registry.get_service(SERVICE_DB_INTERFACE)
         next_failure_log = None
         rc = 0
         
         while self.running:
             if rc == 0:
                 get_logger().debug('Processing events in monitor event injection thread. startRecid = {0}'.format(self.start_recid))
                 try:
                     cnxn = dbi.get_connection()
                     try:
                         cursor = cnxn.cursor()
                         for row in cursor.execute(self.sql_runtime_query, self.start_recid):
                             get_logger().debug('Processing row, rec_id = {0} time_occurred = {1}, time_logged = {2}'.format(row[0], row[2],row[3]))
                             e = Event.fromDB(row)
                             event_queue.put(e)
                             # Track the highest record id handed off so a
                             # restart resumes after it
                             self.start_recid = row[0]
                             if not self.running:
                                 get_logger().info('Monitor event injection thread interrupted.  last recid = {0}'.format(self.start_recid))
                                 break
                             # Periodically checkpoint our progress
                             if self.start_recid % self.update_checkpoint_frequency == 0:
                                 inject_update_checkpoint_msg(self.start_recid)
                     finally:
                         # Always release the connection (was leaked on error)
                         cnxn.close()
                 except:
                     # Rate-limit failure logging: one traceback per 10 minutes
                     cur_time = datetime.now()
                     if next_failure_log is None or cur_time > next_failure_log:
                         get_logger().exception('Failure in monitor event injection thread')
                         next_failure_log = cur_time + timedelta(minutes=10)
             # Block until the notifier signals more work (or shutdown)
             rc = self.notifier.wait()
     except:
         get_logger().exception('Monitor event injection thread failure')
     get_logger().debug('Exiting monitor event injection thread.  Last recid = {0}'.format(self.start_recid))
Пример #12
0
 def testJournalWriteAlertDB4(self):
     ''' Test writing of Alert log queue after reading from DB

     Round-trips events and alerts through the DB, then injects the
     read-back alerts into the delivery queue and verifies they all
     reach the Journal listener.
     '''
     # This test does not work with duplicate checking -- probably don't want it to 
     keep_ADC = self.force_env('TEAL_ALERT_DUPLICATE_CHECK', 'No')
     self.teal = Teal('data/journal_test/events_002.conf','stderr',msgLevel=self.msglevel)
     # Events
     je = Journal('DB test input EVENTS', file='data/journal_test/events_002.json')
     je.insert_in_db(truncate=True, no_delay=True)
     # Alerts
     ja = Journal('DB test input ALERTS', file='data/journal_test/alerts_002.json')
     ja.insert_in_db(truncate=False, no_delay=True)
     # Check events: what was written must read back identically (rec ids included)
     jedb = Journal('Read DB test EVENTS')
     jedb.select_from_db('event')
     self.assertTrue(je.deep_match(jedb, ignore_delay=True, ignore_times=False, ignore_rec_id=False))
     # Check alerts
     jadb = Journal('Read DB test ALERTS')
     jadb.select_from_db('alert')
     self.assertTrue(ja.deep_match(jadb, ignore_delay=True, ignore_times=False, ignore_rec_id=False))
     # Now insert into the Delivery Queue and make sure all come out 
     jadb.inject_queue(get_service(SERVICE_ALERT_DELIVERY_Q), progress_cb=None, fail_on_invalid=False, no_delay=True)
     listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
     for listener in listeners:
         name = listener.get_name()
         if name == 'Journal':
             j_out_all = listener.journal
     # 6 entries expected -- presumably the alert count in alerts_002.json
     self.assertTrue(j_out_all.wait_for_entries(6))
     self.assertTrue(j_out_all.deep_match(jadb, ignore_delay=True, ignore_times=True))
     self.teal.shutdown()
     self.restore_env('TEAL_ALERT_DUPLICATE_CHECK', keep_ADC)
     return
Пример #13
0
 def testLoadingFromConTwo(self):
     '''Test if one files for each are specified in the config file in two sections

     Loads event_metadata_05 and alert_metadata_03 and verifies the
     event metadata (via an Event) and the three alert metadata entries.
     '''
     teal_inst = teal.Teal('data/metadata_test/load_config_03.conf', 'stderr', msgLevel=self.msglevel, commit_alerts=False, commit_checkpoints=False)
     # it loaded event_metadata_05 and alert_metadata_03
     # Check event metadata via event
     esm1 = get_service(SERVICE_EVENT_METADATA)
     self.assertEqual(len(esm1), 2)
     event_id = 'idvalue1'
     event_comp = 'TST'
     e1 = teal.Event.fromDict({EVENT_ATTR_REC_ID:1, 
                        EVENT_ATTR_EVENT_ID:event_id, 
                        EVENT_ATTR_SRC_COMP: 'TST', 
                        EVENT_ATTR_TIME_OCCURRED: datetime.now()})
     meta_dict2 = e1.get_metadata()
     self.assertEqual(meta_dict2[META_EVENT_ID], event_id)
     self.assertEqual(meta_dict2[META_EVENT_COMP], event_comp)
     self.assertEqual(meta_dict2[META_EVENT_MSG], 'This is test message 1')
     # check alert metadata directly
     asm1 = get_service(SERVICE_ALERT_METADATA)
     self.assertEqual(len(asm1), 3)

     def check_alert(alert_id, msg, recommendation, urgency, severity, call_home, fru_class, fru_list):
         ''' Verify one alert's metadata fields (cust_notification is 'N' for all) '''
         self.assertTrue(alert_id in asm1)
         meta_dict = asm1[alert_id]
         self.assertEqual(meta_dict[META_ALERT_ID], alert_id)
         self.assertEqual(meta_dict[META_ALERT_MSG_TEMPLATE], msg)
         self.assertEqual(meta_dict[META_ALERT_RECOMMENDATION], recommendation)
         self.assertEqual(meta_dict[META_ALERT_URGENCY], urgency)
         self.assertEqual(meta_dict[META_ALERT_SEVERITY], severity)
         self.assertEqual(meta_dict[META_ALERT_CALL_HOME], call_home)
         self.assertEqual(meta_dict[META_ALERT_CUST_NOTIFICATION], 'N')
         self.assertEqual(meta_dict[META_ALERT_FRU_CLASS], fru_class)
         self.assertEqual(meta_dict[META_ALERT_FRU_LIST], fru_list)

     # The three alerts loaded from alert_metadata_03
     check_alert('Alert01', 'This is Alert 01', 'Recommend doing something', 'N', 'W', 'N', 'fru_class', 'fru_list1, fru_list2')
     check_alert('Alert02', 'This is Alert 02', 'Recommend doing something else', 'S', 'E', 'N', None, None)
     check_alert('Alert03', 'This is Alert 03', 'Do not do anything', 'N', 'E', 'Y', None, None)
     teal_inst.shutdown()
     return
Пример #14
0
 def testGeneralFilters(self):
     """test alert delivery with global and local filtering

     Injects a journal of alerts into the delivery queue and verifies
     that each of the six analyzer-filter listeners received exactly
     the subset recorded in its expected-output journal.
     """
     j_in_dq = Journal("j_in_DQ", "data/alert_delivery_test/analyzer_filter/inject_DQ_alerts.json")
     dq_q = get_service(SERVICE_ALERT_DELIVERY_Q)
     # Get the AlertListenerJournal journals
     # (assumes all six listeners below are configured; a missing one
     # would leave its journal variable unbound)
     listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
     for listener in listeners:
         name = listener.get_name()
         if name == "AllAlerts":
             j_out_all = listener.journal
         if name == "OnlyAnalyzer1":
             j_out_analyzer1 = listener.journal
         if name == "AnyButAnalyzer1":
             j_out_not_analyzer1 = listener.journal
         if name == "OnlyAnalyzer2and3":
             j_out_analyzer2and3 = listener.journal
         if name == "AnyButAnalyzer2and3":
             j_out_not_analyzer2and3 = listener.journal
         if name == "AnyButAnalyzer1and2and3":
             j_out_not_analyzer1and2and3 = listener.journal
     # inject
     j_in_dq.inject_queue(dq_q)
     # Get expected values
     j_out_all_exp = Journal("all_exp", "data/alert_delivery_test/analyzer_filter/alerts_out_all.json")
     j_out_analyzer1_exp = Journal("analyzer1", "data/alert_delivery_test/analyzer_filter/alerts_out_analyzer1.json")
     j_out_not_analyzer1_exp = Journal(
         "not_analyzer1", "data/alert_delivery_test/analyzer_filter/alerts_out_not_analyzer1.json"
     )
     j_out_analyzer2and3_exp = Journal(
         "analyzer2and3", "data/alert_delivery_test/analyzer_filter/alerts_out_analyzer2and3.json"
     )
     j_out_not_analyzer2and3_exp = Journal(
         "not_analyzer2and3", "data/alert_delivery_test/analyzer_filter/alerts_out_not_analyzer2and3.json"
     )
     j_out_not_analyzer1and2and3_exp = Journal(
         "not_analyzer1and2and3", "data/alert_delivery_test/analyzer_filter/alerts_out_not_analyzer1and2and3.json"
     )
     # wait for stuff to come out (each journal should fill to the size
     # of its expected journal)
     self.assertTrue(j_out_all.wait_for_entries(len(j_out_all_exp)))
     self.assertTrue(j_out_analyzer1.wait_for_entries(len(j_out_analyzer1_exp)))
     self.assertTrue(j_out_not_analyzer1.wait_for_entries(len(j_out_not_analyzer1_exp)))
     self.assertTrue(j_out_analyzer2and3.wait_for_entries(len(j_out_analyzer2and3_exp)))
     self.assertTrue(j_out_not_analyzer2and3.wait_for_entries(len(j_out_not_analyzer2and3_exp)))
     self.assertTrue(j_out_not_analyzer1and2and3.wait_for_entries(len(j_out_not_analyzer1and2and3_exp)))
     # Check that it was what was expected
     self.assertTrue(j_out_all.deep_match(j_out_all_exp, ignore_delay=True, ignore_times=True))
     self.assertTrue(j_out_analyzer1.deep_match(j_out_analyzer1_exp, ignore_delay=True, ignore_times=True))
     self.assertTrue(j_out_not_analyzer1.deep_match(j_out_not_analyzer1_exp, ignore_delay=True, ignore_times=True))
     self.assertTrue(j_out_analyzer2and3.deep_match(j_out_analyzer2and3_exp, ignore_delay=True, ignore_times=True))
     self.assertTrue(
         j_out_not_analyzer2and3.deep_match(j_out_not_analyzer2and3_exp, ignore_delay=True, ignore_times=True)
     )
     self.assertTrue(
         j_out_not_analyzer1and2and3.deep_match(
             j_out_not_analyzer1and2and3_exp, ignore_delay=True, ignore_times=True
         )
     )
     return
Пример #15
0
 def testLoadingFromConNone(self):
     '''Test if no files are specified in the config file
     '''
     # Start TEAL with a config that names no metadata files at all
     teal_inst = teal.Teal('data/metadata_test/load_config_01.conf', 'stderr', msgLevel=self.msglevel, commit_alerts=False, commit_checkpoints=False)
     # Both metadata services should come up completely empty
     for service_name in (SERVICE_EVENT_METADATA, SERVICE_ALERT_METADATA):
         self.assertEqual(len(get_service(service_name)), 0)
     teal_inst.shutdown()
     return
Пример #16
0
 def init_location_service(self, run_mode):
     ''' Load the Location Service based on the XML configuration in the configuration file
     '''
     cfg_reg = registry.get_service(SERVICE_CONFIGURATION)
     active = cfg_reg.get_active_sections(CONFIG_LOCATION, run_mode, name_required=False, singleton=True)
     for section, _name in active:
         # Resolve the configured location file relative to the TEAL data dir
         rel_path = cfg_reg.get(section, 'config')
         full_path = os.path.join(registry.get_service(TEAL_DATA_DIR), rel_path)
         registry.register_service(SERVICE_LOCATION, LocationService(full_path))
Пример #17
0
    
    def create_alert(self, event, location):
        ''' create the alert

        Builds the alert attribute dictionary from this rule's
        configured fields, allocates the alert through the alert
        manager, commits it, and hands it to the ruleset for delivery.

        event    -- triggering event (recorded as the condition event)
        location -- object carrying exception info (ex_type/ex_value)
                    that is placed into the alert raw data
        '''
        # Populate the dictionary
        alert_dict = {}
        alert_dict[ALERT_ATTR_SRC_NAME] = self.src_name
        alert_dict[ALERT_ATTR_ALERT_ID] = self.alert_id.get_value()
        # Optional attributes are only included when explicitly configured
        if self.severity.is_set():
            alert_dict[ALERT_ATTR_SEVERITY] = self.severity.get_value()
        if self.urgency.is_set():
            alert_dict[ALERT_ATTR_URGENCY] = self.urgency.get_value()
        if self.fru_loc.is_set():
            alert_dict[ALERT_ATTR_FRU_LOC] = self.fru_loc.get_value()
        if self.recommendation.is_set():
            alert_dict[ALERT_ATTR_RECOMMENDATION] = self.recommendation.get_value()
        if self.raw_data.is_set():
            # NOTE(review): this value is unconditionally overwritten by the
            # raw-data dict built below -- confirm whether that is intended
            alert_dict[ALERT_ATTR_RAW_DATA] = self.raw_data.get_value()
        if self.msg_template.is_set():
            alert_dict[ALERT_ATTR_MSG_TEMPLATE] = self.msg_template.get_value()
        if self.priority.is_set():
            alert_dict[ALERT_ATTR_PRIORITY] = self.priority.get_value()
        alert_dict[ALERT_ATTR_CONDITION_EVENTS] = set([event])
        
        # Event location: explicit if configured, else derived from the ruleset name
        if self.event_loc.is_set():
            loc = self.event_loc.get_value()
        else:
            loc = get_service(SERVICE_LOCATION).get_teal_location(self.ruleset.name)
        alert_dict[ALERT_ATTR_EVENT_LOC] = loc.get_location()
        alert_dict[ALERT_ATTR_EVENT_LOC_TYPE] = loc.get_id()
        
        # Fill in raw data
        raw_data_dict = {}
        raw_data_dict['exception'] = '{0}: {1}'.format(str(location.ex_type), str(location.ex_value))
        alert_dict[ALERT_ATTR_RAW_DATA] = dict2raw_data(raw_data_dict)
        
        # Call init routine if specified 
        if self.init_class_callable is not None:
            try:
                alert_dict = self.init_class_callable().update_init_data_main(alert_dict)
            except ThreadKilled:
                # Shutdown request -- never swallow it
                raise
            except ExtFatalError:
                get_logger().exception('FATAL ERROR raised --> kill analyzer')
                raise
            except:
                # Any other failure is traced but does not stop alert creation
                self.ruleset.trace_error(self.trace_id[1], 'Error in update_init_data_main')
                get_logger().exception('')

        # Allocate the potential alert 
        amgr = get_service(SERVICE_ALERT_MGR)
        alert = amgr.allocate(self.alert_id.get_value(), alert_dict)
        # send the alert to the delivery queue
        get_logger().debug('  creating {0}'.format(str(alert)))
        amgr.commit(alert)
        self.ruleset.send_alert(alert)
Пример #18
0
 def inject_new_entries(self, exp_json='data/restart_test/three_events_one_fromq.json', exp_num=3):
     ''' Verify that events still flow through TEAL after startup
     '''
     # Insert fresh events directly into the DB and wake the monitor
     injected = Journal('After restart', 'data/restart_test/three_events_one.json')
     injected.insert_in_db(use_rec_ids=False, no_delay=True)
     registry.get_service(registry.SERVICE_NOTIFIER).post()
     # The analyzer's journal should fill with the expected entries
     expected = Journal('Inject New Entries', exp_json)
     actual = self.find_analyzer().journal
     self.assertTrue(actual.wait_for_entries(exp_num))
     self.assertTrue(actual.deep_match(expected, ignore_delay=True))
Пример #19
0
    def testDemo1DB(self):
        '''Test demo flow by injecting into DB

        Inserts the demo events into the database (with semaphore/notify
        post) and verifies the event, analyzer-queue, delivery-queue and
        listener journals against the connector expected results.
        '''
        self.prepare_db()
        keep_var = self.force_env('TEAL_TEST_POOL_TIMERS_OFF', 'YES')
        self.teal = Teal('data/teal_test/configurationtest_05_semaphore_auto.conf', 'stderr', 
                         msgLevel=self.msglevel)

        j_in = Journal('j_in', file='data/demo/data_sample_demo_NEW_001.json')
        j_out_eq = Journal('j_out_eq')
        j_out_aaq = Journal('j_out_aaq')
        j_out_dq = Journal('j_out_dq')
        j_out_lis = Journal('j_out_lis')
        q_out_eq = registry.get_service(SERVICE_EVENT_Q)
        q_out_aaq = registry.get_service(SERVICE_ALERT_ANALYZER_Q)
        q_out_dq = registry.get_service(SERVICE_ALERT_DELIVERY_Q)
        # Tap the event, analyzer and delivery queues with journal listeners
        q_out_eq.register_listener(j_out_eq)
        q_out_dq.register_listener(j_out_dq)
        q_out_aaq.register_listener(j_out_aaq)
        listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
        for listener in listeners:
            if listener.get_name() == 'outputJournal':
                j_out_lis = listener.journal
        try:
            j_in.insert_in_db(progress_cb=None, truncate=False, use_rec_ids=True, no_delay=False, post=True)
        except:
            # Detach listeners before propagating so teardown stays clean
            print 'INSERTION FAILED'
            q_out_eq.unregister_listener(j_out_eq)
            q_out_dq.unregister_listener(j_out_dq)
            q_out_aaq.unregister_listener(j_out_aaq)
            raise
       
        # Yes, only 2: Flush can't be injected to connector, so pool does not get closed, so last event
        # Does not get turned into an alert!
        self.assertTrue(j_out_lis.wait_for_entries(2))

        # Note these connector ('C') versions have one less alert
        #     The analyzer is being run in historic mode (see configuration) if that was 
        #     changed to runtime then the pool would time out and the last alert would be journaled
        j_exp_aaq = Journal('j_exp_aaq', 'data/teal_test/data_sample_demo_NEW_001_AAQ_Result_C.json')
        self.assertTrue(j_out_aaq.deep_match(j_exp_aaq, ignore_delay=True, ignore_times=True))
        j_exp_dq = Journal('j_exp_dq', 'data/teal_test/data_sample_demo_NEW_001_DQ_Result_C.json')
        self.assertTrue(j_out_dq.deep_match(j_exp_dq, ignore_delay=True, ignore_times=True))
        j_exp_lis = Journal('j_exp_lis', 'data/teal_test/data_sample_demo_NEW_001_LIS_Result_C.json')
        self.assertTrue(j_out_lis.deep_match(j_exp_lis, ignore_delay=True, ignore_times=True))

        # Detach the journal listeners before shutdown
        q_out_eq.unregister_listener(j_out_eq)
        q_out_dq.unregister_listener(j_out_dq)
        q_out_aaq.unregister_listener(j_out_aaq)
        
        self.teal.shutdown()
        self.restore_env('TEAL_TEST_POOL_TIMERS_OFF', keep_var)
Пример #20
0
 def __init__(self, name):
     ''' Constructor: register this object as a listener on the event queue.

     name -- name of this listener; also passed to EventCheckpoint
     '''
     self.name = name
     # Register myself as a listener on the event Q
     self.listenQ = get_service(SERVICE_EVENT_Q)
     self.listenQ.register_listener(self)
     # Per-listener checkpoint keyed by the same name
     self.event_checkpoint = EventCheckpoint(name)
     return
Пример #21
0
    def get_generator(self, config_dict):
        """ Return the appropriate SQL generator 
        based on the configuration information retrieved

        config_dict -- configuration dictionary (not used in the portion
                       of this method visible here)

        Side effect: rebinds the module-level db_interface table names
        to the xCAT-specific ("x_teal...") table names.
        """
        # Location of the xCAT DB configuration ("cfgloc") file; a test
        # environment variable can prefix/redirect the file name
        DB_CONF_PATH = "{0}/xcat".format(get_service(TEAL_CONF_DIR))
        prefix = os.environ.get(TEAL_TEST_XCAT_CFGLOG_PREFIX, "")
        DB_CONF_FILE = "{0}cfgloc".format(prefix)

        # Set xCAT table names
        db_interface.TABLE_EVENT_LOG = "x_tealeventlog"
        db_interface.TABLE_CHECKPOINT = "x_tealcheckpoint"
        db_interface.TABLE_ALERT_LOG = "x_tealalertlog"
        db_interface.TABLE_ALERT2ALERT = "x_tealalert2alert"
        db_interface.TABLE_ALERT2EVENT = "x_tealalert2event"
        db_interface.TABLE_TEMPLATE = "x_{0}"

        # Well-known path to the information.
        ds_file = "{0}/{1}".format(DB_CONF_PATH, DB_CONF_FILE)
        get_logger().debug("DB Configuration: {0}".format(ds_file))

        try:
            conf_file = open(ds_file, "r")
        except IOError, e:
            get_logger().error("Unable to open DB configuration file. {0}".format(e))
            raise
        # NOTE(review): the method appears to continue beyond this excerpt
        # (conf_file is opened but not yet read or closed here)
Пример #22
0
def close_event(errm_env, event_data):
    ''' Find an alert that associated with the closed event and close it. This
    will only close an alert that has this event as the one and only event
    '''
    # Find the matching event in the event log
    event_rec_id = find_logged_event(errm_env, event_data)
    
    # If no event was found, there is nothing more to do
    if event_rec_id is None:
        return
    
    # Find an alert that this event is the only event associated with
    alert_recids = find_logged_alerts(event_rec_id)
    
    # If there is no alert associated with this event, then it may have
    # already been closed out or never logged in the first place based 
    # on when this connector started listening for events
    if len(alert_recids) == 0:
        return
    
    # Close this alert and any alerts that were duplicates of this alert
    a_mgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    
    for alert_recid in alert_recids:
        try:
            a_mgr.close(alert_recid)
        except alert_mgr.AlertMgrError, ame:
            get_logger().warn('Failed to close alert({0}) associated to event ({1}): {2}'.format(alert_recid, event_rec_id, ame))
Пример #23
0
 def testDisableDup(self):
     ''' Verify that disable_dup forces new open alerts instead of dups. '''
     self.teal = Teal('data/common/configurationtest.conf', 'stderr', msgLevel=self.msglevel, commit_alerts=False, commit_checkpoints=False)
     am = get_service(SERVICE_ALERT_MGR)

     def check_counts(total, dups, open_alerts):
         # Alert-manager bookkeeping after each create
         self.assertEqual(len(am.in_mem_alerts), total)
         self.assertEqual(len(am.in_mem_alerts_duplicate), dups)
         self.assertEqual(len(am.active_alerts_open), open_alerts)

     check_counts(0, 0, 0)
     # First alert opens normally
     create_teal_alert('XXXXXXXX', 'no reason at all', 'medium well', loc_instance="YYY")
     check_counts(1, 0, 1)
     # An identical alert dups against the open one
     create_teal_alert('XXXXXXXX', 'no reason at all', 'medium well', loc_instance="YYY")
     check_counts(2, 1, 1)
     # disable_dup creates a new open alert instead of a duplicate
     create_teal_alert('XXXXXXXX', 'no reason at all', 'medium well', loc_instance="YYY", disable_dup=True)
     check_counts(3, 1, 2)
     create_teal_alert('XXXXXXXX', 'no reason at all', 'medium well', loc_instance="YYY", disable_dup=True)
     check_counts(4, 1, 3)
     # Normal create again dups against one of the open alerts
     create_teal_alert('XXXXXXXX', 'no reason at all', 'medium well', loc_instance="YYY")
     check_counts(5, 2, 3)
     self.teal.shutdown()
Пример #24
0
def app_terminate(sig, stack_frame):
    ''' Initiate application termination on a signal from the user.

    Records the signal number in the module-level flag and pokes the
    shutdown service, if one has been registered.
    '''
    global app_termination_signal
    app_termination_signal = sig
    shutdown_service = registry.get_service(SERVICE_SHUTDOWN)
    if shutdown_service is not None:
        shutdown_service.notify()
Пример #25
0
 def __init__(self, use_db, restart_mode):
     '''
     Initialize the checkpoint manager.

     use_db -- when True, checkpoints are persisted to the TEAL database
         (SQL statements are prepared and a background updater started);
         otherwise checkpoints live in memory only
     restart_mode -- one of RESTART_MODES, or None
     Raises ConfigurationError for an unrecognized restart_mode.
     '''
     get_logger().debug('Initializing checkpoint manager use_db = {0}, restart_mode = {1}'.format(str(use_db), str(restart_mode)))
     # name -> EventCheckpoint for every registered checkpoint
     self.event_checkpoints = dict()
     self.use_db = use_db
     # Serializes allocation of checkpoint record ids
     self.chkpt_recid_lock = threading.Lock()
     self.event_checkpoint_rec_id = 0
     self.shutdown_recid = None
     
     # Validate restart mode
     if restart_mode is not None and restart_mode not in RESTART_MODES:
         raise ConfigurationError('Unrecognized restart mode specified: {0}'.format(restart_mode))
     self.restart_mode = restart_mode
     
     if use_db == True: 
         get_logger().debug('Checkpoint manager is using the DB')
         
         # Setup SQL strings (module-level so other functions can use them)
         db = get_service(SERVICE_DB_INTERFACE)
         
         #   Insert event row
         global _SQL_EVENT_CP_INSERT
         _SQL_EVENT_CP_INSERT = db.gen_insert(
                 [EVENT_CPF_CHKPT_ID, EVENT_CPF_NAME, EVENT_CPF_STATUS, EVENT_CPF_EVENT_RECID, EVENT_CPF_DATA],
                 db_interface.TABLE_CHECKPOINT)
         
         #   Get event row by checkpoint name
         global _SQL_EVENT_CP_SELECT_BY_NAME
         _SQL_EVENT_CP_SELECT_BY_NAME = db.gen_select(
                 [EVENT_CPF_CHKPT_ID, EVENT_CPF_NAME, EVENT_CPF_STATUS, EVENT_CPF_EVENT_RECID, EVENT_CPF_DATA],
                 db_interface.TABLE_CHECKPOINT, 
                 where='${0} = ?'.format(EVENT_CPF_NAME),
                 where_fields=[EVENT_CPF_NAME])
        
         #   Update event checkpoint by checkpoint id
         global _SQL_EVENT_CP_UPDATE_CHECKPOINT
         _SQL_EVENT_CP_UPDATE_CHECKPOINT = db.gen_update(
                 [EVENT_CPF_STATUS, EVENT_CPF_EVENT_RECID, EVENT_CPF_DATA],
                 db_interface.TABLE_CHECKPOINT, 
                 where='${0} = ?'.format(EVENT_CPF_CHKPT_ID),
                 where_fields=[EVENT_CPF_CHKPT_ID])
         
         # Determine the next event rec_id to use: resume after the highest
         # checkpoint id already present in the table (0 when table empty)
         cnxn = db.get_connection()
         cursor = cnxn.cursor()
         db.select_max(cursor, EVENT_CPF_CHKPT_ID, db_interface.TABLE_CHECKPOINT)
         row = cursor.fetchone()
         if row and row[0]:
             self.event_checkpoint_rec_id = row[0]
         get_logger().debug('Checkpoint Manager starting after rec_id = {0}'.format(self.event_checkpoint_rec_id))
         cnxn.close()
         
         # Setup for asynchronous update of checkpoints in DB: a daemon
         # thread woken through update_db_event
         self.update_db_event = threading.Event()
         self.t1 = CheckpointDBUpdater(self)
         self.t1.setDaemon(True)
         self.t1.start()
     return
Пример #26
0
    def init_cfg_service(self, config_file):
        """ Initialize the configuration service.

        config_file -- path to a single .conf file, a directory containing
            .conf files, or None to use the default <TEAL_CONF_DIR>/teal
            directory
        Returns a string describing how the configuration was resolved.
        Raises ConfigurationError when no configuration files are found.
        """
        conf_str = ''
        # Go get the configuration files from the default location
        if config_file is None:
            config_file = os.path.join(registry.get_service(TEAL_CONF_DIR),'teal')
            conf_str += 'None -> '
        # Need to create the list of files to pass to the configuration service
        # so determine if this is a file or directory to recover the proper set
        if os.path.isfile(config_file):
            conf_files = [config_file]
            conf_str += 'File -> {0}'.format(repr(config_file))
        elif os.path.isdir(config_file):
            # Find all the configuration files in the specified directory
            conf_qry = os.path.join(config_file,'*.conf')
            conf_files = glob.glob(conf_qry)
            # BUGFIX: append (was '=') so the 'None -> ' marker set by the
            # default-lookup branch above is preserved in the description
            conf_str += 'Dir -> {0}'.format(repr(config_file))
        else:
            conf_files = []

        if not conf_files:
            raise ConfigurationError('Configuration file/directory specification of \'{0}\' resulted in no configuration files'.format(config_file))

        registry.register_service(SERVICE_CONFIGURATION, Configuration(conf_files))
        return conf_str
Пример #27
0
 def analyze_event(self, event):
     ''' Turn every event from the SFP into an Alert.

     event -- the SFP event; its raw_data supplies the problem number,
         FRU list and reason text
     '''
     # Build the Alert directly from the event information
     # NOTE(review): eval() executes an arbitrary expression from the
     # event's raw data -- safe only if the SFP source is fully trusted;
     # consider ast.literal_eval instead
     raw_data_dict = {'Problem Number':event.raw_data[SFP_PROB_NUM],
                      'FRU List':eval(event.raw_data[SFP_FRU_LIST]),
                      'SFP':event.get_rpt_loc().get_comp_value('node')}
     
     alert_dict = {alert.ALERT_ATTR_SEVERITY:'E',
                   alert.ALERT_ATTR_URGENCY:'N',
                   alert.ALERT_ATTR_EVENT_LOC_OBJECT:event.get_src_loc(),
                   alert.ALERT_ATTR_RECOMMENDATION:SFP_RECOMMENDATION,
                   alert.ALERT_ATTR_REASON:event.raw_data[SFP_REASON],
                   alert.ALERT_ATTR_RAW_DATA:str(raw_data_dict),
                   alert.ALERT_ATTR_SRC_NAME:self.get_name(),
                   alert.ALERT_ATTR_CONDITION_EVENTS:set((event,))
                   }
     
     # Get the alert manager to create/allocate/commit the alert
     amgr = registry.get_service(registry.SERVICE_ALERT_MGR)
     sfp_alert = amgr.allocate(event.get_event_id(), in_dict=alert_dict)
     
     # Duplicate alerts are already handled by the HMC/SFP so we should not
     # dup them again since they are always different alerts
     amgr.commit(sfp_alert, disable_dup=True)
     
     # Now the alert is created and can be reported through the pipeline
     self.send_alert(sfp_alert)
Пример #28
0
 def shutdown(self):
     '''Stop running the event monitor.

     Clears the running flag, wakes and joins the monitor thread, then
     reports the last processed record id to the checkpoint manager.
     '''
     get_logger().debug('Starting shutdown')
     self.running = False
     # Wake the monitor thread so it notices running == False
     self.notifier.post()
     
     get_logger().debug('Joining thread')
     self.monitor_thread.join()
     
     last_processed_recid = self.start_recid
     if registry.get_service(SERVICE_SHUTDOWN_MODE) == SHUTDOWN_MODE_IMMEDIATE:
         # If immediate use the last one that was processed
         # NOTE(review): 'checkpointL' looks like a possible typo for a
         # checkpoint attribute -- verify this name against the class
         last_processed_recid = self.checkpointL.event_checkpoint.start_rec_id
     registry.get_service(SERVICE_CHECKPOINT_MGR).monitor_shutdown(last_processed_recid)
     get_logger().debug('Shutdown complete')
Пример #29
0
    def __init__(self, location_id, data):
        ''' Constructor.

        location_id -- id of the location definition to look up
        data -- location string, split on the definition's separator
        Raises TypeError when data is not a string. When creation or
        validation fails, behavior depends on TEAL_LOCATION_VALIDATION:
        'LOG' (default) logs and continues with an "unprocessable"
        instance; 'IMMEDIATE' re-raises the failure.
        '''
        if not(isinstance(data, str) or isinstance(data, unicode)):
            raise TypeError,"Invalid type of Location data: {0}.".format(type(data))

        loc_service = registry.get_service(SERVICE_LOCATION)
        try:
            self.location_info = loc_service[location_id]
            self.location_code = data.split(self.location_info.separator)

            # Location code is initialized, now validate it
            self._validate_location_code()
        except:
            # Environment variable decides whether a bad location is
            # tolerated (logged) or treated as a hard failure
            tmp_env = os.environ.get(TEAL_LOCATION_VALIDATION, 'LOG').upper()
            if tmp_env == 'LOG':
                get_logger().exception('LOGGING Location creation failure and continuing processing')
            elif tmp_env == 'IMMEDIATE':
                raise

            self.loc_id = location_id
            self.data = data

            # Remember what went wrong for later reporting
            self.ex_type, self.ex_value = sys.exc_info()[:2]

            # Swap the normal methods for *_UNPROCESSABLE stubs so callers
            # get consistent behavior from this broken location object
            self.is_unprocessable = self.is_unprocessable_UNPROCESSABLE
            self.new_location_by_scope = self._UNPROCESSABLE
            self.get_comp_value = self._UNPROCESSABLE
            self.get_substitution_dict = self._UNPROCESSABLE
            self.get_location = self.get_location_UNPROCESSABLE
            self.str_impl = self.str_impl_UNPROCESSABLE
            self.match = self.match_UNPROCESSABLE
            self.get_id = self.get_id_UNPROCESSABLE
        return
Пример #30
0
 def add_entries_before_restart(self, stop_teal=True):
     ''' Inject a set of events into TEAL and verify they are processed.

     stop_teal -- when True, TEAL is stopped before returning; otherwise
         stopping it is left to the caller
     '''
     self.start_teal('now')
     # Inject three events and wait for the analyzer to see them
     analyzer_journal = self.find_analyzer().journal
     inject_journal = Journal('Pre-populate','data/restart_test/three_events_one.json')
     # Truncate is testing that we handle the ckpt table being destroyed
     inject_journal.insert_in_db(no_delay=True, truncate=True)
     registry.get_service(registry.SERVICE_NOTIFIER).post()
     self.assertTrue(analyzer_journal.wait_for_entries(3))
     expected_journal = Journal('Expected', 'data/restart_test/three_events_one_fromq.json')
     self.assertTrue(analyzer_journal.deep_match(expected_journal, ignore_delay=True))
     # Optionally leave TEAL running for the caller
     if stop_teal:
         self.stop_teal()
Пример #31
0
 def executeQuery(self, query):
     ''' Execute *query* against the TEAL database and return every row.

     NOTE(review): the connection and cursor are intentionally left to
     the DB interface's lifetime management -- confirm it reclaims them.
     '''
     dbi = registry.get_service(SERVICE_DB_INTERFACE)
     connection = dbi.get_connection()
     result_cursor = connection.cursor()
     result_cursor.execute(query)
     return result_cursor.fetchall()
def get_eventList():
    '''Get the list of RAS events with hardware '_in error' control actions.

    Returns (msgIDs, msgidService) where msgidService maps each message
    id to its service-action description.
    '''
    # Search tbgqmsgtypes for ras events whose control action puts
    # hardware in error (excluding the software-in-error variant)
    schema = str(db_interface.TABLE_TEMPLATE).split('.')
    msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes'
    ctlaction_query = "select msg_id, svcaction from " + msgtypesTable + " where ctlaction is not NULL and ctlaction like '%_IN_ERROR%' and ctlaction not like '%SOFTWARE_IN_ERROR%'"
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    cursor.execute(ctlaction_query)
    msgIDs = []
    msgidService = {}
    for row in cursor.fetchall():
        msgid = row[0].strip()
        msgIDs.append(msgid)
        msgidService[msgid] = 'Service action: ' + str(row[1]).strip()

    # add in the bqc serdes analysis events that conditionally set
    # a service action of COMPUTE_IN_ERROR
    for serdes_id in ('00090213', '00090216'):
        msgIDs.append(serdes_id)
        msgidService[serdes_id] = 'Service action: COMPUTE_IN_ERROR'

    # check for control action overrides defined in the ras_environment_filter.xml
    config_service = registry.get_service('BGQ_CONFIG_SERVICE')
    for override in config_service.get('RasEventChangeSpec'):
        if override[1] != 'BG_CTL_ACT':
            continue
        if (override[2].find('_IN_ERROR') > 0) and (override[2].find('SOFTWARE_IN_ERROR') < 0):
            # Override adds an in-error action: include the event
            msgIDs.append(override[0])
            query = "select svcaction from " + msgtypesTable + " where msg_id = ?"
            cursor.execute(query, str(override[0]))
            row = cursor.fetchone()
            msgidService[override[0]] = 'Service action: ' + row[0]
        elif override[0] in msgIDs:
            # Override removes the in-error action: drop the event
            msgIDs.remove(override[0])

    return msgIDs, msgidService
def get_excludeMsgIDs():
    '''Get the list of excluded message IDs.
    '''
    cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    excludeMsgIDs = ''
    try:
        excludeMsgIDs = cfg.get(BGQ_TEAL_THRESHOLD_ANALYZER,
                                BGQ_TEAL_THRESHOLD_EXCLUDE_IDS)
        registry.get_logger().debug('Exclude List = ' + excludeMsgIDs)
    except Exception, e:
        registry.get_logger().debug(e)
Пример #34
0
    def analyze_event(self, event):
        '''Analyze a RAS event and determine whether the BQL threshold of errors has been reached or exceeded.

        event -- the RAS event to analyze; its raw_data supplies
            severity, serial number, ecid, block, jobid, message and
            rawdata fields
        '''
        msg_id = event.get_event_id()
        rec_id = event.get_rec_id()
        registry.get_logger().info("Analyzing msgid = " + msg_id +
                                   " recid = " + str(rec_id))

        # Strip the location-type prefix (first 3 chars) off the location
        location = str(event.get_src_loc())
        location = location[3:].strip()
        severity = event.raw_data['severity'].strip()
        serialnumber = event.raw_data['serialnumber']
        ecid = event.raw_data['ecid']
        event_time = event.get_time_logged()
        block = event.raw_data['block'].strip()
        jobid = event.raw_data['jobid']
        msgText = event.raw_data['message'].strip()
        rawdata = event.raw_data['rawdata'].strip()
        count = event.get_event_cnt()

        # Set threshold value for this message id
        threshold = self.msgidCount[msg_id]
        tmsg = "BQL error threshold of " + str(
            threshold) + " has been reached or exceeded, total count is "

        # check if thresholds have been reached or exceeded for events
        # within the configured threshold period for this message id
        xmsg = ""
        xmsg = " in a period of " + self.msgidPeriod[msg_id].strip()
        query = self.period_query.replace('PERIOD',
                                          self.msgidPeriod[msg_id].strip())
        query = query.replace('MYTIME', str(event_time))

        # search for events associated with this location's midplane or I/O board
        # (first 6 location characters identify the board, '%' matches the rest)
        qryloc = location.strip()[0:6] + '%'
        registry.get_logger().debug(query + " xmsgId=" + msg_id + " loc=" +
                                    qryloc + " ev_time=" + str(event_time))

        # Retry up to 5 times, refreshing the DB connection between
        # attempts; give up with an explicit error on the final failure
        msgCount = 0
        for x in range(5):
            try:
                self.cursor.execute(query, qryloc)
                row = self.cursor.fetchone()
                msgCount = row[0]
                break
            except Exception, e:
                registry.get_logger().debug(e)
                if x < 4:
                    dbi = registry.get_service(SERVICE_DB_INTERFACE)
                    self.dbConn = dbi.get_connection()
                    self.cursor = self.dbConn.cursor()
                else:
                    raise Exception(
                        'Error: bgq_BqlEventAnalyzer could not connect to the database'
                    )
Пример #35
0
    def send_common_alert(self, loc, cur_alert_recid, event, alert_time, dup_query, cursor):
        ''' Send an alert for the common location.

        Closes the current alert, builds a new alert against the parent
        location, and queues it for the alert analyzer.

        loc -- location the current alert was raised against
        cur_alert_recid -- record id of the alert to close first
        event -- triggering event, attached as the alert's condition event
        alert_time, dup_query, cursor -- no longer used here (the
            duplicate check that used them was removed)
        '''
        # Close current alert prior to creating a new common alert
        registry.get_logger().info('Closing current alert recid %d prior to creating a common mode alert', cur_alert_recid)
        registry.get_service(SERVICE_ALERT_MGR).close(cur_alert_recid)

        # Get the location and its parent (the common location)
        loc_name = self.get_loc_name(loc)
        loc_type = loc.get_id()
        loc_parent, loc_parent_list = self.get_loc_parent(loc)
        loc_parent_object = Location(loc_type, loc_parent)

        # Removed the duplicate check that was here -- this has already been determined

        # Fill in alert info, substituting the location names into the
        # configured reason/recommendation templates
        reason = self.reason.replace('LOC_NAME', loc_name)
        reason = reason.replace('LOC_PARENT', loc_parent)
        recommendation = self.recommendation.replace('LOC_PARENT', loc_parent)
        alert_dict = {alert.ALERT_ATTR_SEVERITY:self.severity,
                      alert.ALERT_ATTR_URGENCY:'I',
                      alert.ALERT_ATTR_EVENT_LOC_OBJECT:loc_parent_object,
                      alert.ALERT_ATTR_RECOMMENDATION:recommendation,
                      alert.ALERT_ATTR_REASON:reason,
                      alert.ALERT_ATTR_RAW_DATA:'No raw data',
                      alert.ALERT_ATTR_SRC_NAME:self.get_name(),
                      alert.ALERT_ATTR_CONDITION_EVENTS:set((event,))
                      }

        # Get the alert manager to create/allocate/commit the alert
        alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
        bg_alert = alertMgr.allocate(self.alertId, in_dict=alert_dict)
        alertMgr.commit(bg_alert, disable_dup=False)

        # Now the alert is created, need to put it in the queue so that it can be analyzed
        # by alert analyzer (instead of sending it - send_alert, which will get reported 
        # through the pipeline right away)
        registry.get_logger().info("Put alertId = %s with event recid = %d on the alert analyzer queue", self.alertId, event.get_rec_id())
        registry.get_service(SERVICE_ALERT_ANALYZER_Q).put(bg_alert)

        return
Пример #36
0
 def _configure(self):
     ''' Set the polling interval from the BGQ connector configuration.

     Falls back to BGQ_DEFAULT_POLL_INTERVAL when the option is missing,
     non-integer, or not greater than zero.
     '''
     cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
     try:
         value = cfg.get(BGQ_TEAL_CONFIG, BGQ_TEAL_CONFIG_POLL_INTERVAL)
         self.poll_interval = int(value)
         if self.poll_interval <= 0:
             registry.get_logger().error('The value ' + str(self.poll_interval) + ' specified in the poll interval is not valid. The value must be greater than zero.')
             # BUGFIX: a bare `raise` with no active exception is itself an
             # error; raise an explicit exception so the fallback path below
             # is reached intentionally rather than via RuntimeError
             raise ValueError('invalid poll interval')
     except:
         registry.get_logger().warn('Configuring poll interval to default {0} seconds'.format(BGQ_DEFAULT_POLL_INTERVAL))
         self.poll_interval = BGQ_DEFAULT_POLL_INTERVAL
Пример #37
0
    def __init__(self):
        ''' The constructor.

        Parses the ras environment filter file (ras_environment_filter.xml),
        when one is registered, into self.repository.
        '''
        # repository collects the filter entries keyed by the handler
        self.repository = dict()
        ras_filter = registry.get_service('BGQ_RAS_FILTER')
        # Idiom fix: identity test against None (was `!= None`); also only
        # build the SAX parser when there is actually a filter to parse
        if ras_filter is not None:
            parser = xml.sax.make_parser()
            filter_handler = ConfigRasEnvFilterHandler(self.repository)
            parser.setContentHandler(filter_handler)
            parser.parse(ras_filter)
Пример #38
0
def get_eventList():
    '''Get the list of RAS events with END_JOB control actions.

    Events already handled by the HardwareInError analyzer are excluded.
    Returns (msgIDs, msgidService) where msgidService maps each message
    id to its service-action description.
    '''
    # Search tbgqmsgtypes for ras events with an END_JOB control action,
    # skipping the hardware-in-error events handled elsewhere
    schema = str(db_interface.TABLE_TEMPLATE).split('.')
    msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes'
    endJob_query = "select msg_id, svcaction from " + msgtypesTable + " where ctlaction is not NULL and ctlaction like '%END_JOB%' and (ctlaction not like '%_IN_ERROR%' or ctlaction like '%SOFTWARE_IN_ERROR%')"
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    cursor.execute(endJob_query)
    msgIDs = []
    msgidService = {}
    for row in cursor.fetchall():
        msgid = row[0]
        msgIDs.append(msgid)
        msgidService[msgid] = 'Service action: ' + str(row[1]).strip()

    # check for control action overrides defined in the ras_environment_filter.xml
    config_service = registry.get_service('BGQ_CONFIG_SERVICE')
    for override in config_service.get('RasEventChangeSpec'):
        if override[1] != 'BG_CTL_ACT':
            continue
        if override[2].find('END_JOB') > 0:
            # Override adds an END_JOB action: include the event
            msgIDs.append(override[0])
            query = "select svcaction from " + msgtypesTable + " where msg_id = ?"
            cursor.execute(query, str(override[0]))
            row = cursor.fetchone()
            msgidService[override[0]] = 'Service action: ' + row[0]
        elif override[0] in msgIDs:
            # Override removes the END_JOB action: drop the event
            msgIDs.remove(override[0])

    return msgIDs, msgidService
Пример #39
0
    def _query_and_log_event(self, query_sign, recid, max_recid=0):
        ''' Query the BG event log for new events and log into TEAL.

        query_sign -- SQL comparison operator (applied to RECID) used to
            select new events relative to *recid*
        recid -- record id the RECID comparison is made against
        max_recid -- when nonzero, events with recid >= max_recid are
            left for a later pass
        '''
        registry.get_logger().debug("in _query_and_log_event")
        event_logged = False
        db = registry.get_service(registry.SERVICE_DB_INTERFACE)
        cnxn = db.get_connection()
        bgq_cursor = cnxn.cursor()
        teal_cursor = cnxn.cursor()
            
        # Query the BG event log for new events, oldest first
        bgEvent_query = "SELECT RECID, CATEGORY, COMPONENT, JOBID, BLOCK, LOCATION, MSG_ID FROM " + db_interface.TABLE_BG_EVENT_LOG + " WHERE RECID " + query_sign + " ? ORDER BY RECID ASC"
        bgq_cursor.execute(bgEvent_query, recid)
        commit_count = 0
        for bg_event in next_row(bgq_cursor):
            
            # Don't process events with recids >= max_recid, if it is nonzero
            if max_recid > 0 and bg_event[0] >= max_recid:
                break
                
            # Log only events we are interested in (bg_event[6] is MSG_ID)
            if bg_event[6] in self.msgIDs:
                event_logged = True
                
                # Log the event into TEAL
                self._log_event(bg_event, teal_cursor)
                      
                # Commit every so often to limit the transaction size
                commit_count += 1
                if commit_count == COMMIT_LIMIT:           
                    cnxn.commit()
                    commit_count = 0
            else:
                registry.get_logger().debug('ignore msgid ' + bg_event[6])

            # Update the 'cursor' into the BGQ database (bg_event[0] is RECID)
            self.last_processed_event = bg_event[0]
                    
        # Notify TEAL that events have been inserted (final commit picks up
        # any rows since the last batch commit)
        if (event_logged):
            registry.get_logger().debug("event to log " + str(event_logged))
            cnxn.commit()
                
            if self.notifier:
                self.notifier.post()
            else:
                registry.get_logger().warn('TEAL notifier not configured.')
        
        cnxn.close()
        registry.get_logger().debug("exit _query_and_log_event")
Пример #40
0
    def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
        '''The constructor.

        Configures the analyzer's severity, recommendation text and alert
        id, then reads the exclude list of message ids from configuration.
        Parameters are passed straight through to EventAnalyzer.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)

        self.severity = "W"
        self.recommendation = '''Schedule service to isolate the BQL issue.  Possible causes are environmental, cable, or a board.  Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly.  Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable.   The cable and board can be cleaned with an optics cleaning tool.  Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''
        self.alert_id = 'BQL01'

        # Get the exclude list of message IDs
        # NOTE(review): bare name 'excludeMsgIDs' is used as the option key
        # here -- confirm it is a defined module-level constant
        cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
        excludeList = ''
        try: 
            excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
            registry.get_logger().debug('Exclude List = ' + excludeList)
        except Exception, e:
            registry.get_logger().debug(e)
    def analyze_event(self, event):
        '''Analyze a RAS event and send an alert.

        Skips events generated by diagnostics; otherwise builds an
        HWERR01 alert from the event's raw data and reports it through
        the pipeline.
        '''
        msg_id = event.get_event_id()
        rec_id = event.get_rec_id()
        registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))

        # Exclude event logged from DIAG run
        if event.raw_data['diags'] == 'T':
            registry.get_logger().debug('RAS Event generated by Diagnostics, skip creating an alert')
            return

        # Fill in alert with appropriate data
        # BUGFIX: user-facing reason text read "The hardware been put";
        # corrected the grammar ("has been put")
        reason = "The hardware has been put in an error state.  \nRAS event details:" \
                 " message id = " + msg_id + \
                 ", recid = " + str(rec_id) + \
                 ", timestamp = " + str(event.get_time_occurred()) + \
                 ", serial number = " + str(event.raw_data['serialnumber']) + \
                 ", ecid = " + self.ecidString(event.raw_data['ecid']) + \
                 ", jobid = " + str(event.raw_data['jobid']) + \
                 ", block = " + str(event.raw_data['block'])
        raw_data = "RAS Message: " + event.raw_data['message']
        recommendation = self.recommendation + " " + self.msgidService[msg_id]

        alert_dict = {alert.ALERT_ATTR_SEVERITY:self.severity,
                      alert.ALERT_ATTR_URGENCY:'I',
                      alert.ALERT_ATTR_EVENT_LOC_OBJECT:event.get_src_loc(),
                      alert.ALERT_ATTR_RECOMMENDATION:recommendation,
                      alert.ALERT_ATTR_REASON:reason,
                      alert.ALERT_ATTR_RAW_DATA:raw_data,
                      alert.ALERT_ATTR_SRC_NAME:self.get_name(),
                      alert.ALERT_ATTR_CONDITION_EVENTS:set((event,))
                      }

        # Get the alert manager to create/allocate/commit the alert.
        # HMC-style duplicates are not suppressed here (disable_dup=False)
        alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
        alert_id = 'HWERR01'
        bg_alert = alertMgr.allocate(alert_id, in_dict=alert_dict)
        alertMgr.commit(bg_alert, disable_dup=False)

        # Now the alert is created and can be reported through the pipeline
        registry.get_logger().info("Sending alert for msgid = " + msg_id + " recid = " + str(rec_id))
        self.send_alert(bg_alert)
        return
def get_eventList():
    '''Get the list of RAS events that carry a threshold count.

    Events already handled by the HardwareInError and JobFatal analyzers
    are excluded, as are any ids in the configured exclude list. Returns
    (msgIDs, msgidService, msgidCount, msgidPeriod).
    '''
    # Get the exclude list of message IDs from the configuration file
    excludeMsgList = get_excludeMsgIDs()

    # Search tbgqmsgtypes for ras events that define a threshold count
    schema = str(db_interface.TABLE_TEMPLATE).split('.')
    msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes'
    count_query = "select msg_id, thresholdcount, svcaction, relevantdiags from " + msgtypesTable + " where thresholdcount is not NULL and (ctlaction is NULL or (ctlaction not like '%END_JOB%' and (ctlaction not like '%_IN_ERROR%' or ctlaction like '%SOFTWARE_IN_ERROR%')))"
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    cursor.execute(count_query)
    msgIDs = []
    msgidCount = {}
    msgidService = {}
    for row in cursor.fetchall():
        msgid = row[0].strip()
        if excludeMsgList.find(msgid) >= 0:
            registry.get_logger().debug(' excluding ' + msgid)
            continue
        msgIDs.append(msgid)
        msgidCount[msgid] = row[1]
        # Compose the service-action text from svcaction + relevantdiags
        action = 'Service action: '
        if row[2]:
            action += row[2].strip()
        else:
            action += "None."
        if row[3]:
            action += ' Relevant diagnostic bucket(s): ' + row[3].strip()
        msgidService[msgid] = action

    # Threshold periods are defined separately in the same table
    period_query = "select msg_id, thresholdperiod from " + msgtypesTable + " where thresholdperiod is not NULL"
    cursor.execute(period_query)
    msgidPeriod = {}
    for row in cursor.fetchall():
        msgidPeriod[row[0]] = row[1]

    return msgIDs, msgidService, msgidCount, msgidPeriod
Пример #43
0
    def _get_last_processed_event(self):
        ''' Determine the record id of the last event injected into TEAL.

        Sets self.last_processed_event to the highest REC_ID already in
        the extended event log (0 when the table is empty), so monitoring
        can resume from there.
        '''
        dbi = registry.get_service(registry.SERVICE_DB_INTERFACE)
        connection = dbi.get_connection()
        cursor = connection.cursor()

        # Find the last event injected into TEAL and then inject
        # all the events that have occurred since then
        cursor.execute("SELECT MAX(REC_ID) FROM " + db_interface.TABLE_EVENT_LOG_EXT)
        max_id = cursor.fetchone()[0]
        self.last_processed_event = 0 if max_id is None else max_id

        registry.get_logger().info('Last Processed Event = ' + str(self.last_processed_event))
        connection.close()
Пример #44
0
class bgqBqlEventAnalyzer(bgqBaseAnalyzer):
    '''The BqlEventAnalyzer class determines what action to take
    for BQL RAS events of interest.
    '''
    def __init__(self,
                 name,
                 inEventQueue,
                 outQueue,
                 config_dict=None,
                 number=0,
                 checkpoint=None):
        '''The constructor.

        Wires the analyzer into the event pipeline, loads the optional
        exclude list from configuration, and prepares the per-message-id
        thresholds and database queries used during analysis.
        '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict,
                               number, checkpoint)

        # Attributes attached to any BQL01 alert this analyzer raises
        self.severity = "W"
        self.recommendation = '''Schedule service to isolate the BQL issue.  Possible causes are environmental, cable, or a board.  Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly.  Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable.   The cable and board can be cleaned with an optics cleaning tool.  Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''
        self.alert_id = 'BQL01'

        # Get the (optional) exclude list of message IDs from configuration
        cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
        excludeList = ''
        try:
            excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
            registry.get_logger().debug('Exclude List = ' + excludeList)
        except Exception as e:
            # Fix: was py2-only "except Exception, e"; "as" works on 2.6+/3.x.
            # Missing configuration is non-fatal: keep the empty exclude list.
            registry.get_logger().debug(e)

        # ras events that have BQL_SPARE detail data
        self.msgIDs = get_eventList()
        for msgid in self.msgIDs:
            registry.get_logger().debug('msgId = ' + msgid)

        # Per-message-id alert threshold (event count needed to trigger)
        self.msgidCount = {
            '00090200': 2,
            '00090201': 1,
            '00090202': 1,
            '00090210': 4,
            '00090211': 4,
        }

        # Threshold period per message id; the event query windows
        # +/- one period around the event time (2x the period overall)
        self.msgidPeriod = dict.fromkeys(
            ('00090200', '00090201', '00090202', '00090210', '00090211'),
            '11 seconds')

        # BQL related ras events
        self.bqlIDs = list()

        # Query for the count of recent BQL events at a location within a
        # window around the event time.
        #    parameter 1 = location
        #    MYTIME / PERIOD are substituted into the text before execution
        eventTable = self.appendSchema('tbgqeventlog')
        self.period_query = "select count(*) from " + eventTable + " where location like ? and category='BQL' and event_time <=  (timestamp('MYTIME') + PERIOD) and event_time > (timestamp('MYTIME') - PERIOD)"

        # Query for the count of open BQL01 alerts at a location within
        # self.alert_period before the event time.
        #    parameter 1 = location
        #    MYTIME / PERIOD are substituted into the text before execution
        alertTable = self.appendSchema('x_tealalertlog')
        self.alert_period = '1 day'
        self.alert_query = "select count(*) from " + alertTable + " where \"alert_id\"='BQL01' and \"event_loc\"= ? and \"creation_time\" >= (timestamp('MYTIME') - PERIOD) and \"state\"=1"

        # database connection and cursor, reused across analyses
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        self.dbConn = dbi.get_connection()
        self.cursor = self.dbConn.cursor()

        return
Пример #45
0
    def analyze_alert(self, alert):
        '''Analyze an alert and decide how to dispose of it.

        * BQL01 alerts and alerts at a rack location need no analysis and are
          passed straight to the delivery queue.
        * ENDJOB01/THRES01 alerts are closed when a prior alert exists for
          the same block id, otherwise delivered.
        * Remaining ids (HWERR01/COMMON01) are closed when a prior alert
          covers the same or a parent location; otherwise a common-mode alert
          may be sent when multiple locations on the same hardware report.
        '''
        alert_recId = alert.get_rec_id()
        alert_id = alert.get_incident_id()
        loc_type = alert.event_loc.get_id()
        location = alert.event_loc.get_location()
        registry.get_logger().info('Analyzing alert id %d loc_type: %s: %s', alert_recId, loc_type, location)

        # There should only be one condition event associated with the alert.
        events = alert.condition_events
        if len(events) == 0:
            registry.get_logger().error('No event associated with the alert recid %d', alert_recId)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return
        event = events.pop()

        if (alert_id == 'BQL01'):
            # No need to analyze BQL01 alerts, just pass it to the delivery queue
            registry.get_logger().info('Nothing to analyze for alert id %s ', alert_id)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return

        # Get the location
        loc = Location(loc_type, location)
        locName = self.get_loc_name(loc)

        # No need to analyze alert with rack location
        alert_time = str(alert.get_time_occurred())
        if locName == 'rack':
            registry.get_logger().info('Nothing to analyze for alert recid %d with rack location', alert_recId)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return

        # Find out if there are other alerts with the same block id (for ENDJOB01 and THRES01)
        if (alert_id == 'ENDJOB01' or alert_id == 'THRES01'):
            if event.raw_data['block'] is None:
                event_block = None
            else:
                event_block = event.raw_data['block'].strip()

            if event_block is None or event_block == BGQ_EVENT_NULL_BLOCK:
                # No usable block id: deliver the current alert unanalyzed
                registry.get_logger().info('No block id for alert id %d, no common alert generated for block: %s', alert_recId, event_block)
                registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
                return

            # Get db connection needed for query
            dbi = registry.get_service(SERVICE_DB_INTERFACE)
            dbConn = dbi.get_connection()
            cursor = dbConn.cursor()

            # Look for a prior alert (HWERR01/COMMON01, and for THRES01 also
            # ENDJOB01) with the same block id.
            # Fix: the original branched on ENDJOB01 vs THRES01 here but both
            # branches issued the identical call, so the branch was collapsed.
            same_block = self.has_matching_blockId(event_block, alert_time, cursor)

            if same_block:
                # Found prior alert with the same block id, close current alert
                registry.get_logger().info('Closing current alert recid %d due to prior alert with the same block id', alert_recId)
                registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
            else:
                # Found no prior alert with the same block id, pass current alert to the delivery queue
                registry.get_logger().info('No common block id found for alert id %d within the last %s', alert_recId, self.window_time)
                registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)

            return

        # The following handles the remaining alert ids (HWERR01 or COMMON01).
        # Fix: an unreachable duplicate "elif alert_id == 'BQL01'" branch was
        # removed here — BQL01 alerts already returned above.
        # Find out if a common mode alert already exists for the same
        # location or higher in the hardware hierarchy.
        loc_parent, loc_parent_list = self.get_loc_parent(loc)
        loc_qry = '('
        idx = 0
        for pLoc in loc_parent_list:
            if idx != 0:
                loc_qry += " or "
            loc_qry += " \"event_loc\" like '" + pLoc + "'"
            idx += 1

        # dup_qry2 matches parent locations only; dup_qry also matches the
        # alert's own location.
        dup_qry2 = self.dup_query + loc_qry + ")"
        loc_qry += " or \"event_loc\" like '" + location + "')"
        dup_qry = self.dup_query + loc_qry
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        dbConn = dbi.get_connection()
        cursor = dbConn.cursor()

        dup = self.has_duplicate(alert_time, dup_qry, cursor)
        if dup:
            # A prior alert already covers this (or a parent) location
            registry.get_logger().info('Closing current alert recid %d due to prior alert with same common location', alert_recId)
            registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
            return

        # Look for a common hardware problem if there are multiple alerts for different location
        # on the same hardware.
        sendAlert = self.has_common_location(loc, alert_time, self.query, cursor)
        if sendAlert:
            # Send commmon alert
            self.send_common_alert(loc, alert_recId, event, alert_time, dup_qry2, cursor)
        else:
            # Pass current alert to the delivery queue
            registry.get_logger().info('No common location for %s found for alert id: %d within the last %s ', location, alert_recId, self.window_time)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)

        return
Пример #46
0
        # (fragment — enclosing def begins before this view)
        # Substitute the event time into the alert-count query template.
        aquery = aquery.replace('MYTIME', str(event_time))
        registry.get_logger().debug(aquery + " xmsgId=" + msg_id + " loc=" +
                                    location.strip() + " ev_time=" +
                                    str(event_time))

        msgCount = 0
        # Run the query with up to 4 retries; on failure, re-acquire the
        # cached connection/cursor in case the DB connection went stale.
        for x in range(5):
            try:
                self.cursor.execute(aquery, location.strip())
                row = self.cursor.fetchone()
                msgCount = row[0]
                break
            except Exception, e:
                registry.get_logger().debug(e)
                if x < 4:
                    # refresh the instance-level connection and retry
                    dbi = registry.get_service(SERVICE_DB_INTERFACE)
                    self.dbConn = dbi.get_connection()
                    self.cursor = self.dbConn.cursor()
                else:
                    # all retries exhausted — give up loudly
                    raise Exception(
                        'Error: bgq_BqlEventAnalyzer could not connect to the database'
                    )

        # do not log more than one BQL alert per day for the same location
        if msgCount > 0:
            registry.get_logger().debug("An active BQL01 alert for location " +
                                        location.strip() +
                                        " exist within a period of " +
                                        self.alert_period +
                                        ". Skip logging a duplicate.")
            return
Пример #47
0
    
    if options.run_as_daemon:
        # Do the necessary processing to spin off as a daemon
        command.daemonize('teal_bgq')
    else:
        # Allow the user to CTRL-C application and shutdown cleanly        
        signal.signal(signal.SIGINT, app_terminate)    # CTRL-C
    
    if options.log_file is None:
        log_file = '$TEAL_LOG_DIR/teal_bg.log'
    else:
        log_file = options.log_file
        
    # Set up the TEAL environment to get at the data required for logging
    t = teal.Teal(None,
                  data_only=True,
                  msgLevel=options.msg_level,
                  logFile=log_file,
                  daemon_mode=options.run_as_daemon)
            
    # Create the connector and start it
    bgcon = BgqConnector()
    bgcon.setDaemon(True)
    bgcon.start()
        
    # Wait for Teal to shutdown before exiting
    shutdown = registry.get_service(registry.SERVICE_SHUTDOWN)
    shutdown.wait()


Пример #48
0
    def analyze_event(self, event):
        '''Analyze a fatal RAS event and raise an ENDJOB01 alert for it.

        Events produced during diagnostics runs are ignored, and at most one
        alert is raised per job id: a small ring of recently handled job ids
        is kept on the instance and consulted before alerting.
        '''
        log = registry.get_logger()
        msg_id = event.get_event_id()
        rec_id = event.get_rec_id()
        log.info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))

        # Exclude event logged from DIAG run
        if event.raw_data['diags'] == 'T':
            log.debug('RAS Event generated by Diagnostics, skip creating an alert')
            return

        jobid = event.raw_data['jobid']
        if jobid:
            # Skip this event if the job was already handled
            if jobid in self.ring:
                log.info('Alert is not sent for msgid ' + msg_id +
                         ' recid ' + str(rec_id) + ' because jobid ' +
                         str(jobid) + ' is already handled')
                return
            # Remember this jobid, dropping the oldest entry in the ring
            self.ring.pop(0)
            self.ring.append(jobid)

        # Assemble the human-readable reason text from the event details
        reason = ''.join([
            "The fatal RAS event caused the job to end.  \nRAS event details:",
            " message id = ", msg_id,
            ", recid = ", str(rec_id),
            ", timestamp = ", str(event.get_time_occurred()),
            ", serial number = ", str(event.raw_data['serialnumber']),
            ", ecid = ", self.ecidString(event.raw_data['ecid']),
            ", jobid = ", str(jobid),
            ", block = ", str(event.raw_data['block']),
        ])
        raw_data = "RAS Message: " + event.raw_data['message']
        recommendation = self.recommendation + " " + self.msgidService[msg_id]

        alert_dict = {
            alert.ALERT_ATTR_SEVERITY: self.severity,
            alert.ALERT_ATTR_URGENCY: 'I',
            alert.ALERT_ATTR_EVENT_LOC_OBJECT: event.get_src_loc(),
            alert.ALERT_ATTR_RECOMMENDATION: recommendation,
            alert.ALERT_ATTR_REASON: reason,
            alert.ALERT_ATTR_RAW_DATA: raw_data,
            alert.ALERT_ATTR_SRC_NAME: self.get_name(),
            alert.ALERT_ATTR_CONDITION_EVENTS: set((event, ))
        }

        # Have the alert manager create/allocate/commit the ENDJOB01 alert
        alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
        bg_alert = alertMgr.allocate('ENDJOB01', in_dict=alert_dict)
        alertMgr.commit(bg_alert, disable_dup=False)

        # The alert now exists and can be reported through the pipeline
        log.info("Sending alert for msgid = " + msg_id + " recid = " + str(rec_id))
        self.send_alert(bg_alert)

        return
    def analyze_event(self, event):
        '''Analyze a RAS event and determine whether threshold has been
        reached or exceeded.

        Counts matching events in the database (optionally within a
        threshold period, optionally over several consecutive periods) and
        raises a THRES01 alert when the count meets the per-message-id
        threshold.

        NOTE(review): this is the second ``analyze_event`` at this
        indentation in the file; if both are in the same class the later
        definition shadows the earlier one — confirm intended.
        '''
        msg_id = event.get_event_id()
        rec_id = event.get_rec_id()
        registry.get_logger().info("Analyzing msgid = " + msg_id +
                                   " recid = " + str(rec_id))

        count = event.get_event_cnt()
        event_time = event.get_time_logged()

        # Drop the first 3 characters of the source location
        # (presumably a location-type prefix — TODO confirm).
        location = str(event.get_src_loc())
        location = location[3:].strip()
        # Exclude event logged from DIAG run
        if event.raw_data['diags'] == 'T':
            registry.get_logger().debug(
                'RAS Event generated by Diagnostics, skip creating an alert')
            return

        # Set threshold value
        threshold = self.msgidCount[msg_id]
        tmsg = "Error threshold of " + str(
            threshold) + " has been reached or exceeded, total count is "

        # Pick the counting query: period-bounded when the message id has a
        # threshold period, unbounded otherwise; the "2" variants are used
        # when the event carries its own occurrence count.
        xmsg = ""
        query = self.period_query
        if msg_id in self.msgidPeriod:
            # Query for the count of the RAS event with threhold period
            xmsg = " in a period of " + self.msgidPeriod[msg_id].strip()
            query = self.period_query
            if count:
                query = self.period_query2
            query = query.replace('PERIOD', self.msgidPeriod[msg_id].strip())
            query = query.replace('MYTIME', str(event_time))
        else:
            # Query for the count of the RAS event without threshold period
            query = self.count_query
            if count:
                query = self.count_query2

        # Substitute the SN and LOC placeholders in the query template
        if event.raw_data['serialnumber'] is not None:
            serialnumber = event.raw_data['serialnumber'].strip()
            sn = "= '" + serialnumber + "'"
            query = query.replace('SN', sn)
        else:
            serialnumber = None
            query = query.replace('SN', 'is NULL')
        if location:
            loc = "= '" + location + "'"
            query = query.replace('LOC', loc)
        else:
            query = query.replace('LOC', 'is NULL')

        registry.get_logger().debug(query + " msgId=" + msg_id +
                                    " event_time=" + str(event_time))

        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        dbConn = dbi.get_connection()
        cursor = dbConn.cursor()
        cursor.execute(query, msg_id, event_time)
        row = cursor.fetchone()
        msgCount = row[0]

        # Below threshold in the (first) period: no alert
        if msgCount < threshold:
            registry.get_logger().info("Alert is not sent for msgid " +
                                       msg_id + " recid " + str(rec_id) +
                                       " because the count " + str(msgCount) +
                                       " is less than the threshold " +
                                       str(threshold) + ".")
            return

        # Some message ids must exceed the threshold in M consecutive
        # periods; re-run the period query for each earlier period and bail
        # out as soon as one of them is under threshold.
        if msg_id in self.msgidConsecutivePeriods:
            # repeat the query for M-1 more periods
            numPeriods = self.msgidConsecutivePeriods[msg_id]
            numPeriods = numPeriods - 1
            period = self.msgidPeriod[msg_id].strip()
            period = period.strip()
            deltaPeriod = self.delta_period(period)
            registry.get_logger().debug(
                "Checking whether " + msg_id + " recid " + str(rec_id) +
                " has exceeded its threshold for " +
                str(self.msgidConsecutivePeriods[msg_id]) +
                " consecutive periods of " + period + ". Delta period = " +
                str(deltaPeriod))
            qry_time = event_time
            for nums in range(numPeriods):
                # Rebuild the query from the template for each step back
                query = self.period_query2
                if serialnumber:
                    sn = "= '" + serialnumber + "'"
                    query = query.replace('SN', sn)
                else:
                    query = query.replace('SN', 'is NULL')
                if location:
                    loc = "= '" + location + "'"
                    query = query.replace('LOC', loc)
                else:
                    query = query.replace('LOC', 'is NULL')
                query = query.replace('PERIOD', period)
                # Step the window one period further into the past
                qry_time = qry_time - deltaPeriod
                query = query.replace('MYTIME', str(qry_time))
                registry.get_logger().debug(query + " msgId=" + msg_id +
                                            " event_time=" + str(qry_time))
                cursor.execute(query, msg_id, qry_time)
                row = cursor.fetchone()
                msgCount = row[0]
                if msgCount < threshold:
                    registry.get_logger().info("Alert is not sent for msgid " +
                                               msg_id + " recid " +
                                               str(rec_id) +
                                               " because the count " +
                                               str(msgCount) +
                                               " is less than the threshold " +
                                               str(threshold) + ".  period=" +
                                               str(nums + 2) + " of " +
                                               str(numPeriods + 1))
                    return
                else:
                    registry.get_logger().debug("Threshold exceeded " +
                                                msg_id + " recid " +
                                                str(rec_id) +
                                                " for consecutive period " +
                                                str(nums + 2) + " of " +
                                                str(numPeriods + 1))

        # Special case: suppress alerts for 00040020 when the message carries
        # one of the known-benign StatusWord values.
        msgText = event.raw_data['message'].strip()
        if msg_id == '00040020':
            skipAlert = False
            index = msgText.find('StatusWord=0x0002')
            if (index >= 0):
                skipAlert = True
            else:
                index = msgText.find('StatusWord=0x4001')
                if (index >= 0):
                    skipAlert = True
                else:
                    index = msgText.find('StatusWord=0x4005')
                    if (index >= 0):
                        skipAlert = True
            if (skipAlert):
                registry.get_logger().debug("Supressing alert for " + msg_id +
                                            " because " +
                                            msgText[index:index +
                                                    len('StatusWord=0x0002')])
                return

        # Assemble the alert text from the event details
        tmsg = tmsg + str(msgCount) + xmsg
        reason = tmsg + "\nRAS event details:" \
                 " message id = " + msg_id + \
                 ", recid = " + str(rec_id) + \
                 ", timestamp = " + str(event.get_time_occurred()) + \
                 ", serial number = " + str(event.raw_data['serialnumber']) + \
                 ", ecid = " + self.ecidString(event.raw_data['ecid']) + \
                 ", jobid = " + str(event.raw_data['jobid']) + \
                 ", block = " + str(event.raw_data['block'])

        rasMessage = "RAS Message: " + msgText

        recommendation = self.recommendation
        if location:
            recommendation = recommendation + " Schedule part replacement if this is hardware problem. " + self.msgidService[
                msg_id]
        else:
            recommendation = recommendation + " " + self.msgidService[msg_id]

        alert_dict = {
            alert.ALERT_ATTR_SEVERITY: self.severity,
            alert.ALERT_ATTR_URGENCY: 'I',
            alert.ALERT_ATTR_EVENT_LOC_OBJECT: event.get_src_loc(),
            alert.ALERT_ATTR_RECOMMENDATION: recommendation,
            alert.ALERT_ATTR_REASON: reason,
            alert.ALERT_ATTR_RAW_DATA: rasMessage,
            alert.ALERT_ATTR_SRC_NAME: self.get_name(),
            alert.ALERT_ATTR_CONDITION_EVENTS: set((event, ))
        }

        # Get the alert manager to create/allocate/commit the alert
        alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
        alert_id = 'THRES01'
        bg_alert = alertMgr.allocate(alert_id, in_dict=alert_dict)
        alertMgr.commit(bg_alert, disable_dup=False)

        # Now the alert is created and can be reported through the pipeline
        registry.get_logger().info("Sending alert for msgid = " + msg_id +
                                   " recid = " + str(rec_id))
        self.send_alert(bg_alert)
        return