def testGeneralFilters(self):
    """Test alert delivery with global and local filtering.

    Injects a canned journal of alerts into the delivery queue and checks
    that each named listener received exactly the alerts its filter allows.
    """
    j_in_dq = Journal("j_in_DQ", "data/alert_delivery_test/data_sample_inject_DQ.json")
    dq_q = get_service(SERVICE_ALERT_DELIVERY_Q)
    # Get the AlertListenerJournal journals; pre-initialize so a missing
    # listener fails with a clear error instead of an unbound local.
    j_out_all = None
    j_out_alert_id = None
    j_out_ai_urgent = None
    listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
    for listener in listeners:
        name = listener.get_name()
        if name == "AllAlerts":
            j_out_all = listener.journal
        elif name == "OnlyAlertId":
            j_out_alert_id = listener.journal
        elif name == "OnlyAlertIdUrgent":
            j_out_ai_urgent = listener.journal
    # inject
    j_in_dq.inject_queue(dq_q)
    # wait for stuff to come out of each filtered listener
    self.assertTrue(j_out_all.wait_for_entries(5))
    self.assertTrue(j_out_alert_id.wait_for_entries(3))
    self.assertTrue(j_out_ai_urgent.wait_for_entries(2))
    # Compare each output journal against its expected journal
    j_out_all_exp = Journal("j_out_all_exp", "data/alert_delivery_test/data_sample_out_all_alerts.json")
    self.assertTrue(j_out_all.deep_match(j_out_all_exp, ignore_delay=True, ignore_times=True))
    j_out_alert_id_exp = Journal("j_out_alert_id_exp", "data/alert_delivery_test/data_sample_out_alert_id.json")
    self.assertTrue(j_out_alert_id.deep_match(j_out_alert_id_exp, ignore_delay=True, ignore_times=True))
    j_out_ai_urgent_exp = Journal("j_out_ai_urgent_exp", "data/alert_delivery_test/data_sample_out_ai_urgent.json")
    self.assertTrue(j_out_ai_urgent.deep_match(j_out_ai_urgent_exp, ignore_delay=True, ignore_times=True))
    return
def testDBtruncate(self):
    ''' Test that the DB tables correctly truncate '''
    def check_all_tables(expected_checkpoints):
        # Event/alert tables must be empty after truncation; the checkpoint
        # table row count differs depending on commit_checkpoints.
        self._check_rows(db_interface.TABLE_EVENT_LOG, 0)
        self._check_rows(db_interface.TABLE_ALERT_LOG, 0)
        self._check_rows(db_interface.TABLE_ALERT2ALERT, 0)
        self._check_rows(db_interface.TABLE_CHECKPOINT, expected_checkpoints)
    # Truncate the tables (first pass: no checkpoints committed)
    self.prepare_db()
    teal = Teal('data/aaaa_assumptions_test/minimal.conf', 'stderr', msgLevel=self.msglevel,
                data_only=True, commit_alerts=False, commit_checkpoints=False)
    self.dbi = registry.get_service(SERVICE_DB_INTERFACE)
    self.cnxn = self.dbi.get_connection()
    self.cursor = self.cnxn.cursor()
    # Check the tables
    check_all_tables(0)
    self.cnxn.close()
    teal.shutdown()
    # Second pass: default commit behavior leaves one checkpoint row
    teal = Teal('data/aaaa_assumptions_test/minimal.conf', 'stderr', msgLevel=self.msglevel)
    self.dbi = registry.get_service(SERVICE_DB_INTERFACE)
    self.cnxn = self.dbi.get_connection()
    self.cursor = self.cnxn.cursor()
    # Check the tables
    check_all_tables(1)
    self.cnxn.close()
    teal.shutdown()
def init_db_interface(self, daemon_mode, run_mode):
    ''' Setup the underlying data store connection.

    Registers the configured DB interface plugin as SERVICE_DB_INTERFACE.
    In daemon mode, retries connecting for up to 180 seconds because this
    may run during IPL when startup order is not guaranteed; raises
    TealError with the last connection exception on timeout.
    '''
    cf_reg = registry.get_service(SERVICE_CONFIGURATION)
    for data_store in self.load_plugins(CONFIG_DB_INTERFACE, run_mode, singleton=True):
        registry.register_service(SERVICE_DB_INTERFACE, data_store[0](dict(cf_reg.items(data_store[2]))))
    if daemon_mode:
        # Make sure the DB is up and running before we continue, since this might be
        # being invoked during IPL and order of startup is not guaranteed
        timeout = 180
        db_exception = None
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        while timeout > 0:
            try:
                cnxn = dbi.get_connection()
                cnxn.close()
                break
            except Exception as e:  # fix: modern except syntax (Python 2.6+/3 compatible)
                db_exception = e
                time.sleep(3)
                timeout -= 3
        if timeout <= 0:
            raise TealError("Cannot connect to database: {0}".format(db_exception))
def __init__(self, name, inEventQueue, inAlertQueue, outQueue, config_dict=None, number=0):
    ''' The constructor.

    Initializes the common-mode alert analyzer: fixed COMMON01 alert
    attributes, the alert-manager/configuration services, the analysis
    window and threshold, and the SQL query strings used to look up
    related alerts and events.
    '''
    AlertAnalyzer.__init__(self, name, inEventQueue, inAlertQueue, outQueue, config_dict, number)
    # Common mode alert info.  LOC_PARENT / LOC_NAME are placeholder tokens,
    # presumably substituted later when the alert is raised -- TODO confirm.
    self.alertId = 'COMMON01'
    self.severity = 'W'
    self.recommendation = "Check the environmental monitor data for LOC_PARENT."
    self.reason = "Multiple alerts have been logged against LOC_NAMEs on LOC_PARENT. The cause may be due to a common mode failure. Analyze LOC_PARENT environmental data for abnormalities prior to replacing individual LOC_NAMEs."
    # Get alert manager
    self.alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    # Get the configuration info for alert analyzer
    self.cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    self.window_time = self.get_window_time()
    self.threshold = self.get_threshold()
    # Alert table query: derive the schema prefix from the table template
    # (text before the first '.') and build fully-qualified table names.
    schema = str(db_interface.TABLE_TEMPLATE).split('.')
    alertTable = schema[0] + '.x_tealalertlog'
    alert2eventTable = schema[0] + '.x_tealalert2event'
    eventTable = schema[0] + '.tbgqeventlog'
    # Time-window predicate with ALERT_TIME / WINDOW placeholder tokens,
    # presumably replaced via string substitution before execution -- TODO confirm.
    query_time_window = "\"creation_time\" >= (timestamp('ALERT_TIME') - WINDOW) and \"creation_time\" < timestamp('ALERT_TIME')"
    # Open alerts against child locations (excluding the alert's own location)
    # inside the window.
    self.query = "select \"event_loc\" from " + alertTable + " where \"state\" = 1 and \"event_loc\" like 'PLOC%' and \"event_loc\" not like 'LOCATION' and " + query_time_window
    # Existing open COMMON01 alerts in the window (duplicate detection);
    # callers append an additional predicate after the trailing 'and'.
    self.dup_query = "select \"event_loc\" from " + alertTable + " where \"state\" = 1 and \"alert_id\" = 'COMMON01' and " + query_time_window + " and "
    # Record ids of open COMMON01/HWERR01 alerts in the window.
    self.alert_recid_query1 = "select \"rec_id\" from " + alertTable + " where \"state\" = 1 and (\"alert_id\" = 'COMMON01' or \"alert_id\" = 'HWERR01') and " + query_time_window
    # Same as above but also including ENDJOB01 alerts.
    self.alert_recid_query2 = "select \"rec_id\" from " + alertTable + " where \"state\" = 1 and (\"alert_id\" = 'COMMON01' or \"alert_id\" = 'HWERR01' or \"alert_id\" = 'ENDJOB01') and " + query_time_window
    # Events associated with a given alert record.
    self.event_recid_query = "select \"t_event_recid\" from " + alert2eventTable + " where \"alert_recid\" = ?"
    # Block id for a given event record.
    self.event_block_id_query = "select block from " + eventTable + " where recid = ?"
    # Parameterized ({0}=time, {1}=window, {2}=block) count of matching open
    # alerts whose events share the given block.
    query_time_window_str = "\"creation_time\" >= (timestamp('{0}') - {1}) and \"creation_time\" < timestamp('{0}')"
    self.same_block_query_str = "select count(*) from " + alertTable + " where \"state\" = 1 and (\"alert_id\" = 'COMMON01' or \"alert_id\" = 'HWERR01' or \"alert_id\" = 'ENDJOB01') and " + query_time_window_str + " and \"rec_id\" in (select \"alert_recid\" from " + alert2eventTable + " where \"t_event_recid\" in (select recid from " + eventTable + " where block = '{2}'))"
    return
def testGeneralFilters(self):
    """test alert delivery with global and local filtering"""
    inject_journal = Journal("j_in_DQ", "data/alert_delivery_test/listener_failure/inject_DQ_alerts.json")
    delivery_q = get_service(SERVICE_ALERT_DELIVERY_Q)
    # Locate the AlertListenerJournal journals by listener name
    for lsnr in get_service(SERVICE_ALERT_DELIVERY).listeners:
        lsnr_name = lsnr.get_name()
        if lsnr_name == "AllAlerts":
            j_out_all = lsnr.journal
        elif lsnr_name == "OnlyAnalyzer1":
            j_out_analyzer1 = lsnr.journal
    # inject the canned alerts
    inject_journal.inject_queue(delivery_q)
    # Create a TEAL alert on top of the injected ones
    create_teal_alert("XXXXXXXX", "no reason at all", "medium well", loc_instance="YYY")
    # Load the expected journals
    j_out_all_exp = Journal("all_exp", "data/alert_delivery_test/analyzer_filter/alerts_out_all.json")
    j_out_analyzer1_exp = Journal("analyzer1", "data/alert_delivery_test/analyzer_filter/alerts_out_analyzer1.json")
    # wait for everything (expected plus the 3 extras) to come out
    self.assertTrue(j_out_all.wait_for_entries(len(j_out_all_exp) + 3))
    self.assertTrue(j_out_analyzer1.wait_for_entries(len(j_out_analyzer1_exp)))
    # The created alert's location is unique per machine/run, so the "all"
    # journal cannot be deep-matched -- just verify only 3 extra entries.
    self.assertEqual(len(j_out_all) - len(j_out_all_exp), 3)
    self.assertTrue(j_out_analyzer1.deep_match(j_out_analyzer1_exp, ignore_delay=True, ignore_times=True))
    return
def alert_not_analyzed_callback(self, alert):
    ''' When an alert is not handled in the alert analyzer queue pass it to the filter queue'''
    if not isinstance(alert, Alert):
        # Non-alert objects (commands) also flow through; just log them
        get_logger().debug('Command {0} was processed by the Alert Analysis Queue'.format(alert.brief_str()))
        return
    # Unanalyzed alerts are forwarded straight to the delivery queue
    get_logger().debug('Alert {0} was not analyzed in Alert Analysis Queue -- put in Delivery Queue'.format(alert.brief_str()))
    registry.get_service(SERVICE_ALERT_DELIVERY_Q).put_nowait(alert)
def testDemo1EventQ(self):
    '''Test that the first demo flow works -- Inject Event Q'''
    self.teal = Teal('data/teal_test/configurationtest_05_auto.conf', 'stderr',
                     msgLevel=self.msglevel, commit_alerts=False,
                     commit_checkpoints=False, run_mode=TEAL_RUN_MODE_HISTORIC)
    # Input journal plus capture journals for each pipeline stage
    j_in = Journal('j_in', file='data/demo/data_sample_demo_NEW_001.json')
    j_out_aaq = Journal('j_out_aaq')
    j_out_dq = Journal('j_out_dq')
    j_out_lis = Journal('j_out_lis')
    q_in = registry.get_service(SERVICE_EVENT_Q)
    q_out_aaq = registry.get_service(SERVICE_ALERT_ANALYZER_Q)
    q_out_dq = registry.get_service(SERVICE_ALERT_DELIVERY_Q)
    q_out_dq.register_listener(j_out_dq)
    q_out_aaq.register_listener(j_out_aaq)
    # The delivery-stage journal comes from the configured listener
    for lsnr in get_service(SERVICE_ALERT_DELIVERY).listeners:
        if lsnr.get_name() == 'outputJournal':
            j_out_lis = lsnr.journal
    j_in.inject_queue(q_in)
    self.assertTrue(j_out_lis.wait_for_entries(3))
    # Compare each captured journal against its expected journal
    for captured, exp_name, exp_file in (
            (j_out_aaq, 'j_exp_aaq', 'data/teal_test/data_sample_demo_NEW_001_AAQ_Result.json'),
            (j_out_dq, 'j_exp_dq', 'data/teal_test/data_sample_demo_NEW_001_DQ_Result.json'),
            (j_out_lis, 'j_exp_lis', 'data/teal_test/data_sample_demo_NEW_001_LIS_Result.json')):
        expected = Journal(exp_name, exp_file)
        self.assertTrue(captured.deep_match(expected, ignore_delay=True, ignore_times=True))
    q_out_aaq.unregister_listener(j_out_aaq)
    q_out_dq.unregister_listener(j_out_dq)
    self.teal.shutdown()
def install(self, connector_info, connector_comment):
    ''' Install the SNMP connectors into the xCAT monsetting table.

    connector_info: iterable of (command_path, run_command) pairs.
    connector_comment: comment string stored with each row (also used to
    identify rows to remove during the initial uninstall).
    '''
    # First clean up any remnants that still may exist
    self.uninstall(connector_comment)
    db = registry.get_service(registry.SERVICE_DB_INTERFACE)
    conn = db.get_connection()
    try:  # fix: ensure the connection is closed even if an insert fails
        cursor = conn.cursor()
        # Get all the cmds entries to determine the starting number to use
        db.select(cursor, ['*'], 'monsetting',
                  where="$name = 'snmpmon' AND $key LIKE 'cmds%'",
                  where_fields=['name', 'key'])
        # Determine the max number for the new commands
        cmd_num = [self._get_cmd_num(row[1]) for row in cursor.fetchall()]
        if cmd_num:
            next_cmd_num = max(cmd_num) + 1
        else:
            next_cmd_num = 1
        root_dir = registry.get_service(registry.TEAL_ROOT_DIR)
        # Insert the cmdsNN / runcmdNN entry pair for each SNMP connector
        for connector in connector_info:
            cursor.executemany("INSERT INTO monsetting VALUES(?, ?, ?, ?, ?)",
                               [('snmpmon', 'cmds{0:02d}'.format(next_cmd_num),
                                 os.path.join(root_dir, connector[0]), connector_comment, None),
                                ('snmpmon', 'runcmd{0:02d}'.format(next_cmd_num),
                                 connector[1], connector_comment, None)])
            next_cmd_num += 1
        # Commit all the added entries to the monsetting table
        conn.commit()
        cursor.close()
    finally:
        conn.close()
def is_installed(self, connector_info, connector_comment):
    ''' Check whether the connector is installed or not by checking to see
    if the command is configured in the monsetting table.

    Returns True only if every command in connector_info has a matching
    monsetting row.
    '''
    root_dir = registry.get_service(registry.TEAL_ROOT_DIR)
    db = registry.get_service(registry.SERVICE_DB_INTERFACE)
    conn = db.get_connection()
    installed = False
    try:  # fix: the original leaked the cursor and connection
        cursor = conn.cursor()
        for cmd, snmp_filter in connector_info:
            db.select(cursor, ['*'], 'monsetting',
                      where="$name='snmpmon' AND $key LIKE 'cmd%' and $value = ?",
                      where_fields=['name', 'key', 'value'],
                      parms=(os.path.join(root_dir, cmd),))
            if cursor.fetchone() is None:
                # One missing command means the connector is not installed
                installed = False
                break
            installed = True
        cursor.close()
    finally:
        conn.close()
    return installed
def create_teal_alert(alert_id, reason, raw_data, src_name='TEAL', severity='I', urgency='N', loc_instance=None, recommendation='Contact next level of support', disable_dup=False):
    ''' Create a TEAL alert.

    Builds the alert initialization dictionary from the parameters,
    resolves the TEAL location for loc_instance, then hands the alert to
    the alert manager to allocate, commit, and deliver.
    '''
    get_logger().debug('Creating {0} alert'.format(src_name))
    # Build the Alert directly from the supplied attributes
    alert_dict = {
        ALERT_ATTR_SEVERITY: severity,
        ALERT_ATTR_URGENCY: urgency,
        ALERT_ATTR_RECOMMENDATION: recommendation,
        ALERT_ATTR_REASON: reason,
        ALERT_ATTR_RAW_DATA: raw_data,
        ALERT_ATTR_SRC_NAME: src_name,
    }
    location_svc = registry.get_service(SERVICE_LOCATION)
    alert_dict[ALERT_ATTR_EVENT_LOC_OBJECT] = location_svc.get_teal_location(loc_instance)
    # The alert manager handles allocation, commit, and queueing for delivery
    alert_mgr_svc = registry.get_service(SERVICE_ALERT_MGR)
    alert_mgr_svc.create_and_deliver_alert(alert_id, alert_dict, disable_dup=disable_dup)
    return
def start(self):
    '''Start the notifier-based event monitor running.

    Loops while self.running: each pass drains new event-log rows
    (after self.start_recid) into the event queue, then blocks on the
    notifier until woken again.  DB failures are logged (rate-limited)
    and retried on the next wakeup rather than killing the thread.
    '''
    try:
        event_queue = registry.get_service(SERVICE_EVENT_Q)
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        # Earliest time the next failure may be logged (10-minute rate limit)
        next_failure_log = None
        rc = 0
        while self.running:
            if rc == 0:
                get_logger().debug('Processing events in monitor event injection thread. startRecid = {0}'.format(self.start_recid))
            try:
                cnxn = dbi.get_connection()
                cursor = cnxn.cursor()
                # Stream rows newer than the last processed recid
                for row in cursor.execute(self.sql_runtime_query, self.start_recid):
                    get_logger().debug('Processing row, rec_id = {0} time_occurred = {1}, time_logged = {2}'.format(row[0], row[2], row[3]))
                    e = Event.fromDB(row)
                    event_queue.put(e)
                    self.start_recid = row[0]
                    if self.running == False:
                        # Shutdown requested mid-drain; stop promptly
                        get_logger().info('Monitor event injection thread interrupted. last recid = {0}'.format(self.start_recid))
                        break
                    # Periodically record progress so restart can resume here
                    if self.start_recid % self.update_checkpoint_frequency == 0:
                        inject_update_checkpoint_msg(self.start_recid)
                cnxn.close()
            except:
                # Deliberate catch-all: a transient DB failure must not kill
                # this thread.  Log at most once per 10 minutes.
                cur_time = datetime.now()
                if next_failure_log is None or cur_time > next_failure_log:
                    get_logger().exception('Failure in monitor event injection thread')
                    next_failure_log = cur_time + timedelta(minutes=10)
            # Block until the notifier signals more events (rc semantics are
            # defined by the notifier implementation -- TODO confirm)
            rc = self.notifier.wait()
    except:
        get_logger().exception('Monitor event injection thread failure')
    get_logger().debug('Exiting monitor event injection thread. Last recid = {0}'.format(self.start_recid))
def testJournalWriteAlertDB4(self):
    ''' Test writing of Alert log queue after reading from DB '''
    # This test does not work with duplicate checking -- probably don't want it to
    keep_ADC = self.force_env('TEAL_ALERT_DUPLICATE_CHECK', 'No')
    self.teal = Teal('data/journal_test/events_002.conf', 'stderr', msgLevel=self.msglevel)
    # Insert events, then alerts, into the DB
    src_events = Journal('DB test input EVENTS', file='data/journal_test/events_002.json')
    src_events.insert_in_db(truncate=True, no_delay=True)
    src_alerts = Journal('DB test input ALERTS', file='data/journal_test/alerts_002.json')
    src_alerts.insert_in_db(truncate=False, no_delay=True)
    # Read events back and verify an exact round trip
    db_events = Journal('Read DB test EVENTS')
    db_events.select_from_db('event')
    self.assertTrue(src_events.deep_match(db_events, ignore_delay=True, ignore_times=False, ignore_rec_id=False))
    # Same for alerts
    db_alerts = Journal('Read DB test ALERTS')
    db_alerts.select_from_db('alert')
    self.assertTrue(src_alerts.deep_match(db_alerts, ignore_delay=True, ignore_times=False, ignore_rec_id=False))
    # Now insert into the Delivery Queue and make sure all come out
    db_alerts.inject_queue(get_service(SERVICE_ALERT_DELIVERY_Q), progress_cb=None, fail_on_invalid=False, no_delay=True)
    for lsnr in get_service(SERVICE_ALERT_DELIVERY).listeners:
        if lsnr.get_name() == 'Journal':
            j_out_all = lsnr.journal
    self.assertTrue(j_out_all.wait_for_entries(6))
    self.assertTrue(j_out_all.deep_match(db_alerts, ignore_delay=True, ignore_times=True))
    self.teal.shutdown()
    self.restore_env('TEAL_ALERT_DUPLICATE_CHECK', keep_ADC)
    return
def testLoadingFromConTwo(self):
    '''Test if one files for each are specified in the config file in two sections '''
    def check_alert(asm, expected):
        # Verify one alert's metadata against its expected attribute values
        alert_id = expected[META_ALERT_ID]
        self.assertTrue(alert_id in asm)
        meta_dict = asm[alert_id]
        for attr, value in expected.items():
            self.assertEqual(meta_dict[attr], value)
    teal_inst = teal.Teal('data/metadata_test/load_config_03.conf', 'stderr',
                          msgLevel=self.msglevel, commit_alerts=False, commit_checkpoints=False)
    # it loaded event_metadata_05 and alert_metadata_03
    # Check event metadata via event
    esm1 = get_service(SERVICE_EVENT_METADATA)
    self.assertEqual(len(esm1), 2)
    event_id = 'idvalue1'
    event_comp = 'TST'
    e1 = teal.Event.fromDict({EVENT_ATTR_REC_ID: 1, EVENT_ATTR_EVENT_ID: event_id,
                              EVENT_ATTR_SRC_COMP: 'TST', EVENT_ATTR_TIME_OCCURRED: datetime.now()})
    meta_dict2 = e1.get_metadata()
    self.assertEqual(meta_dict2[META_EVENT_ID], event_id)
    self.assertEqual(meta_dict2[META_EVENT_COMP], event_comp)
    self.assertEqual(meta_dict2[META_EVENT_MSG], 'This is test message 1')
    # check alert metadata directly -- one expected-value table per alert id
    asm1 = get_service(SERVICE_ALERT_METADATA)
    self.assertEqual(len(asm1), 3)
    check_alert(asm1, {META_ALERT_ID: 'Alert01',
                       META_ALERT_MSG_TEMPLATE: 'This is Alert 01',
                       META_ALERT_RECOMMENDATION: 'Recommend doing something',
                       META_ALERT_URGENCY: 'N',
                       META_ALERT_SEVERITY: 'W',
                       META_ALERT_CALL_HOME: 'N',
                       META_ALERT_CUST_NOTIFICATION: 'N',
                       META_ALERT_FRU_CLASS: 'fru_class',
                       META_ALERT_FRU_LIST: 'fru_list1, fru_list2'})
    check_alert(asm1, {META_ALERT_ID: 'Alert02',
                       META_ALERT_MSG_TEMPLATE: 'This is Alert 02',
                       META_ALERT_RECOMMENDATION: 'Recommend doing something else',
                       META_ALERT_URGENCY: 'S',
                       META_ALERT_SEVERITY: 'E',
                       META_ALERT_CALL_HOME: 'N',
                       META_ALERT_CUST_NOTIFICATION: 'N',
                       META_ALERT_FRU_CLASS: None,
                       META_ALERT_FRU_LIST: None})
    check_alert(asm1, {META_ALERT_ID: 'Alert03',
                       META_ALERT_MSG_TEMPLATE: 'This is Alert 03',
                       META_ALERT_RECOMMENDATION: 'Do not do anything',
                       META_ALERT_URGENCY: 'N',
                       META_ALERT_SEVERITY: 'E',
                       META_ALERT_CALL_HOME: 'Y',
                       META_ALERT_CUST_NOTIFICATION: 'N',
                       META_ALERT_FRU_CLASS: None,
                       META_ALERT_FRU_LIST: None})
    teal_inst.shutdown()
    return
def testGeneralFilters(self):
    """Test alert delivery with global and local filtering.

    Injects a canned journal into the delivery queue and verifies, for
    each configured listener, that exactly the expected filtered alerts
    came out.  The listener-name -> expected-journal mapping drives the
    whole check, replacing six copies of the same wait/match pattern.
    """
    j_in_dq = Journal("j_in_DQ", "data/alert_delivery_test/analyzer_filter/inject_DQ_alerts.json")
    dq_q = get_service(SERVICE_ALERT_DELIVERY_Q)
    # (listener name, expected-journal name, expected-journal file)
    cases = [
        ("AllAlerts", "all_exp", "alerts_out_all.json"),
        ("OnlyAnalyzer1", "analyzer1", "alerts_out_analyzer1.json"),
        ("AnyButAnalyzer1", "not_analyzer1", "alerts_out_not_analyzer1.json"),
        ("OnlyAnalyzer2and3", "analyzer2and3", "alerts_out_analyzer2and3.json"),
        ("AnyButAnalyzer2and3", "not_analyzer2and3", "alerts_out_not_analyzer2and3.json"),
        ("AnyButAnalyzer1and2and3", "not_analyzer1and2and3", "alerts_out_not_analyzer1and2and3.json"),
    ]
    # Get the AlertListenerJournal journals, keyed by listener name
    journals = {}
    for listener in get_service(SERVICE_ALERT_DELIVERY).listeners:
        journals[listener.get_name()] = listener.journal
    # inject
    j_in_dq.inject_queue(dq_q)
    # Load expected values
    base = "data/alert_delivery_test/analyzer_filter/"
    expected = {}
    for listener_name, exp_name, exp_file in cases:
        expected[listener_name] = Journal(exp_name, base + exp_file)
    # wait for stuff to come out of every listener
    for listener_name, _, _ in cases:
        self.assertTrue(journals[listener_name].wait_for_entries(len(expected[listener_name])))
    # Check that it was what was expected
    for listener_name, _, _ in cases:
        self.assertTrue(journals[listener_name].deep_match(expected[listener_name],
                                                           ignore_delay=True, ignore_times=True))
    return
def testLoadingFromConNone(self):
    '''Test if no files are specified in the config file '''
    teal_inst = teal.Teal('data/metadata_test/load_config_01.conf', 'stderr',
                          msgLevel=self.msglevel, commit_alerts=False,
                          commit_checkpoints=False)
    # With no metadata files configured, both registries must be empty
    event_meta = get_service(SERVICE_EVENT_METADATA)
    self.assertEqual(len(event_meta), 0)
    alert_meta = get_service(SERVICE_ALERT_METADATA)
    self.assertEqual(len(alert_meta), 0)
    teal_inst.shutdown()
    return
def init_location_service(self, run_mode):
    ''' Load the Location Service based on the XML configuration in the configuration file '''
    cfg_reg = registry.get_service(SERVICE_CONFIGURATION)
    active_sections = cfg_reg.get_active_sections(CONFIG_LOCATION, run_mode,
                                                  name_required=False, singleton=True)
    for result in active_sections:
        # result is (section, name); only the section is needed here
        section = result[0]
        location_file = cfg_reg.get(section, 'config')
        # The location file path is relative to the TEAL data directory
        data_dir = registry.get_service(TEAL_DATA_DIR)
        loc_file_path = os.path.join(data_dir, location_file)
        registry.register_service(SERVICE_LOCATION, LocationService(loc_file_path))
def create_alert(self, event, location): ''' create the alert ''' # Populate the dictionary alert_dict = {} alert_dict[ALERT_ATTR_SRC_NAME] = self.src_name alert_dict[ALERT_ATTR_ALERT_ID] = self.alert_id.get_value() if self.severity.is_set(): alert_dict[ALERT_ATTR_SEVERITY] = self.severity.get_value() if self.urgency.is_set(): alert_dict[ALERT_ATTR_URGENCY] = self.urgency.get_value() if self.fru_loc.is_set(): alert_dict[ALERT_ATTR_FRU_LOC] = self.fru_loc.get_value() if self.recommendation.is_set(): alert_dict[ALERT_ATTR_RECOMMENDATION] = self.recommendation.get_value() if self.raw_data.is_set(): alert_dict[ALERT_ATTR_RAW_DATA] = self.raw_data.get_value() if self.msg_template.is_set(): alert_dict[ALERT_ATTR_MSG_TEMPLATE] = self.msg_template.get_value() if self.priority.is_set(): alert_dict[ALERT_ATTR_PRIORITY] = self.priority.get_value() alert_dict[ALERT_ATTR_CONDITION_EVENTS] = set([event]) if self.event_loc.is_set(): loc = self.event_loc.get_value() else: loc = get_service(SERVICE_LOCATION).get_teal_location(self.ruleset.name) alert_dict[ALERT_ATTR_EVENT_LOC] = loc.get_location() alert_dict[ALERT_ATTR_EVENT_LOC_TYPE] = loc.get_id() # Fill in raw data raw_data_dict = {} raw_data_dict['exception'] = '{0}: {1}'.format(str(location.ex_type), str(location.ex_value)) alert_dict[ALERT_ATTR_RAW_DATA] = dict2raw_data(raw_data_dict) # Call init routine if specified if self.init_class_callable is not None: try: alert_dict = self.init_class_callable().update_init_data_main(alert_dict) except ThreadKilled: raise except ExtFatalError: get_logger().exception('FATAL ERROR raised --> kill analyzer') raise except: self.ruleset.trace_error(self.trace_id[1], 'Error in update_init_data_main') get_logger().exception('') # Allocate the potential alert amgr = get_service(SERVICE_ALERT_MGR) alert = amgr.allocate(self.alert_id.get_value(), alert_dict) # send the alert to the delivery queue get_logger().debug(' creating {0}'.format(str(alert))) amgr.commit(alert) 
self.ruleset.send_alert(alert)
def inject_new_entries(self, exp_json='data/restart_test/three_events_one_fromq.json', exp_num=3):
    ''' Verify that events still flow through TEAL after startup '''
    # Insert fresh events into the DB, then wake the notifier so they flow
    restart_journal = Journal('After restart', 'data/restart_test/three_events_one.json')
    restart_journal.insert_in_db(use_rec_ids=False, no_delay=True)
    registry.get_service(registry.SERVICE_NOTIFIER).post()
    # The analyzer's journal should receive the expected entries
    expected = Journal('Inject New Entries', exp_json)
    actual = self.find_analyzer().journal
    self.assertTrue(actual.wait_for_entries(exp_num))
    self.assertTrue(actual.deep_match(expected, ignore_delay=True))
def testDemo1DB(self):
    '''Test demo flow by injecting into DB'''
    self.prepare_db()
    keep_var = self.force_env('TEAL_TEST_POOL_TIMERS_OFF', 'YES')
    self.teal = Teal('data/teal_test/configurationtest_05_semaphore_auto.conf', 'stderr', msgLevel=self.msglevel)
    # Input journal plus capture journals for each pipeline stage
    j_in = Journal('j_in', file='data/demo/data_sample_demo_NEW_001.json')
    j_out_eq = Journal('j_out_eq')
    j_out_aaq = Journal('j_out_aaq')
    j_out_dq = Journal('j_out_dq')
    j_out_lis = Journal('j_out_lis')
    q_out_eq = registry.get_service(SERVICE_EVENT_Q)
    q_out_aaq = registry.get_service(SERVICE_ALERT_ANALYZER_Q)
    q_out_dq = registry.get_service(SERVICE_ALERT_DELIVERY_Q)
    q_out_eq.register_listener(j_out_eq)
    q_out_dq.register_listener(j_out_dq)
    q_out_aaq.register_listener(j_out_aaq)
    listeners = get_service(SERVICE_ALERT_DELIVERY).listeners
    for listener in listeners:
        if listener.get_name() == 'outputJournal':
            j_out_lis = listener.journal
    try:
        j_in.insert_in_db(progress_cb=None, truncate=False, use_rec_ids=True, no_delay=False, post=True)
    except:
        # fix: print() form works under both Python 2 and 3
        print('INSERTION FAILED')
        q_out_eq.unregister_listener(j_out_eq)
        q_out_dq.unregister_listener(j_out_dq)
        q_out_aaq.unregister_listener(j_out_aaq)
        raise
    # Yes, only 2: Flush can't be injected to connector, so pool does not get closed, so last event
    # does not get turned into an alert!
    self.assertTrue(j_out_lis.wait_for_entries(2))
    # Note these connector ('C') versions have one less alert
    # The analyzer is being run in historic mode (see configuration) if that was
    # changed to runtime then the pool would time out and the last alert would be journaled
    j_exp_aaq = Journal('j_exp_aaq', 'data/teal_test/data_sample_demo_NEW_001_AAQ_Result_C.json')
    self.assertTrue(j_out_aaq.deep_match(j_exp_aaq, ignore_delay=True, ignore_times=True))
    j_exp_dq = Journal('j_exp_dq', 'data/teal_test/data_sample_demo_NEW_001_DQ_Result_C.json')
    self.assertTrue(j_out_dq.deep_match(j_exp_dq, ignore_delay=True, ignore_times=True))
    j_exp_lis = Journal('j_exp_lis', 'data/teal_test/data_sample_demo_NEW_001_LIS_Result_C.json')
    self.assertTrue(j_out_lis.deep_match(j_exp_lis, ignore_delay=True, ignore_times=True))
    q_out_eq.unregister_listener(j_out_eq)
    q_out_dq.unregister_listener(j_out_dq)
    q_out_aaq.unregister_listener(j_out_aaq)
    self.teal.shutdown()
    self.restore_env('TEAL_TEST_POOL_TIMERS_OFF', keep_var)
def __init__(self, name):
    ''' Register this object as a listener on the event queue and set up
    its event checkpoint. '''
    self.name = name
    # Subscribe to the event Q so this object receives posted events
    self.listenQ = get_service(SERVICE_EVENT_Q)
    self.listenQ.register_listener(self)
    # Checkpoint tracking for this listener, keyed by its name
    self.event_checkpoint = EventCheckpoint(name)
    return
def get_generator(self, config_dict):
    """ Return the appropriate SQL generator based on the configuration
    information retrieved from the xCAT cfgloc file. """
    DB_CONF_PATH = "{0}/xcat".format(get_service(TEAL_CONF_DIR))
    # An env-var prefix lets tests point at an alternate cfgloc file
    prefix = os.environ.get(TEAL_TEST_XCAT_CFGLOG_PREFIX, "")
    DB_CONF_FILE = "{0}cfgloc".format(prefix)
    # Set xCAT table names
    db_interface.TABLE_EVENT_LOG = "x_tealeventlog"
    db_interface.TABLE_CHECKPOINT = "x_tealcheckpoint"
    db_interface.TABLE_ALERT_LOG = "x_tealalertlog"
    db_interface.TABLE_ALERT2ALERT = "x_tealalert2alert"
    db_interface.TABLE_ALERT2EVENT = "x_tealalert2event"
    db_interface.TABLE_TEMPLATE = "x_{0}"
    # Well-known path to the information.
    ds_file = "{0}/{1}".format(DB_CONF_PATH, DB_CONF_FILE)
    get_logger().debug("DB Configuration: {0}".format(ds_file))
    try:
        # NOTE(review): conf_file is opened but not read or closed in this
        # block -- confirm the remainder of the routine consumes it.
        conf_file = open(ds_file, "r")
    except IOError as e:  # fix: modern except syntax (Python 2.6+/3 compatible)
        get_logger().error("Unable to open DB configuration file. {0}".format(e))
        raise
def close_event(errm_env, event_data):
    ''' Find an alert that associated with the closed event and close it.
    This will only close an alert that has this event as the one and only event.
    '''
    # Find the matching event in the event log
    event_rec_id = find_logged_event(errm_env, event_data)
    # If no event was found, there is nothing more to do
    if event_rec_id is None:
        return
    # Find an alert that this event is the only event associated with
    alert_recids = find_logged_alerts(event_rec_id)
    # If there is no alert associated with this event, then it may have
    # already been closed out or never logged in the first place based
    # on when this connector started listening for events
    if not alert_recids:
        return
    # Close this alert and any alerts that were duplicates of this alert
    a_mgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    for alert_recid in alert_recids:
        try:
            a_mgr.close(alert_recid)
        except alert_mgr.AlertMgrError as ame:  # fix: modern except syntax
            get_logger().warn('Failed to close alert({0}) associated to event ({1}): {2}'.format(alert_recid, event_rec_id, ame))
def testDisableDup(self):
    ''' Test that disable_dup works: duplicate checking can be bypassed
    per-alert, creating a new open alert instead of a duplicate. '''
    self.teal = Teal('data/common/configurationtest.conf', 'stderr', msgLevel=self.msglevel,
                     commit_alerts=False, commit_checkpoints=False)
    am = get_service(SERVICE_ALERT_MGR)
    def assert_counts(total, duplicates, open_alerts):
        # in_mem_alerts counts every allocation; the other two track the
        # duplicate-checking bookkeeping
        self.assertEqual(len(am.in_mem_alerts), total)
        self.assertEqual(len(am.in_mem_alerts_duplicate), duplicates)
        self.assertEqual(len(am.active_alerts_open), open_alerts)
    assert_counts(0, 0, 0)
    # (disable_dup, expected total, expected duplicates, expected open)
    steps = [
        (False, 1, 0, 1),  # first alert opens
        (False, 2, 1, 1),  # same alert again -> duplicate of the open one
        (True, 3, 1, 2),   # dup checking disabled -> new open alert
        (True, 4, 1, 3),   # disabled again -> another open alert
        (False, 5, 2, 3),  # dup checking back on -> duplicate again
    ]
    for disable_dup, total, duplicates, open_alerts in steps:
        create_teal_alert('XXXXXXXX', 'no reason at all', 'medium well',
                          loc_instance="YYY", disable_dup=disable_dup)
        assert_counts(total, duplicates, open_alerts)
    self.teal.shutdown()
    return
def app_terminate(sig, stack_frame):
    ''' Initiate application termination on signal from the user '''
    # Record which signal triggered termination for later inspection
    global app_termination_signal
    app_termination_signal = sig
    # Wake the shutdown service, if one has been registered
    shutdown_svc = registry.get_service(SERVICE_SHUTDOWN)
    if shutdown_svc is not None:
        shutdown_svc.notify()
def __init__(self, use_db, restart_mode):
    ''' Initialize the checkpoint manager.

    use_db       -- when True, checkpoint state is persisted to the DB and
                    an asynchronous DB-updater thread is started
    restart_mode -- optional; must be one of RESTART_MODES when supplied

    Raises ConfigurationError for an unrecognized restart mode.
    '''
    get_logger().debug('Initializing checkpoint manager use_db = {0}, restart_mode = {1}'.format(str(use_db), str(restart_mode)))
    self.event_checkpoints = dict()
    self.use_db = use_db
    # Serializes allocation of event_checkpoint_rec_id
    self.chkpt_recid_lock = threading.Lock()
    self.event_checkpoint_rec_id = 0
    self.shutdown_recid = None
    # Validate restart mode
    if restart_mode is not None and restart_mode not in RESTART_MODES:
        raise ConfigurationError('Unrecognized restart mode specified: {0}'.format(restart_mode))
    self.restart_mode = restart_mode
    if use_db == True:
        get_logger().debug('Checkpoint manager is using the DB')
        # Setup SQL strings (generated once; stored in module globals)
        db = get_service(SERVICE_DB_INTERFACE)
        # Insert event row
        global _SQL_EVENT_CP_INSERT
        _SQL_EVENT_CP_INSERT = db.gen_insert(
            [EVENT_CPF_CHKPT_ID, EVENT_CPF_NAME, EVENT_CPF_STATUS, EVENT_CPF_EVENT_RECID, EVENT_CPF_DATA],
            db_interface.TABLE_CHECKPOINT)
        # Get event row
        global _SQL_EVENT_CP_SELECT_BY_NAME
        _SQL_EVENT_CP_SELECT_BY_NAME = db.gen_select(
            [EVENT_CPF_CHKPT_ID, EVENT_CPF_NAME, EVENT_CPF_STATUS, EVENT_CPF_EVENT_RECID, EVENT_CPF_DATA],
            db_interface.TABLE_CHECKPOINT,
            where='${0} = ?'.format(EVENT_CPF_NAME),
            where_fields=[EVENT_CPF_NAME])
        # Update event checkpoint
        global _SQL_EVENT_CP_UPDATE_CHECKPOINT
        _SQL_EVENT_CP_UPDATE_CHECKPOINT = db.gen_update(
            [EVENT_CPF_STATUS, EVENT_CPF_EVENT_RECID, EVENT_CPF_DATA],
            db_interface.TABLE_CHECKPOINT,
            where='${0} = ?'.format(EVENT_CPF_CHKPT_ID),
            where_fields=[EVENT_CPF_CHKPT_ID])
        # Determine the next event rec_id to use: resume numbering after
        # the highest checkpoint id already in the table
        cnxn = db.get_connection()
        cursor = cnxn.cursor()
        db.select_max(cursor, EVENT_CPF_CHKPT_ID, db_interface.TABLE_CHECKPOINT)
        row = cursor.fetchone()
        if row and row[0]:
            self.event_checkpoint_rec_id = row[0]
            get_logger().debug('Checkpoint Manager starting after rec_id = {0}'.format(self.event_checkpoint_rec_id))
        cnxn.close()
        # Setup for asynchronous update of checkpoints in DB
        self.update_db_event = threading.Event()
        self.t1 = CheckpointDBUpdater(self)
        self.t1.setDaemon(True)  # do not block interpreter exit
        self.t1.start()
    return
def init_cfg_service(self, config_file):
    """ Initialize the configuration service.

    config_file -- a configuration file, a directory containing *.conf
                   files, or None to use the default TEAL conf directory.

    Returns a string describing how the configuration was resolved.
    Raises ConfigurationError when no configuration files are found.
    """
    conf_str = ''
    # Go get the configuration files from the default location
    if config_file is None:
        config_file = os.path.join(registry.get_service(TEAL_CONF_DIR), 'teal')
        conf_str += 'None -> '
    # Need to create the list of files to pass to the configuration service
    # so determine if this is a file or directory to recover the proper set
    if os.path.isfile(config_file):
        conf_files = [config_file]
        conf_str += 'File -> {0}'.format(repr(config_file))
    elif os.path.isdir(config_file):
        # Find all the configuration files in the specified directory
        conf_qry = os.path.join(config_file, '*.conf')
        conf_files = glob.glob(conf_qry)
        # BUG FIX: use += (not =) so the 'None -> ' prefix recorded above
        # is preserved, matching the file-branch behavior
        conf_str += 'Dir -> {0}'.format(repr(config_file))
    else:
        conf_files = []
    if not conf_files:
        raise ConfigurationError('Configuration file/directory specification of \'{0}\' resulted in no configuration files'.format(config_file))
    registry.register_service(SERVICE_CONFIGURATION, Configuration(conf_files))
    return conf_str
def analyze_event(self, event): ''' Turn every event from the SFP into an Alert ''' # Build the Alert directly from the event information raw_data_dict = {'Problem Number':event.raw_data[SFP_PROB_NUM], 'FRU List':eval(event.raw_data[SFP_FRU_LIST]), 'SFP':event.get_rpt_loc().get_comp_value('node')} alert_dict = {alert.ALERT_ATTR_SEVERITY:'E', alert.ALERT_ATTR_URGENCY:'N', alert.ALERT_ATTR_EVENT_LOC_OBJECT:event.get_src_loc(), alert.ALERT_ATTR_RECOMMENDATION:SFP_RECOMMENDATION, alert.ALERT_ATTR_REASON:event.raw_data[SFP_REASON], alert.ALERT_ATTR_RAW_DATA:str(raw_data_dict), alert.ALERT_ATTR_SRC_NAME:self.get_name(), alert.ALERT_ATTR_CONDITION_EVENTS:set((event,)) } # Get the alert manager to create/allocate/commit the alert amgr = registry.get_service(registry.SERVICE_ALERT_MGR) sfp_alert = amgr.allocate(event.get_event_id(), in_dict=alert_dict) # Duplicate alerts are already handled by the HMC/SFP so we should not # dup them again since they are always different alerts amgr.commit(sfp_alert, disable_dup=True) # Now the alert is created and can be reported through the pipeline self.send_alert(sfp_alert)
def shutdown(self):
    '''Stop running the event monitor.

    Stops the monitor loop, wakes it via the notifier, joins its thread,
    then records the shutdown rec id with the checkpoint manager.
    '''
    get_logger().debug('Starting shutdown')
    self.running = False
    # Wake the monitor thread so it can observe running == False
    self.notifier.post()
    get_logger().debug('Joining thread')
    self.monitor_thread.join()
    last_processed_recid = self.start_recid
    if registry.get_service(SERVICE_SHUTDOWN_MODE) == SHUTDOWN_MODE_IMMEDIATE:
        # If immediate use the last one that was processed
        # NOTE(review): 'checkpointL' looks like a possible typo (perhaps
        # 'checkpoint') -- confirm this attribute exists on self.
        last_processed_recid = self.checkpointL.event_checkpoint.start_rec_id
    registry.get_service(SERVICE_CHECKPOINT_MGR).monitor_shutdown(last_processed_recid)
    get_logger().debug('Shutdown complete')
def __init__(self, location_id, data):
    ''' Constructor.

    location_id -- key used to look up location metadata in the
                   location service
    data        -- location string; split on the location's separator

    Raises TypeError if data is not a str/unicode. If location lookup or
    validation fails, behavior depends on the TEAL_LOCATION_VALIDATION
    environment variable ('LOG' logs and continues; 'IMMEDIATE'
    re-raises); on continue, the instance is marked unprocessable by
    rebinding its methods to the *_UNPROCESSABLE variants.
    '''
    if not(isinstance(data, str) or isinstance(data, unicode)):
        raise TypeError,"Invalid type of Location data: {0}.".format(type(data))
    loc_service = registry.get_service(SERVICE_LOCATION)
    try:
        self.location_info = loc_service[location_id]
        self.location_code = data.split(self.location_info.separator)
        # Location code is initialized, now validate it
        self._validate_location_code()
    except:
        # NOTE(review): bare except is deliberate here -- any failure puts
        # the instance into the unprocessable state below
        tmp_env = os.environ.get(TEAL_LOCATION_VALIDATION, 'LOG').upper()
        if tmp_env == 'LOG':
            get_logger().exception('LOGGING Location creation failure and continuing processing')
        elif tmp_env == 'IMMEDIATE':
            raise
        # Record enough of the failure context for later diagnosis
        self.loc_id = location_id
        self.data = data
        self.ex_type, self.ex_value = sys.exc_info()[:2]
        # Rebind the public methods so every later use reports the
        # location as unprocessable instead of failing obscurely
        self.is_unprocessable = self.is_unprocessable_UNPROCESSABLE
        self.new_location_by_scope = self._UNPROCESSABLE
        self.get_comp_value = self._UNPROCESSABLE
        self.get_substitution_dict = self._UNPROCESSABLE
        self.get_location = self.get_location_UNPROCESSABLE
        self.str_impl = self.str_impl_UNPROCESSABLE
        self.match = self.match_UNPROCESSABLE
        self.get_id = self.get_id_UNPROCESSABLE
    return
def add_entries_before_restart(self, stop_teal=True):
    ''' Add events to TEAL and make sure they are processed.

    When stop_teal is True the TEAL instance started here is stopped
    before returning; otherwise stopping it is the caller's job.
    '''
    self.start_teal('now')
    # Insert a set of events and process them
    analyzer_journal = self.find_analyzer().journal
    injected = Journal('Pre-populate', 'data/restart_test/three_events_one.json')
    # truncate=True also exercises recovery from a destroyed ckpt table
    injected.insert_in_db(no_delay=True, truncate=True)
    registry.get_service(registry.SERVICE_NOTIFIER).post()
    self.assertTrue(analyzer_journal.wait_for_entries(3))
    expected = Journal('Expected', 'data/restart_test/three_events_one_fromq.json')
    self.assertTrue(analyzer_journal.deep_match(expected, ignore_delay=True))
    if stop_teal:
        self.stop_teal()
def executeQuery(self, query):
    ''' Run a query and return all result rows.

    query -- SQL text to execute against the TEAL database.

    Returns the list of rows from cursor.fetchall().
    '''
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    db_conn = dbi.get_connection()
    try:
        cursor = db_conn.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # FIX: the original leaked the connection; other helpers in this
        # file close the connections they open, so do the same here
        db_conn.close()
def get_eventList(): '''Get the list of RAS events with hardware '_in error' control actions. ''' # Search the tbgqmsgtypes for ras events with 'hardware in error' control actions schema = str(db_interface.TABLE_TEMPLATE).split('.') msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes' ctlaction_query = "select msg_id, svcaction from " + msgtypesTable + " where ctlaction is not NULL and ctlaction like '%_IN_ERROR%' and ctlaction not like '%SOFTWARE_IN_ERROR%'" dbi = registry.get_service(SERVICE_DB_INTERFACE) dbConn = dbi.get_connection() cursor = dbConn.cursor() cursor.execute(ctlaction_query) rows = cursor.fetchall() msgIDs = list() msgidService = dict() for r in rows: msgid = r[0].strip() msgIDs.append(msgid) sa = 'Service action: ' + str(r[1]).strip() msgidService[msgid] = sa # add in the bqc serdes analysis events that conditionally set # a service action of COMPUTE_IN_ERROR msgIDs.append('00090213') msgIDs.append('00090216') sa = 'Service action: COMPUTE_IN_ERROR' msgidService['00090213'] = sa msgidService['00090216'] = sa # check for control action overrides defined in the ras_environment_filter.xml config_service = registry.get_service('BGQ_CONFIG_SERVICE') ras_events = config_service.get('RasEventChangeSpec') for re in ras_events: if re[1] == 'BG_CTL_ACT': if (re[2].find('_IN_ERROR') > 0) and (re[2].find('SOFTWARE_IN_ERROR') < 0): msgIDs.append(re[0]) query = "select svcaction from " + msgtypesTable + " where msg_id = ?" cursor.execute(query, str(re[0])) row = cursor.fetchone() sa = 'Service action: ' + row[0] msgidService[re[0]] = sa else: if re[0] in msgIDs: msgIDs.remove(re[0]) return msgIDs, msgidService
def get_excludeMsgIDs(): '''Get the list of excluded message IDs. ''' cfg = registry.get_service(registry.SERVICE_CONFIGURATION) excludeMsgIDs = '' try: excludeMsgIDs = cfg.get(BGQ_TEAL_THRESHOLD_ANALYZER, BGQ_TEAL_THRESHOLD_EXCLUDE_IDS) registry.get_logger().debug('Exclude List = ' + excludeMsgIDs) except Exception, e: registry.get_logger().debug(e)
def analyze_event(self, event):
    '''Analyze a RAS event and determine whether the BQL threshold of
    errors has been reached or exceeded.

    NOTE(review): the visible portion of this method ends after the
    retry loop below; the threshold/alert handling continues elsewhere.
    '''
    msg_id = event.get_event_id()
    rec_id = event.get_rec_id()
    registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))
    # Drop the leading component prefix from the source location string
    location = str(event.get_src_loc())
    location = location[3:].strip()
    severity = event.raw_data['severity'].strip()
    serialnumber = event.raw_data['serialnumber']
    ecid = event.raw_data['ecid']
    event_time = event.get_time_logged()
    block = event.raw_data['block'].strip()
    jobid = event.raw_data['jobid']
    msgText = event.raw_data['message'].strip()
    rawdata = event.raw_data['rawdata'].strip()
    count = event.get_event_cnt()
    # Set threshold value
    threshold = self.msgidCount[msg_id]
    tmsg = "BQL error threshold of " + str( threshold) + " has been reached or exceeded, total count is "
    # check if thresholds have been reached or exceeded for events
    xmsg = ""
    xmsg = " in a period of " + self.msgidPeriod[msg_id].strip()
    # Substitute the per-msgid window and event time into the count query
    query = self.period_query.replace('PERIOD', self.msgidPeriod[msg_id].strip())
    query = query.replace('MYTIME', str(event_time))
    # search for events associated with this location's midplane or I/O board
    qryloc = location.strip()[0:6] + '%'
    registry.get_logger().debug(query + " xmsgId=" + msg_id + " loc=" + qryloc + " ev_time=" + str(event_time))
    msgCount = 0
    # Up to 5 attempts; on failure, re-acquire the DB connection/cursor
    for x in range(5):
        try:
            self.cursor.execute(query, qryloc)
            row = self.cursor.fetchone()
            msgCount = row[0]
            break
        except Exception, e:
            registry.get_logger().debug(e)
            if x < 4:
                dbi = registry.get_service(SERVICE_DB_INTERFACE)
                self.dbConn = dbi.get_connection()
                self.cursor = self.dbConn.cursor()
            else:
                raise Exception( 'Error: bgq_BqlEventAnalyzer could not connect to the database' )
def send_common_alert(self, loc, cur_alert_recid, event, alert_time, dup_query, cursor): ''' Send an alert for the common location. ''' # Close current alert prior to creating a new common alert registry.get_logger().info('Closing current alert recid %d prior to creating a common mode alert', cur_alert_recid) registry.get_service(SERVICE_ALERT_MGR).close(cur_alert_recid) # Get the location loc_name = self.get_loc_name(loc) loc_type = loc.get_id() loc_parent, loc_parent_list = self.get_loc_parent(loc) loc_parent_object = Location(loc_type, loc_parent) # Removed the duplicate check that was here -- this has already been determined # Fill in alert info reason = self.reason.replace('LOC_NAME', loc_name) reason = reason.replace('LOC_PARENT', loc_parent) recommendation = self.recommendation.replace('LOC_PARENT', loc_parent) alert_dict = {alert.ALERT_ATTR_SEVERITY:self.severity, alert.ALERT_ATTR_URGENCY:'I', alert.ALERT_ATTR_EVENT_LOC_OBJECT:loc_parent_object, alert.ALERT_ATTR_RECOMMENDATION:recommendation, alert.ALERT_ATTR_REASON:reason, alert.ALERT_ATTR_RAW_DATA:'No raw data', alert.ALERT_ATTR_SRC_NAME:self.get_name(), alert.ALERT_ATTR_CONDITION_EVENTS:set((event,)) } # Get the alert manager to create/allocate/commit the alert alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR) bg_alert = alertMgr.allocate(self.alertId, in_dict=alert_dict) alertMgr.commit(bg_alert, disable_dup=False) # Now the alert is created, need to put it in the queue so that it can be analyzed # by alert analyzer (instead of sending it - send_alert, which will get reported # through the pipeline right away) registry.get_logger().info("Put alertId = %s with event recid = %d on the alert analyzer queue", self.alertId, event.get_rec_id()) registry.get_service(SERVICE_ALERT_ANALYZER_Q).put(bg_alert) return
def _configure(self):
    ''' Set the polling time based on the BGQ Connector conf file.

    Reads the poll-interval option; any failure (missing option,
    non-integer, or non-positive value) falls back to
    BGQ_DEFAULT_POLL_INTERVAL.
    '''
    cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    try:
        value = cfg.get(BGQ_TEAL_CONFIG, BGQ_TEAL_CONFIG_POLL_INTERVAL)
        self.poll_interval = int(value)
        if self.poll_interval <= 0:
            registry.get_logger().error('The value ' + str(self.poll_interval) + ' specified in the poll interval is not valid. The value must be greater than zero.')
            # FIX: the original used a bare `raise` with no active
            # exception, which itself errors; raise a real exception to
            # route to the fallback below
            raise ValueError('poll interval must be greater than zero')
    except Exception:
        # FIX: narrowed from a bare except; behavior is unchanged --
        # any failure selects the default interval
        registry.get_logger().warn('Configuring poll interval to default {0} seconds'.format(BGQ_DEFAULT_POLL_INTERVAL))
        self.poll_interval = BGQ_DEFAULT_POLL_INTERVAL
def __init__(self):
    ''' The constructor.

    Parses the ras environment filter file (ras_environment_filter.xml),
    when one has been registered, populating self.repository with its
    contents via ConfigRasEnvFilterHandler.
    '''
    self.repository = dict()
    p = xml.sax.make_parser()
    ras_filter = registry.get_service('BGQ_RAS_FILTER')
    # IDIOM FIX: identity comparison with None (was `!= None`)
    if ras_filter is not None:
        filter_handler = ConfigRasEnvFilterHandler(self.repository)
        p.setContentHandler(filter_handler)
        p.parse(ras_filter)
    return
def get_eventList():
    '''Get the list of RAS events with END_JOB control actions.

    Events already handled by the HardwareInError analyzer (IN_ERROR,
    except SOFTWARE_IN_ERROR) are excluded by the query itself.

    Returns (msgIDs, msgidService): the list of message ids of interest
    and a dict mapping each id to a service-action description string.
    '''
    # Search the tbgqmsgtypes for ras events with END_JOB control action
    # Note: do not include ras events already handled by HardwareInError analyzer.
    schema = str(db_interface.TABLE_TEMPLATE).split('.')
    msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes'
    endJob_query = "select msg_id, svcaction from " + msgtypesTable + " where ctlaction is not NULL and ctlaction like '%END_JOB%' and (ctlaction not like '%_IN_ERROR%' or ctlaction like '%SOFTWARE_IN_ERROR%')"
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    cursor.execute(endJob_query)
    rows = cursor.fetchall()
    msgIDs = list()
    msgidService = dict()
    for r in rows:
        # CONSISTENCY FIX: strip the padded CHAR msg_id as the sibling
        # IN_ERROR list builder does; unstripped ids break later
        # membership tests against stripped event ids
        msgid = r[0].strip()
        msgIDs.append(msgid)
        sa = 'Service action: ' + str(r[1]).strip()
        msgidService[msgid] = sa
    # check for control action overrides defined in the ras_environment_filter.xml
    config_service = registry.get_service('BGQ_CONFIG_SERVICE')
    ras_events = config_service.get('RasEventChangeSpec')
    for re in ras_events:
        if re[1] == 'BG_CTL_ACT':
            if (re[2].find('END_JOB') > 0):
                # Override adds END_JOB: include the event and look up
                # its service action
                msgIDs.append(re[0])
                query = "select svcaction from " + msgtypesTable + " where msg_id = ?"
                cursor.execute(query, str(re[0]))
                row = cursor.fetchone()
                sa = 'Service action: ' + row[0]
                msgidService[re[0]] = sa
            else:
                # Override removed END_JOB: drop the event if selected
                if re[0] in msgIDs:
                    msgIDs.remove(re[0])
    return msgIDs, msgidService
def _query_and_log_event(self, query_sign, recid, max_recid=0):
    ''' Query the BG event log for new events and log into TEAL.

    query_sign -- comparison operator for the RECID predicate (e.g. '>')
    recid      -- rec id the predicate compares against
    max_recid  -- when > 0, stop before processing events at/above it
    '''
    registry.get_logger().debug("in _query_and_log_event")
    event_logged = False
    db = registry.get_service(registry.SERVICE_DB_INTERFACE)
    cnxn = db.get_connection()
    # Separate cursors: one streams BG events, one writes TEAL rows
    bgq_cursor = cnxn.cursor()
    teal_cursor = cnxn.cursor()
    # Query the BG event log for new events
    bgEvent_query = "SELECT RECID, CATEGORY, COMPONENT, JOBID, BLOCK, LOCATION, MSG_ID FROM " + db_interface.TABLE_BG_EVENT_LOG + " WHERE RECID " + query_sign + " ? ORDER BY RECID ASC"
    bgq_cursor.execute(bgEvent_query, recid)
    commit_count = 0
    for bg_event in next_row(bgq_cursor):
        # Don't process events with recids >= max_recid, if it is nonzero
        if max_recid > 0 and bg_event[0] >= max_recid:
            break
        # Log only events we are interested in
        if bg_event[6] in self.msgIDs:
            event_logged = True
            # Log the event into TEAL
            self._log_event(bg_event, teal_cursor)
            # Commit every so often to limit the transaction size
            commit_count += 1
            if commit_count == COMMIT_LIMIT:
                cnxn.commit()
                commit_count = 0
        else:
            registry.get_logger().debug('ignore msgid ' + bg_event[6])
        # Update the 'cursor' into the BGQ database
        self.last_processed_event = bg_event[0]
    # Notify TEAL that events have been inserted
    if (event_logged):
        registry.get_logger().debug("event to log " + str(event_logged))
        cnxn.commit()
        if self.notifier:
            self.notifier.post()
        else:
            registry.get_logger().warn('TEAL notifier not configured.')
    cnxn.close()
    registry.get_logger().debug("exit _query_and_log_event")
def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
    '''The constructor.

    Sets the BQL alert severity/recommendation/id and reads the
    configured exclude list of message IDs (best effort).
    '''
    # NOTE(review): calls EventAnalyzer.__init__ directly; confirm this is
    # the intended base in this class's hierarchy
    EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
    self.severity = "W"
    self.recommendation = '''Schedule service to isolate the BQL issue. Possible causes are environmental, cable, or a board. Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly. Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable. The cable and board can be cleaned with an optics cleaning tool. Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''
    self.alert_id = 'BQL01'
    # Get the exclude list of message IDs
    cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    excludeList = ''
    try:
        # NOTE(review): 'excludeMsgIDs' is a bare name used as the option
        # key; presumably a module-level constant -- confirm it exists
        excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
        registry.get_logger().debug('Exclude List = ' + excludeList)
    except Exception, e:
        # Best effort: a missing option simply means no exclusions
        registry.get_logger().debug(e)
def analyze_event(self, event):
    '''Analyze a hardware-in-error RAS event and send an HWERR01 alert.

    Events generated by diagnostics runs are skipped.
    '''
    msg_id = event.get_event_id()
    rec_id = event.get_rec_id()
    registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))
    # Exclude event logged from DIAG run
    if event.raw_data['diags'] == 'T':
        registry.get_logger().debug('RAS Event generated by Diagnostics, skip creating an alert')
        return
    # Fill in alert with appropriate data
    reason = "The hardware been put in an error state. \nRAS event details:" \
             " message id = " + msg_id + \
             ", recid = " + str(rec_id) + \
             ", timestamp = " + str(event.get_time_occurred()) + \
             ", serial number = " + str(event.raw_data['serialnumber']) + \
             ", ecid = " + self.ecidString(event.raw_data['ecid']) + \
             ", jobid = " + str(event.raw_data['jobid']) + \
             ", block = " + str(event.raw_data['block'])
    raw_data = "RAS Message: " + event.raw_data['message']
    # Append the per-msgid service action to the base recommendation
    recommendation = self.recommendation + " " + self.msgidService[msg_id]
    alert_dict = {alert.ALERT_ATTR_SEVERITY:self.severity,
                  alert.ALERT_ATTR_URGENCY:'I',
                  alert.ALERT_ATTR_EVENT_LOC_OBJECT:event.get_src_loc(),
                  alert.ALERT_ATTR_RECOMMENDATION:recommendation,
                  alert.ALERT_ATTR_REASON:reason,
                  alert.ALERT_ATTR_RAW_DATA:raw_data,
                  alert.ALERT_ATTR_SRC_NAME:self.get_name(),
                  alert.ALERT_ATTR_CONDITION_EVENTS:set((event,))
                  }
    # Get the alert manager to create/allocate/commit the alert
    alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    alert_id = 'HWERR01'
    bg_alert = alertMgr.allocate(alert_id, in_dict=alert_dict)
    alertMgr.commit(bg_alert, disable_dup=False)
    # Now the alert is created and can be reported through the pipeline
    registry.get_logger().info("Sending alert for msgid = " + msg_id + " recid = " + str(rec_id))
    self.send_alert(bg_alert)
    return
def get_eventList(): '''Get the list of RAS events with thresdhold count. ''' # Get the exclude list of message IDs from the configuration file excludeMsgList = get_excludeMsgIDs() # Search the tbgmsgtypes for ras events that have threshold counts # Note: do not include ras events already handled by HardwareInError and JobFatal analyzers. schema = str(db_interface.TABLE_TEMPLATE).split('.') msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes' count_query = "select msg_id, thresholdcount, svcaction, relevantdiags from " + msgtypesTable + " where thresholdcount is not NULL and (ctlaction is NULL or (ctlaction not like '%END_JOB%' and (ctlaction not like '%_IN_ERROR%' or ctlaction like '%SOFTWARE_IN_ERROR%')))" dbi = registry.get_service(SERVICE_DB_INTERFACE) dbConn = dbi.get_connection() cursor = dbConn.cursor() cursor.execute(count_query) rows = cursor.fetchall() msgIDs = list() msgidCount = dict() msgidService = dict() for r in rows: msgid = r[0].strip() if excludeMsgList.find(msgid) >= 0: registry.get_logger().debug(' excluding ' + msgid) continue msgIDs.append(msgid) msgidCount[msgid] = r[1] sa = 'Service action: ' if r[2]: sa += r[2].strip() else: sa += "None." if r[3]: sa += ' Relevant diagnostic bucket(s): ' + r[3].strip() msgidService[msgid] = sa # Search the tbgqmsgtypes for ras events that have threshold period period_query = "select msg_id, thresholdperiod from " + msgtypesTable + " where thresholdperiod is not NULL" cursor.execute(period_query) rows = cursor.fetchall() msgidPeriod = dict() for r in rows: msgidPeriod[r[0]] = r[1] return msgIDs, msgidService, msgidCount, msgidPeriod
def _get_last_processed_event(self):
    ''' Determine the last event already injected into TEAL.

    Sets self.last_processed_event to the highest REC_ID in the TEAL
    event log (0 when the log is empty), so injection can resume from
    there.
    '''
    dbi = registry.get_service(registry.SERVICE_DB_INTERFACE)
    cnxn = dbi.get_connection()
    cursor = cnxn.cursor()
    cursor.execute("SELECT MAX(REC_ID) FROM " + db_interface.TABLE_EVENT_LOG_EXT)
    max_id = cursor.fetchone()[0]
    # MAX() over an empty table yields NULL -> start from zero
    self.last_processed_event = 0 if max_id is None else max_id
    registry.get_logger().info('Last Processed Event = ' + str(self.last_processed_event))
    cnxn.close()
class bgqBqlEventAnalyzer(bgqBaseAnalyzer):
    '''The BqlEventAnalyzer class determines what action to take for BQL RAS events of interest.
    '''
    def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
        '''The constructor.

        Sets BQL alert attributes, reads the configured exclude list
        (best effort), loads the BQL event ids, and prepares the
        threshold tables, query templates, and DB connection.
        '''
        # NOTE(review): calls EventAnalyzer.__init__ directly rather than
        # bgqBaseAnalyzer -- confirm the intended base-class chain
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
        self.severity = "W"
        self.recommendation = '''Schedule service to isolate the BQL issue. Possible causes are environmental, cable, or a board. Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly. Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable. The cable and board can be cleaned with an optics cleaning tool. Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''
        self.alert_id = 'BQL01'
        # Get the exclude list of message IDs
        cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
        excludeList = ''
        try:
            # NOTE(review): 'excludeMsgIDs' is a bare name used as the
            # option key; presumably a module-level constant -- confirm
            excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
            registry.get_logger().debug('Exclude List = ' + excludeList)
        except Exception, e:
            registry.get_logger().debug(e)
        # ras events that have BQL_SPARE detail data
        self.msgIDs = get_eventList()
        for msgid in self.msgIDs:
            registry.get_logger().debug('msgId = ' + msgid)
        # set the threshold (per message id)
        self.msgidCount = dict()
        self.msgidCount['00090200'] = 2
        self.msgidCount['00090201'] = 1
        self.msgidCount['00090202'] = 1
        self.msgidCount['00090210'] = 4
        self.msgidCount['00090211'] = 4
        # set the window = 2 X the period
        self.msgidPeriod = dict()
        self.msgidPeriod['00090200'] = '11 seconds'
        self.msgidPeriod['00090201'] = '11 seconds'
        self.msgidPeriod['00090202'] = '11 seconds'
        self.msgidPeriod['00090210'] = '11 seconds'
        self.msgidPeriod['00090211'] = '11 seconds'
        # BQL related ras events
        self.bqlIDs = list()
        # define query for count of recent events at this location
        # within a window (plus and minus the event time)
        # parameter 1 = location
        # parameter 2 = event time
        # (PERIOD and MYTIME are placeholders substituted at analyze time)
        eventTable = self.appendSchema('tbgqeventlog')
        self.period_query = "select count(*) from " + eventTable + " where location like ? and category='BQL' and event_time <= (timestamp('MYTIME') + PERIOD) and event_time > (timestamp('MYTIME') - PERIOD)"
        # define query for count of open alerts at this location
        # within a day from the event time
        # parameter 1 = location
        # parameter 2 = event time
        alertTable = self.appendSchema('x_tealalertlog')
        self.alert_period = '1 day'
        self.alert_query = "select count(*) from " + alertTable + " where \"alert_id\"='BQL01' and \"event_loc\"= ? and \"creation_time\" >= (timestamp('MYTIME') - PERIOD) and \"state\"=1"
        # database connection and cursor
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        self.dbConn = dbi.get_connection()
        self.cursor = self.dbConn.cursor()
        return
def analyze_alert(self, alert):
    '''Analyze an alert.

    Routes the alert by its incident id: BQL01 passes straight to
    delivery; ENDJOB01/THRES01 are closed when a prior alert shares the
    same block id; remaining ids (HWERR01/COMMON01) are checked for a
    duplicate or common-mode parent-location alert.
    '''
    alert_recId = alert.get_rec_id()
    alert_id = alert.get_incident_id()
    loc_type = alert.event_loc.get_id()
    location = alert.event_loc.get_location()
    #alert_msgId = alert.get_incident_id()
    registry.get_logger().info('Analyzing alert id %d loc_type: %s: %s', alert_recId, loc_type, location)
    # There should only be one condition event associated with the alert.
    events = alert.condition_events
    if len(events) == 0:
        registry.get_logger().error('No event associated with the alert recid %d', alert_recId)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    event = events.pop()
    if (alert_id == 'BQL01'):
        # No need to analyze BQL01 alerts, just pass it to the delivery queue
        registry.get_logger().info('Nothing to analyze for alert id %s ', alert_id)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    # Get the location
    loc = Location(loc_type, location)
    locName = self.get_loc_name(loc)
    # No need to analyze alert with rack location
    alert_time = str(alert.get_time_occurred())
    if locName == 'rack':
        registry.get_logger().info('Nothing to analyze for alert recid %d with rack location', alert_recId)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    # Find out if there are other alerts with the same block id (for ENDJOB01 and THRES01)
    dup_qry = ''
    if (alert_id == 'ENDJOB01' or alert_id == 'THRES01'):
        if event.raw_data['block'] is None:
            event_block = None
        else:
            event_block = event.raw_data['block'].strip()
        if event_block is None or event_block == BGQ_EVENT_NULL_BLOCK:
            # Found no prior alert with the same block id, pass current alert to the delivery queue
            registry.get_logger().info('No block id for alert id %d, no common alert generated for block: %s', alert_recId, event_block)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return
        same_block = False
        # Get db connection needed for query
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        dbConn = dbi.get_connection()
        cursor = dbConn.cursor()
        # NOTE(review): both branches below call the identical helper; the
        # comments describe different intended searches -- confirm
        if (alert_id == 'ENDJOB01'):
            # For ENDJOB01, look for alert id HWERR01 or COMMON01 with the same block id
            same_block = self.has_matching_blockId(event_block, alert_time, cursor)
        else:
            # For THRES0101, look for alert id HWERR01 or COMMON01 or ENDJOB01 with the same block id
            same_block = self.has_matching_blockId(event_block, alert_time, cursor)
        if same_block:
            # Found prior alert with the same block id, close current alert
            registry.get_logger().info('Closing current alert recid %d due to prior alert with the same block id', alert_recId)
            registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
        else:
            # Found no prior alert with the same block id, pass current alert to the delivery queue
            registry.get_logger().info('No common block id found for alert id %d within the last %s', alert_recId, self.window_time)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    elif (alert_id == 'BQL01'):
        # NOTE(review): unreachable -- BQL01 already returned near the top
        # of this method; dead branch kept as-is
        registry.get_logger().info('Nothing to analyze for alert id %s.', alert_id)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    # The following will handle the rest of the alert ids (HWERR01 or COMMON01).
    # Find out if there is common mode alert already exist for the same location or higher hierarchy
    loc_parent, loc_parent_list = self.get_loc_parent(loc)
    # Build an event_loc LIKE predicate over the parent location list
    loc_qry = '('
    idx = 0
    for pLoc in loc_parent_list:
        if idx != 0:
            loc_qry += " or "
        loc_qry += " \"event_loc\" like '" + pLoc + "'"
        idx += 1
    # dup_qry2 covers parents only; dup_qry also includes this location
    dup_qry2 = self.dup_query + loc_qry + ")"
    loc_qry += " or \"event_loc\" like '" + location + "')"
    dup_qry = self.dup_query + loc_qry
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    dup = self.has_duplicate(alert_time, dup_qry, cursor)
    if dup:
        # Found prior alert with the same block id, close current alert
        registry.get_logger().info('Closing current alert recid %d due to prior alert with same common location', alert_recId)
        registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
        return
    # Look for a common hardware problem if there are multiple alerts for different location
    # on the same hardware.
    sendAlert = self.has_common_location(loc, alert_time, self.query, cursor)
    if sendAlert:
        # Send commmon alert
        self.send_common_alert(loc, alert_recId, event, alert_time, dup_qry2, cursor)
    else:
        # Pass current alert to the delivery queue
        registry.get_logger().info('No common location for %s found for alert id: %d within the last %s ', location, alert_recId, self.window_time)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
    return
# NOTE(review): fragment of a larger method -- aquery, event_time, msg_id,
# location, and self are defined in code not visible here.
# Substitute the event time into the open-alert count query
aquery = aquery.replace('MYTIME', str(event_time))
registry.get_logger().debug(aquery + " xmsgId=" + msg_id + " loc=" + location.strip() + " ev_time=" + str(event_time))
msgCount = 0
# Up to 5 attempts; on failure, re-acquire the DB connection/cursor
for x in range(5):
    try:
        self.cursor.execute(aquery, location.strip())
        row = self.cursor.fetchone()
        msgCount = row[0]
        break
    except Exception, e:
        registry.get_logger().debug(e)
        if x < 4:
            dbi = registry.get_service(SERVICE_DB_INTERFACE)
            self.dbConn = dbi.get_connection()
            self.cursor = self.dbConn.cursor()
        else:
            raise Exception( 'Error: bgq_BqlEventAnalyzer could not connect to the database' )
# do not log more than one BQL alert per day for the same location
if msgCount > 0:
    registry.get_logger().debug("An active BQL01 alert for location " + location.strip() + " exist within a period of " + self.alert_period + ". Skip logging a duplicate.")
    return
# Script entry logic: start the BGQ connector either as a daemon or in the
# foreground, then wait for TEAL shutdown.
if options.run_as_daemon:
    # Do the necessary processing to spin off as a daemon
    command.daemonize('teal_bgq')
else:
    # Allow the user to CTRL-C application and shutdown cleanly
    signal.signal(signal.SIGINT, app_terminate)  # CTRL-C
if options.log_file is None:
    # $TEAL_LOG_DIR is expanded downstream, not by this script
    log_file = '$TEAL_LOG_DIR/teal_bg.log'
else:
    log_file = options.log_file
# Set up the TEAL environment to get at the data required for logging
t = teal.Teal(None, data_only=True, msgLevel=options.msg_level, logFile=log_file, daemon_mode=options.run_as_daemon)
# Create the connector and start it
bgcon = BgqConnector()
bgcon.setDaemon(True)  # do not block interpreter exit
bgcon.start()
# Wait for Teal to shutdown before exiting
shutdown = registry.get_service(registry.SERVICE_SHUTDOWN)
shutdown.wait()
def analyze_event(self, event):
    '''Analyze a fatal RAS event and raise an ENDJOB01 alert for its job.

    Events produced by diagnostics runs are ignored.  When the event carries
    a job id, at most one alert is raised per job: recently handled job ids
    are kept in the fixed-size list ``self.ring`` and suppress repeats.
    '''
    log = registry.get_logger()
    msg_id = event.get_event_id()
    rec_id = event.get_rec_id()
    log.info("Analyzing msgid = %s recid = %s" % (msg_id, rec_id))

    # Exclude events logged from a DIAG run
    if event.raw_data['diags'] == 'T':
        log.debug('RAS Event generated by Diagnostics, skip creating an alert')
        return

    jobid = event.raw_data['jobid']
    if jobid:
        # Suppress a second alert for a job that was already handled
        if jobid in self.ring:
            log.info('Alert is not sent for msgid %s recid %s because jobid %s is already handled' % (msg_id, rec_id, jobid))
            return
        # Record this jobid; drop the oldest entry so the ring stays fixed-size
        self.ring.pop(0)
        self.ring.append(jobid)

    # Human-readable alert text assembled from the event's raw fields
    reason = ("The fatal RAS event caused the job to end. \nRAS event details:"
              " message id = %s, recid = %s, timestamp = %s, serial number = %s"
              ", ecid = %s, jobid = %s, block = %s"
              % (msg_id, rec_id, event.get_time_occurred(),
                 event.raw_data['serialnumber'],
                 self.ecidString(event.raw_data['ecid']),
                 jobid, event.raw_data['block']))
    raw_data = "RAS Message: " + event.raw_data['message']
    recommendation = "%s %s" % (self.recommendation, self.msgidService[msg_id])

    alert_dict = {
        alert.ALERT_ATTR_SEVERITY: self.severity,
        alert.ALERT_ATTR_URGENCY: 'I',
        alert.ALERT_ATTR_EVENT_LOC_OBJECT: event.get_src_loc(),
        alert.ALERT_ATTR_RECOMMENDATION: recommendation,
        alert.ALERT_ATTR_REASON: reason,
        alert.ALERT_ATTR_RAW_DATA: raw_data,
        alert.ALERT_ATTR_SRC_NAME: self.get_name(),
        alert.ALERT_ATTR_CONDITION_EVENTS: set((event,))
    }

    # Allocate and commit the ENDJOB01 alert, then report it through the pipeline
    alert_mgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    bg_alert = alert_mgr.allocate('ENDJOB01', in_dict=alert_dict)
    alert_mgr.commit(bg_alert, disable_dup=False)
    log.info("Sending alert for msgid = %s recid = %s" % (msg_id, rec_id))
    self.send_alert(bg_alert)
    return
def analyze_event(self, event):
    '''Analyze a RAS event and determine whether threshold has been reached or exceeded.

    Counts occurrences of the event's message id (optionally restricted to a
    configured period, serial number and location), and when the configured
    threshold — and, if configured, the required number of consecutive
    periods — is met, allocates and sends a THRES01 alert.  Events from
    diagnostics runs and certain 00040020 StatusWord values are suppressed.
    '''
    msg_id = event.get_event_id()
    rec_id = event.get_rec_id()
    registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))
    count = event.get_event_cnt()
    event_time = event.get_time_logged()
    # Location string with its 3-character prefix removed; may be empty
    location = str(event.get_src_loc())
    location = location[3:].strip()
    # Exclude event logged from DIAG run
    if event.raw_data['diags'] == 'T':
        registry.get_logger().debug(
            'RAS Event generated by Diagnostics, skip creating an alert')
        return
    # Set threshold value (raises KeyError if msg_id is unconfigured —
    # presumably analyzers are only registered for configured ids; confirm)
    threshold = self.msgidCount[msg_id]
    tmsg = "Error threshold of " + str(
        threshold) + " has been reached or exceeded, total count is "
    # Check if threshold has been reached or exceeded.
    # Pick the query template: with or without a threshold period, and with
    # or without an event-count column (count truthy selects the *2 variant).
    xmsg = ""
    query = self.period_query
    if msg_id in self.msgidPeriod:
        # Query for the count of the RAS event with threhold period
        xmsg = " in a period of " + self.msgidPeriod[msg_id].strip()
        query = self.period_query
        if count:
            query = self.period_query2
        query = query.replace('PERIOD', self.msgidPeriod[msg_id].strip())
        query = query.replace('MYTIME', str(event_time))
    else:
        # Query for the count of the RAS event without threshold period
        query = self.count_query
        if count:
            query = self.count_query2
    # Substitute the SN and LOC placeholders in the query template.
    # NOTE(review): serialnumber/location are spliced into the SQL text —
    # confirm they cannot contain quote characters (SQL injection risk).
    if event.raw_data['serialnumber'] is not None:
        serialnumber = event.raw_data['serialnumber'].strip()
        sn = "= '" + serialnumber + "'"
        query = query.replace('SN', sn)
    else:
        serialnumber = None
        query = query.replace('SN', 'is NULL')
    if location:
        loc = "= '" + location + "'"
        query = query.replace('LOC', loc)
    else:
        query = query.replace('LOC', 'is NULL')
    registry.get_logger().debug(query + " msgId=" + msg_id + " event_time=" + str(event_time))
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    # msg_id and event_time are bound as positional query parameters
    cursor.execute(query, msg_id, event_time)
    row = cursor.fetchone()
    msgCount = row[0]
    if msgCount < threshold:
        registry.get_logger().info("Alert is not sent for msgid " + msg_id + " recid " + str(rec_id) + " because the count " + str(msgCount) + " is less than the threshold " + str(threshold) + ".")
        return
    if msg_id in self.msgidConsecutivePeriods:
        # repeat the query for M-1 more periods (the current period was
        # already checked above); every period must meet the threshold
        numPeriods = self.msgidConsecutivePeriods[msg_id]
        numPeriods = numPeriods - 1
        period = self.msgidPeriod[msg_id].strip()
        period = period.strip()
        deltaPeriod = self.delta_period(period)
        registry.get_logger().debug(
            "Checking whether " + msg_id + " recid " + str(rec_id) +
            " has exceeded its threshold for " +
            str(self.msgidConsecutivePeriods[msg_id]) +
            " consecutive periods of " + period + ". Delta period = " +
            str(deltaPeriod))
        qry_time = event_time
        for nums in range(numPeriods):
            # Rebuild the period query for each earlier window, stepping
            # qry_time back by one deltaPeriod per iteration
            query = self.period_query2
            if serialnumber:
                sn = "= '" + serialnumber + "'"
                query = query.replace('SN', sn)
            else:
                query = query.replace('SN', 'is NULL')
            if location:
                loc = "= '" + location + "'"
                query = query.replace('LOC', loc)
            else:
                query = query.replace('LOC', 'is NULL')
            query = query.replace('PERIOD', period)
            qry_time = qry_time - deltaPeriod
            query = query.replace('MYTIME', str(qry_time))
            registry.get_logger().debug(query + " msgId=" + msg_id + " event_time=" + str(qry_time))
            cursor.execute(query, msg_id, qry_time)
            row = cursor.fetchone()
            msgCount = row[0]
            if msgCount < threshold:
                # One period below threshold is enough to suppress the alert
                registry.get_logger().info("Alert is not sent for msgid " + msg_id + " recid " + str(rec_id) + " because the count " + str(msgCount) + " is less than the threshold " + str(threshold) + ". period=" + str(nums + 2) + " of " + str(numPeriods + 1))
                return
            else:
                registry.get_logger().debug("Threshold exceeded " + msg_id + " recid " + str(rec_id) + " for consecutive period " + str(nums + 2) + " of " + str(numPeriods + 1))
    msgText = event.raw_data['message'].strip()
    # Special case: suppress 00040020 alerts for known-benign StatusWord values
    if msg_id == '00040020':
        skipAlert = False
        index = msgText.find('StatusWord=0x0002')
        if (index >= 0):
            skipAlert = True
        else:
            index = msgText.find('StatusWord=0x4001')
            if (index >= 0):
                skipAlert = True
            else:
                index = msgText.find('StatusWord=0x4005')
                if (index >= 0):
                    skipAlert = True
        if (skipAlert):
            # All three StatusWord tokens have the same length, so slicing by
            # len('StatusWord=0x0002') extracts whichever token matched
            registry.get_logger().debug("Supressing alert for " + msg_id + " because " + msgText[index:index + len('StatusWord=0x0002')])
            return
    # Assemble the human-readable alert text from the event's raw fields
    tmsg = tmsg + str(msgCount) + xmsg
    reason = tmsg + "\nRAS event details:" \
        " message id = " + msg_id + \
        ", recid = " + str(rec_id) + \
        ", timestamp = " + str(event.get_time_occurred()) + \
        ", serial number = " + str(event.raw_data['serialnumber']) + \
        ", ecid = " + self.ecidString(event.raw_data['ecid']) + \
        ", jobid = " + str(event.raw_data['jobid']) + \
        ", block = " + str(event.raw_data['block'])
    rasMessage = "RAS Message: " + msgText
    recommendation = self.recommendation
    if location:
        # A location implies a specific part may need replacing
        recommendation = recommendation + " Schedule part replacement if this is hardware problem. " + self.msgidService[
            msg_id]
    else:
        recommendation = recommendation + " " + self.msgidService[msg_id]
    alert_dict = {
        alert.ALERT_ATTR_SEVERITY: self.severity,
        alert.ALERT_ATTR_URGENCY: 'I',
        alert.ALERT_ATTR_EVENT_LOC_OBJECT: event.get_src_loc(),
        alert.ALERT_ATTR_RECOMMENDATION: recommendation,
        alert.ALERT_ATTR_REASON: reason,
        alert.ALERT_ATTR_RAW_DATA: rasMessage,
        alert.ALERT_ATTR_SRC_NAME: self.get_name(),
        alert.ALERT_ATTR_CONDITION_EVENTS: set((event, ))
    }
    # Get the alert manager to create/allocate/commit the alert
    alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    alert_id = 'THRES01'
    bg_alert = alertMgr.allocate(alert_id, in_dict=alert_dict)
    alertMgr.commit(bg_alert, disable_dup=False)
    # Now the alert is created and can be reported through the pipeline
    registry.get_logger().info("Sending alert for msgid = " + msg_id + " recid = " + str(rec_id))
    self.send_alert(bg_alert)
    return