class SchedulerService(TornadoService):
    """
    Service running the global job scheduler.
    """
    name = "scheduler"
    leader_lock_name = "scheduler"
    use_mongo = True

    async def on_activate(self):
        # Start the scheduler loop; reset_running=True clears stale "running"
        # job markers left behind by a previously crashed instance
        self.scheduler = Scheduler("scheduler", reset_running=True, max_threads=config.scheduler.max_threads)
        self.scheduler.run()
class SchedulerService(Service):
    """
    Service running the global job scheduler
    (legacy Tornado-coroutine variant).
    """
    name = "scheduler"
    leader_group_name = "scheduler"
    leader_lock_name = "scheduler"

    @tornado.gen.coroutine
    def on_activate(self):
        # Start the scheduler bound to this service's IOLoop;
        # reset_running=True clears stale "running" job markers
        # left behind by a previously crashed instance
        self.scheduler = Scheduler(
            "scheduler",
            reset_running=True,
            max_threads=config.scheduler.max_threads,
            ioloop=self.ioloop
        )
        self.scheduler.run()
class DiscoveryService(Service):
    """
    Pooled discovery service. Runs discovery jobs via a per-pool scheduler,
    optionally sharding the job set across several service instances (slots).
    """
    name = "discovery"
    leader_group_name = "discovery-%(pool)s"
    pooled = True
    require_nsq_writer = True
    use_mongo = True
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super(DiscoveryService, self).__init__()
        self.send_callback = None
        # Slot assignment for distributed mode; filled in on_activate()
        self.slot_number = 0
        self.total_slots = 0

    @tornado.gen.coroutine
    def on_activate(self):
        # Acquire our shard slot; when more than one slot exists, each
        # instance only picks jobs whose key matches its slot modulo
        # the total number of slots
        self.slot_number, self.total_slots = yield self.acquire_slot()
        if self.total_slots > 1:
            self.logger.info("Enabling distributed mode: Slot %d/%d",
                             self.slot_number, self.total_slots)
            ifilter = {"key": {"$mod": [self.total_slots, self.slot_number]}}
        else:
            self.logger.info("Enabling standalone mode")
            ifilter = None
        # reset_running=True clears stale "running" job markers
        # left behind by a previously crashed instance
        self.scheduler = Scheduler(
            "discovery",
            pool=config.pool,
            reset_running=True,
            max_threads=config.discovery.max_threads,
            ioloop=self.ioloop,
            filter=ifilter,
            service=self,
            sample=config.discovery.sample,
        )
        self.scheduler.run()

    def get_mon_data(self):
        # Augment base monitoring data with scheduler metrics, when running
        r = super(DiscoveryService, self).get_mon_data()
        if self.scheduler:
            self.scheduler.apply_metrics(r)
        return r
class CorrelatorService(Service):
    """
    Fault-management correlator (legacy variant). Disposes classified events
    into alarms according to disposition rules, performs rule-based and
    topology-based root cause analysis (RCA), and fires alarm triggers.
    """
    name = "correlator"
    pooled = True
    leader_lock_name = "correlator-%(pool)s"
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super(CorrelatorService, self).__init__()
        self.version = version.version
        self.rules = {}  # event_class -> [Rule]
        self.back_rules = {}  # event_class -> [Rule]
        self.triggers = {}  # alarm_class -> [Trigger1, .. , TriggerN]
        self.rca_forward = {}  # alarm_class -> [RCA condition, ..., RCA condititon]
        self.rca_reverse = defaultdict(set)  # alarm_class -> set([alarm_class])
        self.scheduler = None
        # Serialize rule-based and topology-based RCA passes
        self.rca_lock = Lock()
        self.topology_rca_lock = Lock()

    def on_activate(self):
        """
        Start the per-pool scheduler and subscribe to the dispose queue.
        """
        self.scheduler = Scheduler(
            self.name,
            pool=config.pool,
            reset_running=True,
            max_threads=config.correlator.max_threads,
            ioloop=self.ioloop,
            # @fixme have to be configured ?
            submit_threshold=100,
            max_chunk=100)
        self.scheduler.correlator = self
        self.subscribe("correlator.dispose.%s" % config.pool,
                       "dispose",
                       self.on_dispose_event,
                       max_in_flight=config.correlator.max_threads)
        self.scheduler.run()

    def on_start(self):
        """
        Load rules from database just after loading config
        """
        super(CorrelatorService, self).on_start()
        self.load_rules()
        self.load_triggers()
        self.load_rca_rules()

    def load_rules(self):
        """
        Load rules from database
        """
        self.logger.debug("Loading rules")
        self.rules = {}
        self.back_rules = {}
        nr = 0
        nbr = 0
        for c in EventClass.objects.all():
            if c.disposition:
                r = []
                for dr in c.disposition:
                    rule = Rule(c, dr)
                    r += [rule]
                    nr += 1
                    if dr.combo_condition != "none" and dr.combo_window:
                        # Register the combo rule against every event class
                        # which may complete the delayed condition
                        for cc in dr.combo_event_classes:
                            try:
                                self.back_rules[cc.id] += [dr]
                            except KeyError:
                                self.back_rules[cc.id] = [dr]
                            nbr += 1
                self.rules[c.id] = r
        self.logger.debug("%d rules are loaded. %d combos" % (nr, nbr))

    def load_triggers(self):
        """
        Load enabled alarm triggers and index them by matching alarm class.
        """
        self.logger.info("Loading triggers")
        self.triggers = {}
        n = 0
        cn = 0
        ec = [(c.name, c.id) for c in AlarmClass.objects.all()]
        for t in AlarmTrigger.objects.filter(is_enabled=True):
            self.logger.debug("Trigger '%s' for classes:" % t.name)
            for c_name, c_id in ec:
                if re.search(t.alarm_class_re, c_name, re.IGNORECASE):
                    try:
                        self.triggers[c_id] += [Trigger(t)]
                    except KeyError:
                        self.triggers[c_id] = [Trigger(t)]
                    cn += 1
                    self.logger.debug("    %s" % c_name)
            n += 1
        self.logger.info("%d triggers has been loaded to %d classes" % (n, cn))

    def load_rca_rules(self):
        """
        Load root cause analisys rules
        """
        self.logger.info("Loading RCA Rules")
        n = 0
        self.rca_forward = {}
        self.rca_reverse = {}
        for a in AlarmClass.objects.filter(root_cause__0__exists=True):
            if not a.root_cause:
                continue
            self.rca_forward[a.id] = []
            for c in a.root_cause:
                rc = RCACondition(a, c)
                self.rca_forward[a.id] += [rc]
                if rc.root.id not in self.rca_reverse:
                    self.rca_reverse[rc.root.id] = []
                self.rca_reverse[rc.root.id] += [rc]
                n += 1
        self.logger.info("%d RCA Rules have been loaded" % n)

    def mark_as_failed(self, event):
        """
        Write error log and mark event as failed
        """
        self.logger.error("Failed to process event %s" % str(event.id))
        # Prepare traceback
        t, v, tb = sys.exc_info()
        now = datetime.datetime.now()
        r = ["UNHANDLED EXCEPTION (%s)" % str(now)]
        r += [str(t), str(v)]
        r += [format_frames(get_traceback_frames(tb))]
        r = "\n".join(r)
        event.mark_as_failed(version=self.version, traceback=r)

    def set_root_cause(self, a):
        """
        Search for root cause and set, if found
        :returns: Boolean. True, if root cause set
        """
        for rc in self.rca_forward[a.alarm_class.id]:
            # Check condition
            if not rc.check_condition(a):
                continue
            # Check match condition
            q = rc.get_match_condition(a)
            root = ActiveAlarm.objects.filter(**q).first()
            if root:
                # Root cause found
                self.logger.info("%s is root cause for %s (Rule: %s)",
                                 root.id, a.id, rc.name)
                metrics["alarm_correlated_rule"] += 1
                a.set_root(root)
                return True
        return False

    def set_reverse_root_cause(self, a):
        """
        Set *a* as root cause for existing events
        :param a:
        :return: True if at least one existing alarm was re-rooted to *a*
        """
        found = False
        for rc in self.rca_reverse[a.alarm_class.id]:
            # Check reverse match condition
            q = rc.get_reverse_match_condition(a)
            for ca in ActiveAlarm.objects.filter(**q):
                # Check condition
                if not rc.check_condition(ca):
                    continue
                # Try to set root cause
                qq = rc.get_match_condition(ca, id=a.id)
                rr = ActiveAlarm.objects.filter(**qq).first()
                if rr:
                    # Reverse root cause found
                    self.logger.info(
                        "%s is root cause for %s (Reverse rule: %s)",
                        a.id, ca.id, rc.name)
                    metrics["alarm_correlated_rule"] += 1
                    ca.set_root(a)
                    found = True
        return found

    def raise_alarm(self, r, e):
        """
        Raise a new alarm (or contribute to / reopen an existing one)
        from event *e* according to disposition rule *r*.
        """
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info("Empty managed object, ignoring")
            return
        # @todo: Make configurable
        if not managed_object.is_managed:
            self.logger.info("Managed object is not managed. Do not raise alarm")
            return
        if e.managed_object.id != managed_object.id:
            metrics["alarm_change_mo"] += 1
            self.logger.info("Changing managed object to %s",
                             managed_object.name)
        discriminator, vars = r.get_vars(e)
        if r.unique:
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if not a:
                # Try to reopen alarm
                a = ArchivedAlarm.objects.filter(
                    managed_object=managed_object.id,
                    discriminator=discriminator,
                    control_time__gte=e.timestamp).first()
                if a:
                    # Reopen alarm
                    self.logger.info("[%s|%s|%s] %s reopens alarm %s(%s)",
                                     e.id, managed_object.name,
                                     managed_object.address,
                                     e.event_class.name,
                                     a.alarm_class.name, a.id)
                    a = a.reopen("Reopened by disposition rule '%s'" % r.u_name)
                    metrics["alarm_reopen"] += 1
            if a:
                # Active alarm found, refresh
                self.logger.info(
                    "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                    e.id, managed_object.name, managed_object.address,
                    e.event_class.name, a.alarm_class.name, a.id)
                # Contribute event to alarm
                e.contribute_to_alarm(a)
                if e.timestamp < a.timestamp:
                    # Set to earlier date
                    a.timestamp = e.timestamp
                    a.save()
                elif e.timestamp > a.last_update:
                    # Refresh last update
                    a.last_update = e.timestamp
                    a.save()
                metrics["alarm_contribute"] += 1
                return
        # Calculate alarm coverage
        summary = ServiceSummary.get_object_summary(managed_object)
        summary["object"] = {managed_object.object_profile.id: 1}
        # Severity is never below 1, even for an empty summary
        severity = max(ServiceSummary.get_severity(summary), 1)
        self.logger.info("[%s|%s|%s] %s: Calculated alarm severity is: %s",
                         e.id, managed_object.name, managed_object.address,
                         r.u_name, severity)
        # Create new alarm
        a = ActiveAlarm(
            timestamp=e.timestamp,
            last_update=e.timestamp,
            managed_object=managed_object.id,
            alarm_class=r.alarm_class,
            severity=severity,
            vars=vars,
            discriminator=discriminator,
            direct_services=SummaryItem.dict_to_items(summary["service"]),
            direct_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
            total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
            total_services=SummaryItem.dict_to_items(summary["service"]),
            total_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
            log=[
                AlarmLog(timestamp=datetime.datetime.now(),
                         from_status="A",
                         to_status="A",
                         message="Alarm risen from event %s(%s) by rule '%s'"
                                 % (str(e.id), str(e.event_class.name), r.u_name))
            ],
            opening_event=e.id)
        a.save()
        e.contribute_to_alarm(a)
        self.logger.info("[%s|%s|%s] %s raises alarm %s(%s): %r",
                         e.id, managed_object.name, managed_object.address,
                         e.event_class.name, a.alarm_class.name, a.id, a.vars)
        metrics["alarm_raise"] += 1
        self.correlate(r, a)
        # Notify about new alarm
        if not a.root:
            a.managed_object.event(
                a.managed_object.EV_ALARM_RISEN, {
                    "alarm": a,
                    "subject": a.subject,
                    "body": a.body,
                    "symptoms": a.alarm_class.symptoms,
                    "recommended_actions": a.alarm_class.recommended_actions,
                    "probable_causes": a.alarm_class.probable_causes
                },
                delay=a.alarm_class.get_notification_delay())
        # Gather diagnostics when necessary
        AlarmDiagnosticConfig.on_raise(a)
        # Watch for escalations, when necessary
        if config.correlator.auto_escalation and not a.root:
            AlarmEscalation.watch_escalations(a)

    def correlate(self, r, a):
        """
        Run RCA, class handlers and triggers for freshly-raised alarm *a*.
        Drops the alarm if a handler resets its severity to 0.
        """
        # Topology RCA
        if a.alarm_class.topology_rca:
            with self.topology_rca_lock:
                self.topology_rca(a)
        # Rule-based RCA
        with self.rca_lock:
            if a.alarm_class.id in self.rca_forward:
                # Check alarm is a consequence of existing one
                self.set_root_cause(a)
            if a.alarm_class.id in self.rca_reverse:
                # Check alarm is the root cause for existing ones
                self.set_reverse_root_cause(a)
        # Call handlers
        for h in a.alarm_class.get_handlers():
            try:
                has_root = bool(a.root)
                h(a)
                if not has_root and a.root:
                    self.logger.info("[%s|%s|%s] Set root to %s (handler %s)",
                                     a.id, a.managed_object.name,
                                     a.managed_object.address, a.root, h)
            except Exception:  # noqa. Can probable happens anything from handler
                error_report()
                metrics["error", ("type", "alarm_handler")] += 1
        # Call triggers if necessary
        if r.alarm_class.id in self.triggers:
            for t in self.triggers[r.alarm_class.id]:
                try:
                    t.call(a)
                except Exception:  # noqa. Can probable happens anything from trigger
                    error_report()
        #
        if not a.severity:
            # Alarm severity has been reset to 0 by handlers
            # Silently drop alarm
            self.logger.debug("Alarm severity is 0, dropping")
            a.delete()
            metrics["alarm_drop"] += 1
            return

    def clear_alarm(self, r, e):
        """
        Clear an active alarm matching disposition rule *r* by event *e*.
        """
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info(
                "[%s|Unknown|Unknown] Referred to unknown managed object, ignoring",
                e.id)
            metrics["unknown_object"] += 1
            return
        if r.unique:
            discriminator, vars = r.get_vars(e)
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if a:
                self.logger.info("[%s|%s|%s] %s clears alarm %s(%s)",
                                 e.id, managed_object.name,
                                 managed_object.address,
                                 e.event_class.name,
                                 a.alarm_class.name, a.id)
                e.contribute_to_alarm(a)
                a.closing_event = e.id
                a.last_update = max(a.last_update, e.timestamp)
                a.clear_alarm("Cleared by disposition rule '%s'" % r.u_name,
                              ts=e.timestamp)
                metrics["alarm_clear"] += 1

    def get_delayed_event(self, r, e):
        """
        Check wrether all delayed conditions are met
        :param r: Delayed rule
        :param e: Event which can trigger delayed rule
        """
        # @todo: Rewrite to scheduler
        discriminator, vars = r.get_vars(e)
        ws = e.timestamp - datetime.timedelta(seconds=r.combo_window)
        de = ActiveEvent.objects.filter(managed_object=e.managed_object_id,
                                        event_class=r.event_class,
                                        discriminator=discriminator,
                                        timestamp__gte=ws).first()
        if not de:
            # No starting event
            return None
        # Probable starting event found, get all interesting following event
        # classes
        fe = [
            ee.event_class.id
            for ee in ActiveEvent.objects.filter(
                managed_object=e.managed_object_id,
                event_class__in=r.combo_event_classes,
                discriminator=discriminator,
                timestamp__gte=ws).order_by("timestamp")
        ]
        if r.combo_condition == "sequence":
            # Exact match
            # Fixed: was `self.combo_event_classes`, which does not exist on
            # the service and raised AttributeError; the rule holds the list.
            # NOTE(review): fe holds event-class ids while combo_event_classes
            # holds class documents — confirm their __eq__ semantics match.
            if fe == r.combo_event_classes:
                return de
        elif r.combo_condition == "all":
            # All present
            if not any([c for c in r.combo_event_classes if c not in fe]):
                return de
        elif r.combo_condition == "any":
            # Any found
            if fe:
                return de
        return None

    def eval_expression(self, expression, **kwargs):
        """
        Evaluate expression in given context
        """
        # SECURITY NOTE: expressions come from disposition rules and are
        # executed with eval(); rules must only be editable by trusted admins
        env = {"re": re, "utils": utils}
        env.update(kwargs)
        return eval(expression, {}, env)

    def on_dispose_event(self, message, event_id, event=None, *args, **kwargs):
        """
        Called on new dispose message
        """
        self.logger.info("[%s] Receiving message", event_id)
        message.enable_async()
        self.get_executor("max").submit(self.dispose_worker, message,
                                        event_id, event)

    def dispose_worker(self, message, event_id, event_hint=None):
        """
        Executor worker: resolve the event (from hint or DB) and dispose it,
        then acknowledge the message on the IOLoop.
        """
        metrics["alarm_dispose"] += 1
        try:
            if event_hint:
                event = self.get_event_from_hint(event_hint)
            else:
                event = self.lookup_event(event_id)
            if event:
                self.dispose_event(event)
        except Exception:
            metrics["alarm_dispose_error"] += 1
            error_report()
        self.ioloop.add_callback(message.finish)

    def lookup_event(self, event_id):
        """
        Lookup event by id. Uses cache heating effect from classifier
        :param event_id:
        :return: ActiveEvent instance or None
        """
        self.logger.info("[%s] Lookup event", event_id)
        e = ActiveEvent.get_by_id(event_id)
        if not e:
            self.logger.info("[%s] Event not found, skipping", event_id)
            metrics["event_lookup_failed"] += 1
        metrics["event_lookups"] += 1
        return e

    def get_event_from_hint(self, hint):
        """
        Get ActiveEvent from json hint
        :param hint:
        :return:
        """
        metrics["event_hints"] += 1
        e = ActiveEvent.from_json(hint)
        # Prevent TypeError: can't compare offset-naive and offset-aware datetimes
        # when calculating alarm timestamp
        e.timestamp = e.timestamp.replace(tzinfo=None)
        return e

    def dispose_event(self, e):
        """
        Dispose event according to disposition rule
        """
        event_id = str(e.id)
        self.logger.info("[%s] Disposing", event_id)
        drc = self.rules.get(e.event_class.id)
        if not drc:
            self.logger.info(
                "[%s] No disposition rules for class %s, skipping",
                event_id, e.event_class.name)
            return
        # Apply disposition rules
        for r in drc:
            if self.eval_expression(r.condition, event=e):
                # Process action
                if r.action == "drop":
                    self.logger.info("[%s] Dropped by action", event_id)
                    e.delete()
                    return
                elif r.action == "ignore":
                    self.logger.info("[%s] Ignored by action", event_id)
                    return
                elif r.action == "raise" and r.combo_condition == "none":
                    self.raise_alarm(r, e)
                elif r.action == "clear" and r.combo_condition == "none":
                    self.clear_alarm(r, e)
                if r.action in ("raise", "clear"):
                    # Write discriminator if can trigger delayed event
                    if r.unique and r.event_class.id in self.back_rules:
                        discriminator, vars = r.get_vars(e)
                        e.discriminator = discriminator
                        e.save()
                    # Process delayed combo conditions
                    if e.event_class.id in self.back_rules:
                        for br in self.back_rules[e.event_class.id]:
                            de = self.get_delayed_event(br, e)
                            if de:
                                if br.action == "raise":
                                    self.raise_alarm(br, de)
                                elif br.action == "clear":
                                    self.clear_alarm(br, de)
                if r.stop_disposition:
                    break
        self.logger.info("[%s] Disposition complete", event_id)

    def topology_rca(self, alarm, seen=None):
        """
        Recursive topology-based RCA: if all uplinks of an alarmed object
        are also alarmed, root the alarm to the first correlatable uplink,
        then recurse into neighboring alarms.
        """
        def can_correlate(a1, a2):
            # Alarms are correlatable when within the configured time window
            # (a window of 0 disables the check)
            return (not config.correlator.topology_rca_window or
                    total_seconds(a1.timestamp - a2.timestamp) <=
                    config.correlator.topology_rca_window)

        self.logger.debug("[%s] Topology RCA", alarm.id)
        seen = seen or set()
        if alarm.root or alarm.id in seen:
            self.logger.debug("[%s] Already correlated", alarm.id)
            return  # Already correlated
        seen.add(alarm.id)
        # Get neighboring alarms
        na = {}
        # Downlinks
        q = Q(alarm_class=alarm.alarm_class.id,
              uplinks=alarm.managed_object.id)
        # Uplinks
        # @todo: Try to use $graphLookup to find affinity alarms
        if alarm.uplinks:
            q |= Q(alarm_class=alarm.alarm_class.id,
                   managed_object__in=list(alarm.uplinks))
        for a in ActiveAlarm.objects.filter(q):
            na[a.managed_object.id] = a
        # Correlate with uplinks
        if alarm.uplinks and len([na[o] for o in alarm.uplinks
                                  if o in na]) == len(alarm.uplinks):
            # All uplinks are faulty
            # uplinks are ordered according to path length
            # Correlate with first applicable
            self.logger.info("[%s] All uplinks are faulty. Correlating",
                             alarm.id)
            for u in alarm.uplinks:
                a = na[u]
                if can_correlate(alarm, a):
                    self.logger.info("[%s] Set root to %s", alarm.id, a.id)
                    alarm.set_root(a)
                    metrics["alarm_correlated_topology"] += 1
                    break
        # Correlate neighbors' alarms
        for d in na:
            self.topology_rca(na[d], seen)
        self.logger.debug("[%s] Correlation completed", alarm.id)
class CorrelatorService(TornadoService):
    """
    Fault-management correlator (asyncio/TornadoService variant).
    Consumes dispose messages from a Liftbridge stream, disposes classified
    events into alarms according to disposition rules and performs rule-based,
    topology-based and downlink-merge root cause analysis (RCA).
    """
    name = "correlator"
    pooled = True
    use_mongo = True
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super().__init__()
        self.version = version.version
        self.rules = {}  # event_class -> [Rule]
        self.back_rules = {}  # event_class -> [Rule]
        self.triggers = {}  # alarm_class -> [Trigger1, .. , TriggerN]
        self.rca_forward = {}  # alarm_class -> [RCA condition, ..., RCA condititon]
        self.rca_reverse = defaultdict(set)  # alarm_class -> set([alarm_class])
        # Slot assignment for distributed mode; filled in on_activate()
        self.slot_number = 0
        self.total_slots = 0
        self.is_distributed = False
        # Scheduler
        self.scheduler: Optional[Scheduler] = None
        # Distributed topology RCA lock; held from topology_rca() until the
        # end of the current dispose cycle (released in on_dispose_event)
        self.topo_rca_lock: Optional[RCALock] = None

    async def on_activate(self):
        self.slot_number, self.total_slots = await self.acquire_slot()
        self.is_distributed = self.total_slots > 1
        # Prepare scheduler
        if self.is_distributed:
            self.logger.info("Enabling distributed mode: Slot %d/%d",
                             self.slot_number, self.total_slots)
            ifilter = {"shard": {"$mod": [self.total_slots, self.slot_number]}}
        else:
            self.logger.info("Enabling standalone mode")
            ifilter = None
        self.scheduler = Scheduler(
            self.name,
            pool=config.pool,
            reset_running=True,
            max_threads=config.correlator.max_threads,
            # @fixme have to be configured ?
            submit_threshold=100,
            max_chunk=100,
            filter=ifilter,
        )
        self.scheduler.correlator = self
        self.scheduler.run()
        # Subscribe stream, move to separate task to let the on_activate to terminate
        self.loop.create_task(
            self.subscribe_stream("dispose.%s" % config.pool,
                                  self.slot_number,
                                  self.on_dispose_event))

    def on_start(self):
        """
        Load rules from database just after loading config
        """
        super().on_start()
        # use_mongo: the Mongo connection must be established here,
        # after on_start(), before the rule loaders touch the database
        connect()
        self.load_rules()
        self.load_triggers()
        self.load_rca_rules()

    def load_rules(self):
        """
        Load rules from database
        """
        self.logger.debug("Loading rules")
        self.rules = {}
        self.back_rules = {}
        nr = 0
        nbr = 0
        for c in EventClass.objects.all():
            if c.disposition:
                r = []
                for dr in c.disposition:
                    rule = Rule(c, dr)
                    r += [rule]
                    nr += 1
                    if dr.combo_condition != "none" and dr.combo_window:
                        # Register the combo rule against every event class
                        # which may complete the delayed condition
                        for cc in dr.combo_event_classes:
                            try:
                                self.back_rules[cc.id] += [dr]
                            except KeyError:
                                self.back_rules[cc.id] = [dr]
                            nbr += 1
                self.rules[c.id] = r
        self.logger.debug("%d rules are loaded. %d combos" % (nr, nbr))

    def load_triggers(self):
        """
        Load enabled alarm triggers and index them by matching alarm class.
        """
        self.logger.info("Loading triggers")
        self.triggers = {}
        n = 0
        cn = 0
        ec = [(c.name, c.id) for c in AlarmClass.objects.all()]
        for t in AlarmTrigger.objects.filter(is_enabled=True):
            self.logger.debug("Trigger '%s' for classes:" % t.name)
            for c_name, c_id in ec:
                if re.search(t.alarm_class_re, c_name, re.IGNORECASE):
                    try:
                        self.triggers[c_id] += [Trigger(t)]
                    except KeyError:
                        self.triggers[c_id] = [Trigger(t)]
                    cn += 1
                    self.logger.debug("    %s" % c_name)
            n += 1
        self.logger.info("%d triggers has been loaded to %d classes" % (n, cn))

    def load_rca_rules(self):
        """
        Load root cause analisys rules
        """
        self.logger.info("Loading RCA Rules")
        n = 0
        self.rca_forward = {}
        self.rca_reverse = {}
        for a in AlarmClass.objects.filter(root_cause__0__exists=True):
            if not a.root_cause:
                continue
            self.rca_forward[a.id] = []
            for c in a.root_cause:
                rc = RCACondition(a, c)
                self.rca_forward[a.id] += [rc]
                if rc.root.id not in self.rca_reverse:
                    self.rca_reverse[rc.root.id] = []
                self.rca_reverse[rc.root.id] += [rc]
                n += 1
        self.logger.info("%d RCA Rules have been loaded" % n)

    def mark_as_failed(self, event):
        """
        Write error log and mark event as failed
        """
        self.logger.error("Failed to process event %s" % str(event.id))
        # Prepare traceback
        t, v, tb = sys.exc_info()
        now = datetime.datetime.now()
        r = ["UNHANDLED EXCEPTION (%s)" % str(now)]
        r += [str(t), str(v)]
        r += [format_frames(get_traceback_frames(tb))]
        r = "\n".join(r)
        event.mark_as_failed(version=self.version, traceback=r)

    def set_root_cause(self, a):
        """
        Search for root cause and set, if found
        :returns: Boolean. True, if root cause set
        """
        for rc in self.rca_forward[a.alarm_class.id]:
            # Check condition
            if not rc.check_condition(a):
                continue
            # Check match condition
            q = rc.get_match_condition(a)
            root = ActiveAlarm.objects.filter(**q).first()
            if root:
                # Root cause found
                self.logger.info("%s is root cause for %s (Rule: %s)",
                                 root.id, a.id, rc.name)
                metrics["alarm_correlated_rule"] += 1
                a.set_root(root, rca_type=RCA_RULE)
                return True
        return False

    def set_reverse_root_cause(self, a):
        """
        Set *a* as root cause for existing events
        :param a:
        :return: True if at least one existing alarm was re-rooted to *a*
        """
        found = False
        for rc in self.rca_reverse[a.alarm_class.id]:
            # Check reverse match condition
            q = rc.get_reverse_match_condition(a)
            for ca in ActiveAlarm.objects.filter(**q):
                # Check condition
                if not rc.check_condition(ca):
                    continue
                # Try to set root cause
                qq = rc.get_match_condition(ca, id=a.id)
                rr = ActiveAlarm.objects.filter(**qq).first()
                if rr:
                    # Reverse root cause found
                    self.logger.info(
                        "%s is root cause for %s (Reverse rule: %s)",
                        a.id, ca.id, rc.name)
                    metrics["alarm_correlated_rule"] += 1
                    ca.set_root(a, rca_type=RCA_RULE)
                    found = True
        return found

    async def raise_alarm(self, r, e):
        """
        Raise a new alarm (or contribute to / reopen an existing one)
        from event *e* according to disposition rule *r*.
        """
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info("Empty managed object, ignoring")
            return
        # @todo: Make configurable
        if not managed_object.is_managed:
            self.logger.info("Managed object is not managed. Do not raise alarm")
            return
        if e.managed_object.id != managed_object.id:
            metrics["alarm_change_mo"] += 1
            self.logger.info("Changing managed object to %s",
                             managed_object.name)
        discriminator, vars = r.get_vars(e)
        if r.unique:
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if not a:
                # Try to reopen alarm
                a = ArchivedAlarm.objects.filter(
                    managed_object=managed_object.id,
                    discriminator=discriminator,
                    control_time__gte=e.timestamp,
                ).first()
                if a:
                    # Reopen alarm
                    self.logger.info(
                        "[%s|%s|%s] %s reopens alarm %s(%s)",
                        e.id,
                        managed_object.name,
                        managed_object.address,
                        e.event_class.name,
                        a.alarm_class.name,
                        a.id,
                    )
                    a = a.reopen("Reopened by disposition rule '%s'" % r.u_name)
                    metrics["alarm_reopen"] += 1
            if a:
                # Active alarm found, refresh
                self.logger.info(
                    "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                # Contribute event to alarm
                e.contribute_to_alarm(a)
                if e.timestamp < a.timestamp:
                    # Set to earlier date
                    a.timestamp = e.timestamp
                    a.save()
                elif e.timestamp > a.last_update:
                    # Refresh last update
                    a.last_update = e.timestamp
                    a.save()
                metrics["alarm_contribute"] += 1
                return
        # Calculate alarm coverage
        summary = ServiceSummary.get_object_summary(managed_object)
        summary["object"] = {managed_object.object_profile.id: 1}
        # Severity is never below 1, even for an empty summary
        severity = max(ServiceSummary.get_severity(summary), 1)
        self.logger.info(
            "[%s|%s|%s] %s: Calculated alarm severity is: %s",
            e.id,
            managed_object.name,
            managed_object.address,
            r.u_name,
            severity,
        )
        # Create new alarm
        # NOTE(review): total_services/total_subscribers reuse the very same
        # item lists as direct_* (the legacy variant built separate lists) —
        # confirm downstream code never mutates one of them in place
        direct_services = SummaryItem.dict_to_items(summary["service"])
        direct_subscribers = SummaryItem.dict_to_items(summary["subscriber"])
        a = ActiveAlarm(
            timestamp=e.timestamp,
            last_update=e.timestamp,
            managed_object=managed_object.id,
            alarm_class=r.alarm_class,
            severity=severity,
            vars=vars,
            discriminator=discriminator,
            direct_services=direct_services,
            direct_subscribers=direct_subscribers,
            total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
            total_services=direct_services,
            total_subscribers=direct_subscribers,
            log=[
                AlarmLog(
                    timestamp=datetime.datetime.now(),
                    from_status="A",
                    to_status="A",
                    message="Alarm risen from event %s(%s) by rule '%s'"
                            % (str(e.id), str(e.event_class.name), r.u_name),
                )
            ],
            opening_event=e.id,
        )
        a.save()
        e.contribute_to_alarm(a)
        self.logger.info(
            "[%s|%s|%s] %s raises alarm %s(%s): %r",
            e.id,
            managed_object.name,
            managed_object.address,
            e.event_class.name,
            a.alarm_class.name,
            a.id,
            a.vars,
        )
        metrics["alarm_raise"] += 1
        await self.correlate(r, a)
        # Notify about new alarm
        if not a.root:
            a.managed_object.event(
                a.managed_object.EV_ALARM_RISEN,
                {
                    "alarm": a,
                    "subject": a.subject,
                    "body": a.body,
                    "symptoms": a.alarm_class.symptoms,
                    "recommended_actions": a.alarm_class.recommended_actions,
                    "probable_causes": a.alarm_class.probable_causes,
                },
                delay=a.alarm_class.get_notification_delay(),
            )
        # Gather diagnostics when necessary
        AlarmDiagnosticConfig.on_raise(a)
        # Watch for escalations, when necessary
        if config.correlator.auto_escalation and not a.root:
            AlarmEscalation.watch_escalations(a)

    async def correlate(self, r, a):
        """
        Run RCA, class handlers and triggers for freshly-raised alarm *a*.
        Drops the alarm if a handler resets its severity to 0.
        """
        # Topology RCA
        if a.alarm_class.topology_rca:
            await self.topology_rca(a)
        # Rule-based RCA
        if a.alarm_class.id in self.rca_forward:
            # Check alarm is a consequence of existing one
            self.set_root_cause(a)
        if a.alarm_class.id in self.rca_reverse:
            # Check alarm is the root cause for existing ones
            self.set_reverse_root_cause(a)
        # Call handlers
        for h in a.alarm_class.get_handlers():
            try:
                has_root = bool(a.root)
                h(a)
                if not has_root and a.root:
                    self.logger.info(
                        "[%s|%s|%s] Set root to %s (handler %s)",
                        a.id,
                        a.managed_object.name,
                        a.managed_object.address,
                        a.root,
                        h,
                    )
            except Exception:  # noqa. Can probable happens anything from handler
                error_report()
                metrics["error", ("type", "alarm_handler")] += 1
        # Call triggers if necessary
        if r.alarm_class.id in self.triggers:
            for t in self.triggers[r.alarm_class.id]:
                try:
                    t.call(a)
                except Exception:  # noqa. Can probable happens anything from trigger
                    error_report()
        #
        if not a.severity:
            # Alarm severity has been reset to 0 by handlers
            # Silently drop alarm
            self.logger.debug("Alarm severity is 0, dropping")
            a.delete()
            metrics["alarm_drop"] += 1
            return

    def clear_alarm(self, r, e):
        """
        Clear an active alarm matching disposition rule *r* by event *e*.
        """
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info(
                "[%s|Unknown|Unknown] Referred to unknown managed object, ignoring",
                e.id)
            metrics["unknown_object"] += 1
            return
        if r.unique:
            discriminator, vars = r.get_vars(e)
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if a:
                self.logger.info(
                    "[%s|%s|%s] %s clears alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                e.contribute_to_alarm(a)
                a.closing_event = e.id
                a.last_update = max(a.last_update, e.timestamp)
                a.clear_alarm("Cleared by disposition rule '%s'" % r.u_name,
                              ts=e.timestamp)
                metrics["alarm_clear"] += 1

    def get_delayed_event(self, r, e):
        """
        Check wrether all delayed conditions are met
        :param r: Delayed rule
        :param e: Event which can trigger delayed rule
        """
        # @todo: Rewrite to scheduler
        discriminator, vars = r.get_vars(e)
        ws = e.timestamp - datetime.timedelta(seconds=r.combo_window)
        de = ActiveEvent.objects.filter(
            managed_object=e.managed_object_id,
            event_class=r.event_class,
            discriminator=discriminator,
            timestamp__gte=ws,
        ).first()
        if not de:
            # No starting event
            return None
        # Probable starting event found, get all interesting following event
        # classes
        fe = [
            ee.event_class.id
            for ee in ActiveEvent.objects.filter(
                managed_object=e.managed_object_id,
                event_class__in=r.combo_event_classes,
                discriminator=discriminator,
                timestamp__gte=ws,
            ).order_by("timestamp")
        ]
        if r.combo_condition == "sequence":
            # Exact match
            # Fixed: was `self.combo_event_classes`, which does not exist on
            # the service and raised AttributeError; the rule holds the list.
            # NOTE(review): fe holds event-class ids while combo_event_classes
            # holds class documents — confirm their __eq__ semantics match.
            if fe == r.combo_event_classes:
                return de
        elif r.combo_condition == "all":
            # All present
            if not any([c for c in r.combo_event_classes if c not in fe]):
                return de
        elif r.combo_condition == "any":
            # Any found
            if fe:
                return de
        return None

    def eval_expression(self, expression, **kwargs):
        """
        Evaluate expression in given context
        """
        # SECURITY NOTE: expressions come from disposition rules and are
        # executed with eval(); rules must only be editable by trusted admins
        env = {"re": re, "utils": utils}
        env.update(kwargs)
        return eval(expression, {}, env)

    async def on_dispose_event(self, msg: Message) -> None:
        """
        Called on new dispose message
        """
        data = orjson.loads(msg.value)
        event_id = data["event_id"]
        hint = data["event"]
        self.logger.info("[%s] Receiving message", event_id)
        metrics["alarm_dispose"] += 1
        try:
            event = ActiveEvent.from_json(hint)
            # Prevent comparing offset-naive and offset-aware datetimes
            # when calculating alarm timestamp
            event.timestamp = event.timestamp.replace(tzinfo=None)
            await self.dispose_event(event)
        except Exception:
            metrics["alarm_dispose_error"] += 1
            error_report()
        finally:
            if self.topo_rca_lock:
                # Release pending RCA Lock
                await self.topo_rca_lock.release()
                self.topo_rca_lock = None

    async def dispose_event(self, e):
        """
        Dispose event according to disposition rule
        """
        event_id = str(e.id)
        self.logger.info("[%s] Disposing", event_id)
        drc = self.rules.get(e.event_class.id)
        if not drc:
            self.logger.info(
                "[%s] No disposition rules for class %s, skipping",
                event_id, e.event_class.name)
            return
        # Apply disposition rules
        for r in drc:
            if self.eval_expression(r.condition, event=e):
                # Process action
                if r.action == "drop":
                    self.logger.info("[%s] Dropped by action", event_id)
                    e.delete()
                    return
                elif r.action == "ignore":
                    self.logger.info("[%s] Ignored by action", event_id)
                    return
                elif r.action == "raise" and r.combo_condition == "none":
                    await self.raise_alarm(r, e)
                elif r.action == "clear" and r.combo_condition == "none":
                    self.clear_alarm(r, e)
                if r.action in ("raise", "clear"):
                    # Write discriminator if can trigger delayed event
                    if r.unique and r.event_class.id in self.back_rules:
                        discriminator, vars = r.get_vars(e)
                        e.discriminator = discriminator
                        e.save()
                    # Process delayed combo conditions
                    if e.event_class.id in self.back_rules:
                        for br in self.back_rules[e.event_class.id]:
                            de = self.get_delayed_event(br, e)
                            if de:
                                if br.action == "raise":
                                    await self.raise_alarm(br, de)
                                elif br.action == "clear":
                                    self.clear_alarm(br, de)
                if r.stop_disposition:
                    break
        self.logger.info("[%s] Disposition complete", event_id)

    async def topology_rca(self, alarm):
        """
        Topology-based RCA
        :param alarm:
        :return:
        """
        def can_correlate(a1, a2):
            """
            Check if alarms can be correlated together (within corellation window)
            :param a1:
            :param a2:
            :return:
            """
            return (not config.correlator.topology_rca_window
                    or (a1.timestamp - a2.timestamp).total_seconds()
                    <= config.correlator.topology_rca_window)

        def all_uplinks_failed(a1):
            """
            Check if all uplinks for alarm is failed
            :param a1:
            :return:
            """
            if not a1.uplinks:
                return False
            return sum(1 for mo in a1.uplinks if mo in neighbor_alarms) == len(a1.uplinks)

        def get_root(a1) -> Optional[ActiveAlarm]:
            """
            Get root cause for failed uplinks. Considering all uplinks are failed.
            Uplinks are ordered according to path length. Return first applicable
            :param a1:
            :return:
            """
            for u in a1.uplinks:
                na = neighbor_alarms[u]
                if can_correlate(a1, na):
                    return na
            return None

        def get_neighboring_alarms(ca: ActiveAlarm) -> Dict[int, ActiveAlarm]:
            """
            Map managed object id -> active alarm of the same class among
            RCA neighbors of *ca*, including *ca* itself.
            """
            r = {
                na.managed_object.id: na
                for na in ActiveAlarm.objects.filter(
                    alarm_class=ca.alarm_class.id,
                    rca_neighbors__in=[ca.managed_object.id])
            }
            # Add current alarm to correlate downlink alarms properly
            # (fixed: use the helper's own parameter, not the enclosing
            # `alarm` — identical at the single call site)
            r[ca.managed_object.id] = ca
            return r

        def iter_downlink_alarms(a1):
            """
            Yield all downlink alarms
            :param a1:
            :return:
            """
            mo = a1.managed_object.id
            for na in neighbor_alarms.values():
                if na.uplinks and mo in na.uplinks:
                    yield na

        def correlate_uplinks(ca: ActiveAlarm) -> bool:
            """
            Correlate with uplink alarms if all uplinks are faulty.
            :param ca:
            :return:
            """
            if not all_uplinks_failed(ca):
                return False
            self.logger.info("[%s] All uplinks are faulty. Correlating", ca.id)
            ra = get_root(ca)
            if not ra:
                return False
            self.logger.info("[%s] Set root to %s", ca.id, ra.id)
            ca.set_root(ra, rca_type=RCA_TOPOLOGY)
            metrics["alarm_correlated_topology"] += 1
            return True

        def correlate_merge_downlinks(ca: ActiveAlarm) -> bool:
            """
            Donwlink merge correlation
            :param ca:
            :return:
            """
            if not ca.uplinks or not ca.rca_neighbors:
                return False
            dlm_neighbors = {
                mo: w
                for mo, w in zip(ca.rca_neighbors, ca.dlm_windows) if w > 0
            }
            dlm_candidates = set(neighbor_alarms) & set(dlm_neighbors)
            if not dlm_candidates:
                return False
            # Get possible candidates
            t0 = ca.timestamp
            candidates = list(
                sorted(
                    (neighbor_alarms[mo] for mo in dlm_candidates
                     if (t0 - neighbor_alarms[mo].timestamp
                         ).total_seconds() <= dlm_neighbors[mo]),
                    key=operator.attrgetter("timestamp"),
                ))
            if not candidates:
                return False
            ra = candidates[0]
            self.logger.info("[%s] Set root to %s (downlink merge)", ca.id, ra.id)
            ca.set_root(ra, rca_type=RCA_DOWNLINK_MERGE)
            metrics["alarm_correlated_topology"] += 1
            return True

        self.logger.debug("[%s] Topology RCA", alarm.id)
        # Acquire lock
        if self.is_distributed:
            # Set lock until the end of dispose
            mo = alarm.managed_object
            self.topo_rca_lock = RCALock(mo.data.rca_neighbors + [mo.id])
            await self.topo_rca_lock.acquire()
        # Get neighboring alarms
        neighbor_alarms = get_neighboring_alarms(alarm)
        # Correlate current alarm
        correlate_uplinks(alarm) or correlate_merge_downlinks(alarm)
        # Correlate all downlink alarms
        for a in iter_downlink_alarms(alarm):
            correlate_uplinks(a)
        self.logger.debug("[%s] Correlation completed", alarm.id)