Example #1
File: service.py Project: nbashev/noc
 async def on_activate(self):
     self.slot_number, self.total_slots = await self.acquire_slot()
     self.is_distributed = self.total_slots > 1
     # Prepare scheduler
     if self.is_distributed:
         self.logger.info("Enabling distributed mode: Slot %d/%d",
                          self.slot_number, self.total_slots)
         ifilter = {"shard": {"$mod": [self.total_slots, self.slot_number]}}
     else:
         self.logger.info("Enabling standalone mode")
         ifilter = None
     self.scheduler = Scheduler(
         self.name,
         pool=config.pool,
         reset_running=True,
         max_threads=config.correlator.max_threads,
         # @fixme have to be configured ?
         submit_threshold=100,
         max_chunk=100,
         filter=ifilter,
     )
     self.scheduler.correlator = self
     self.scheduler.run()
     # Subscribe to the stream; moved to a separate task so that on_activate can return
     self.loop.create_task(
         self.subscribe_stream("dispose.%s" % config.pool, self.slot_number,
                               self.on_dispose_event))
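A note on the shard filter above (a minimal sketch, assuming integer "shard" values): MongoDB's {"$mod": [divisor, remainder]} operator matches documents whose field value modulo the divisor equals the remainder, so each slot only claims the jobs that map to its own slot number.

def owned_by_slot(shard, slot_number, total_slots):
    # Mirrors the {"shard": {"$mod": [total_slots, slot_number]}} filter
    return shard % total_slots == slot_number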
Example #2
    def get_job_avg():
        """
        Calculate the average execution time of discovery jobs per pool
        :return: mapping of pool -> {job type -> average duration}
        """
        job_map = {
            "noc.services.discovery.jobs.periodic.job.PeriodicDiscoveryJob":
            "periodic",
            "noc.services.discovery.jobs.box.job.BoxDiscoveryJob": "box",
        }
        r = {}
        match = {"ldur": {"$gte": 0.05}}  # Exclude runs where ping failed and discovery was skipped

        for pool in Pool.objects.filter():
            coll = Scheduler("discovery", pool=pool.name).get_collection()
            r[pool] = {
                job_map[c["_id"]]: c["avg_dur"]
                for c in coll.aggregate([{
                    "$match": match
                }, {
                    "$group": {
                        "_id": "$jcls",
                        "avg_dur": {
                            "$avg": "$ldur"
                        }
                    }
                }]) if c["_id"]
            }
        return r
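A minimal usage sketch (the printout below is illustrative, assuming get_job_avg is callable in the current scope): the result maps each Pool to its average job durations keyed by job type.

r = get_job_avg()
for pool, avgs in r.items():
    # avgs looks like {"box": ..., "periodic": ...} -- averages of the "ldur" field
    print(pool.name, avgs.get("box"), avgs.get("periodic"))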
Example #3
File: service.py Project: skripkar/noc
 def on_activate(self):
     self.scheduler = Scheduler(
         "scheduler",
         reset_running=True,
         max_threads=config.scheduler.max_threads,
         ioloop=self.ioloop
     )
     self.scheduler.run()
Example #4
 def scheduler(s):
     scheduler, pool = "scheduler", "default"
     if "." in s:
         scheduler, pool = s.split(".")
     if scheduler in SHARDING_SCHEDULER:
         # raise argparse.ArgumentTypeError("Scheduler: %s, not supporting sharding")
         return Scheduler(scheduler, pool=pool).get_collection()
     return Scheduler(scheduler).get_collection()
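A minimal sketch of how such a converter is wired into argparse (the add_arguments examples further below reference it as self.scheduler; here it is shown as a standalone function): argparse calls it with the raw option string, so "--scheduler discovery.default" resolves straight to that scheduler's MongoDB collection.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--scheduler",
    "-s",
    dest="scheduler",
    type=scheduler,  # the converter defined above
    help="Select scheduler. For sharded use SCHEDULER_NAME.SHARD_NAME",
)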
Example #5
 def get_data(self, **kwargs):
     old = datetime.datetime.now() - datetime.timedelta(
         minutes=self.STALE_INTERVAL)
     data = []
     for pool in Pool._get_collection().find({}, {"_id": 0, "name": 1}):
         scheduler = Scheduler("discovery", pool=pool["name"])
         for r in scheduler.get_collection().find({
                 "runs": {
                     "$gt": 1
                 },
                 "jcls": {
                     "$regex": "_discovery$"
                 },
                 "st": {
                     "$lte": old
                 }
         }):
             mo = ManagedObject.get_by_id(r["key"])
             if not mo or not mo.is_managed:
                 continue
             msg = ""
             if r["tb"]:
                 tb = r["tb"]
                 if "text" in tb and "code" in tb:
                     if tb["text"].endswith("END OF TRACEBACK"):
                         tb["text"] = "Job crashed"
                     msg = "(%s) %s" % (tb["text"], tb["code"])
             data += [[
                 mo.administrative_domain.name,
                 mo.name,
                 mo.profile.name,
                 mo.platform.name,
                 mo.version.name,
                 mo.address,
                 mo.segment.name,
                 r["jcls"],
                 humanize_distance(r["st"]),
                 msg,
             ]]
     return self.from_dataset(
         title=self.title,
         columns=[
             _("Admin. Domain"),
             _("Object"),
             _("Profile"),
             _("Platform"),
             _("Version"),
             _("Address"),
             _("Segment"),
             _("Job"),
             _("Last Success"),
             _("Reason"),
         ],
         data=sorted(data),
         enumerate=True,
     )
Example #6
 def count(self):
     scheduler = Scheduler(self.scheduler, pool=self.pool).get_collection()
     find = {
         Job.ATTR_KEY: {
             "$in": list(self.mos_filter.values_list("id", flat=True))
         }
     }
     if self.pipeline and "$match" in self.pipeline[0]:
         find.update(self.pipeline[0]["$match"])
     return scheduler.count(find)
Example #7
class SchedulerService(TornadoService):
    name = "scheduler"
    leader_lock_name = "scheduler"
    use_mongo = True

    async def on_activate(self):
        self.scheduler = Scheduler("scheduler",
                                   reset_running=True,
                                   max_threads=config.scheduler.max_threads)
        self.scheduler.run()
Example #8
 def __iter__(self):
     mos_ids = list(self.mos_filter.values_list("id", flat=True))
     self.pipeline = [{
         "$match": {
             Job.ATTR_KEY: {
                 "$in": mos_ids
             }
         }
     }] + self.pipeline
     scheduler = Scheduler(self.scheduler, pool=self.pool).get_collection()
     yield from scheduler.aggregate(self.pipeline)
Example #9
File: service.py Project: skripkar/noc
class SchedulerService(Service):
    name = "scheduler"
    leader_group_name = "scheduler"
    leader_lock_name = "scheduler"

    @tornado.gen.coroutine
    def on_activate(self):
        self.scheduler = Scheduler(
            "scheduler",
            reset_running=True,
            max_threads=config.scheduler.max_threads,
            ioloop=self.ioloop
        )
        self.scheduler.run()
Example #10
def fix():
    for p in Pool.objects.all():
        s = Scheduler("discovery", pool=p.name)
        c = s.get_collection()
        if not c.count_documents({}):
            continue
        # Remove unused schedule fields
        c.update_many({
            "jcls": "noc.services.discovery.jobs.periodic.job.PeriodicDiscoveryJob"
        }, {
            "$unset": {
                "ctx": "",
                "ctv": ""
            }
        })
        c.database.command({"compact": c.name, "force": True})
Example #11
 def on_activate(self):
     self.scheduler = Scheduler(
         self.name,
         pool=config.pool,
         reset_running=True,
         max_threads=config.correlator.max_threads,
         ioloop=self.ioloop,
         # @fixme have to be configured ?
         submit_threshold=100,
         max_chunk=100)
     self.scheduler.correlator = self
     self.subscribe("correlator.dispose.%s" % config.pool,
                    "dispose",
                    self.on_dispose_event,
                    max_in_flight=config.correlator.max_threads)
     self.scheduler.run()
Example #12
def call_later(
    name: str,
    delay: Optional[float] = None,
    scheduler: str = "scheduler",
    pool: Optional[str] = None,
    job_class: str = DEFAULT_JOB_CLASS,
    shard: Optional[int] = None,
    max_runs: Optional[int] = None,
    **kwargs,
):
    """
    Schedule to run callable *name* in scheduler process
    :param name: Full callable name
    :param delay: delay in seconds
    :param scheduler: Name of scheduler
    :param pool: Pool name
    :param job_class: Job class
    :param shard: Sharding key
    :param max_runs: Maximum amount of retries
    """
    scheduler = Scheduler(scheduler, pool=pool)
    data = kwargs or {}
    ts = datetime.datetime.now()
    if delay:
        ts += datetime.timedelta(seconds=delay)
    # Process sharding
    if shard is None:
        shard = dict_hash_int_args(job_class=job_class,
                                   name=name,
                                   pool=pool,
                                   **kwargs)
    shard = (shard if shard >= 0 else -shard) % 0x7FFFFFFF
    #
    set_op = {Job.ATTR_TS: ts}
    iset_op = {
        Job.ATTR_STATUS: Job.S_WAIT,
        Job.ATTR_RUNS: 0,
        Job.ATTR_FAULTS: 0,
        Job.ATTR_OFFSET: 0,
        Job.ATTR_SHARD: shard,
    }
    if max_runs:
        iset_op[Job.ATTR_MAX_RUNS] = max_runs
    if data:
        set_op[Job.ATTR_DATA] = {
            k: v
            for k, v in data.items() if not k.startswith("_")
        }

    q = {Job.ATTR_CLASS: job_class, Job.ATTR_KEY: name}
    for k in list(data):
        if k.startswith("_"):
            # Hidden job attribute: match it directly, not under the data field
            q[k] = data[k]
            continue
        q["%s.%s" % (Job.ATTR_DATA, k)] = data[k]
    op = {"$set": set_op, "$setOnInsert": iset_op}
    logger.info("Delayed call to %s(%s) in %ss", name, data, delay or "0")
    logger.debug("update(%s, %s, upsert=True)", q, op)
    scheduler.get_collection().update_one(q, op, upsert=True)
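A minimal usage sketch (the callable path and arguments are hypothetical): because the insert is an upsert keyed on the job class, the callable name and the data arguments, repeated calls with identical parameters collapse into a single pending job.

call_later(
    "noc.custom.handlers.touch",  # hypothetical full dotted path to the callable
    delay=60,                     # run roughly one minute from now
    max_runs=3,                   # retry at most three times
    obj_id=42,                    # extra kwargs become job data
)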
Example #13
 def on_activate(self):
     self.slot_number, self.total_slots = yield self.acquire_slot()
     if self.total_slots > 1:
         self.logger.info("Enabling distributed mode: Slot %d/%d",
                          self.slot_number, self.total_slots)
         ifilter = {"key": {"$mod": [self.total_slots, self.slot_number]}}
     else:
         self.logger.info("Enabling standalone mode")
         ifilter = None
     self.scheduler = Scheduler("discovery",
                                pool=config.pool,
                                reset_running=True,
                                max_threads=config.discovery.max_threads,
                                ioloop=self.ioloop,
                                filter=ifilter,
                                service=self,
                                sample=config.discovery.sample)
     self.scheduler.run()
Example #14
 def run_job(self, job, mo, checks):
     if job == "segment":
         scheduler = Scheduler("scheduler",
                               pool=None,
                               service=ServiceStub())
     else:
         scheduler = Scheduler("discovery",
                               pool=mo.pool.name,
                               service=ServiceStub())
     jcls = self.jcls[job]
     # Try to dereference job
     job_args = scheduler.get_collection().find_one({
         Job.ATTR_CLASS: jcls,
         Job.ATTR_KEY: mo.id
     })
     if job_args:
         self.print("Job ID: %s" % job_args["_id"])
     else:
         job_args = {Job.ATTR_ID: "fakeid", Job.ATTR_KEY: mo.id}
     job_args["_checks"] = checks
     job = get_handler(jcls)(scheduler, job_args)
     if job.context_version:
         ctx_key = job.get_context_cache_key()
         self.print("Loading job context from %s" % ctx_key)
         ctx = cache.get(ctx_key, version=job.context_version)
         if not ctx:
             self.print("Job context is empty")
         job.load_context(ctx)
     sample = 1 if self.trace else 0
     with Span(sample=sample):
         job.dereference()
         job.handler()
     if sample:
         spans = get_spans()
         self.print("Spans:")
         self.print("\n".join(str(s) for s in spans))
     if scheduler.service.metrics:
         self.print("Collected CH data:")
         for t in scheduler.service.metrics:
             self.print("Table: %s" % t)
             self.print("\n".join(
                 str(x) for x in scheduler.service.metrics[t]))
     if job.context_version and job.context:
         self.print("Saving job context to %s" % ctx_key)
         scheduler.cache_set(key=ctx_key,
                             value=job.context,
                             version=job.context_version)
         scheduler.apply_cache_ops()
         time.sleep(3)
Example #15
def run_job(jcls, mo, checks):
    scheduler = Scheduler("discovery", pool=mo.pool.name, service=get_service(mo.pool))
    job_args = {Job.ATTR_ID: "fakeid", Job.ATTR_KEY: mo.id, "_checks": checks}
    job = jcls(scheduler, job_args)
    # Inject beef managed object
    job.dereference()
    job.object = mo
    # Run job
    job.handler()
    assert not job.problems
Example #16
def call_later(name, delay=None, scheduler="scheduler", pool=None,
               max_runs=None, **kwargs):
    """
    Run callable *name* in scheduler process
    :param name: Full callable name
    :param delay: delay in seconds
    :param scheduler: Name of scheduler
    :param pool: Pool name
    :param max_runs: Maximum amount of retries
    """
    scheduler = Scheduler(scheduler, pool=pool)
    data = kwargs or {}
    ts = datetime.datetime.now()
    if delay:
        ts += datetime.timedelta(seconds=delay)
    set_op = {
        Job.ATTR_TS: ts
    }
    iset_op = {
        Job.ATTR_STATUS: Job.S_WAIT,
        Job.ATTR_RUNS: 0,
        Job.ATTR_FAULTS: 0,
        Job.ATTR_OFFSET: 0
    }
    if max_runs:
        iset_op[Job.ATTR_MAX_RUNS] = max_runs
    if data:
        set_op[Job.ATTR_DATA] = data

    q = {
        Job.ATTR_CLASS: "noc.core.scheduler.calljob.CallJob",
        Job.ATTR_KEY: name
    }
    for k in data:
        q["%s.%s" % (Job.ATTR_DATA, k)] = data[k]
    op = {
        "$set": set_op,
        "$setOnInsert": iset_op
    }
    logger.info("Delayed call to %s(%s) in %ss", name, data, delay or "0")
    logger.debug("update(%s, %s, upsert=True)", q, op)
    scheduler.get_collection().update(q, op, upsert=True)
Example #17
class DiscoveryService(Service):
    name = "discovery"
    leader_group_name = "discovery-%(pool)s"
    pooled = True
    require_nsq_writer = True
    use_mongo = True
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super(DiscoveryService, self).__init__()
        self.send_callback = None
        self.slot_number = 0
        self.total_slots = 0

    @tornado.gen.coroutine
    def on_activate(self):
        self.slot_number, self.total_slots = yield self.acquire_slot()
        if self.total_slots > 1:
            self.logger.info("Enabling distributed mode: Slot %d/%d",
                             self.slot_number, self.total_slots)
            ifilter = {"key": {"$mod": [self.total_slots, self.slot_number]}}
        else:
            self.logger.info("Enabling standalone mode")
            ifilter = None
        self.scheduler = Scheduler(
            "discovery",
            pool=config.pool,
            reset_running=True,
            max_threads=config.discovery.max_threads,
            ioloop=self.ioloop,
            filter=ifilter,
            service=self,
            sample=config.discovery.sample,
        )
        self.scheduler.run()

    def get_mon_data(self):
        r = super(DiscoveryService, self).get_mon_data()
        if self.scheduler:
            self.scheduler.apply_metrics(r)
        return r
Example #18
 def add_arguments(self, parser):
     parser.add_argument(
         "--scheduler",
         "-s",
         dest="scheduler",
         default=Scheduler("scheduler").get_collection(),
         type=self.scheduler,
         help="Select scheduler. For sharded use SCHEDULER_NAME.SHARD_NAME"
     ),
     parser.add_argument(
         "--format",
         "-f",
         dest="store",
         # action="format",
         choices=["json", "csv"],
         help="Set output format"),
     subparsers = parser.add_subparsers(dest="cmd")
     # load command
     list_parser = subparsers.add_parser("list")
     list_parser.add_argument("--name", help="Job name in scheduler")
     list_parser.add_argument("key",
                              nargs=argparse.REMAINDER,
                              help="List of job key")
     get_parser = subparsers.add_parser("get")
     get_parser.add_argument("--id", help="Job name in scheduler")
     subparsers.add_parser("set")
     # Parse Job Field
     reschedule = subparsers.add_parser("reschedule",
                                        help="Shift Jobs to interval")
     reschedule.add_argument("--name", help="Job name in scheduler")
     reschedule.add_argument("--start",
                             type=self.valid_date,
                             help="Start interval for place")
     reschedule.add_argument("--end",
                             type=self.valid_date,
                             help="End interval for place")
     reschedule.add_argument("--force",
                             default=False,
                             action="store_true",
                             help="Really do reschedule")
     reschedule.add_argument("key",
                             nargs=argparse.REMAINDER,
                             help="List of job key")
     parser.add_argument('infile',
                         nargs='?',
                         type=argparse.FileType('r'),
                         default=sys.stdin)
Example #19
File: service.py Project: skripkar/noc
 def apply_shards(self):
     # Get shards settings
     shard_threads = defaultdict(int)
     shard_threads[DEFAULT_TTSYSTEM_SHARD] = config.escalator.max_threads
     for s in TTSystem.objects.all():
         if not s.is_active:
             continue
         shard_threads[s.shard_name] += s.max_threads
     # Run shard schedulers
     for sn in shard_threads:
         self.logger.info("Running shard %s (%d threads)", sn,
                          shard_threads[sn])
         self.shards[sn] = Scheduler("escalator",
                                     pool=sn,
                                     reset_running=True,
                                     max_threads=shard_threads[sn],
                                     ioloop=self.ioloop,
                                     service=self,
                                     sample=config.escalator.sample)
         self.shards[sn].run()
Example #20
 def add_arguments(self, parser):
     parser.add_argument(
         "--scheduler",
         "-s",
         dest="scheduler",
         default=Scheduler("scheduler").get_collection(),
         type=self.scheduler,
         help="Select scheduler. For sharded use SCHEDULER_NAME.SHARD_NAME",
     ),
     parser.add_argument(
         "--format",
         "-f",
         dest="store",
         # action="format",
         choices=["json", "csv"],
         help="Set output format",
     ),
     subparsers = parser.add_subparsers(dest="cmd")
     # load command
     list_parser = subparsers.add_parser("list")
     list_parser.add_argument("--name", help="Job name in scheduler")
     list_parser.add_argument("key",
                              nargs=argparse.REMAINDER,
                              help="List of job key")
     get_parser = subparsers.add_parser("get")
     get_parser.add_argument("--id", help="Job name in scheduler")
     subparsers.add_parser("set")
     estimate = subparsers.add_parser("estimate")
     estimate.add_argument("--device-count",
                           type=int,
                           default=0,
                           help="Device count")
     estimate.add_argument("--box-interval",
                           type=int,
                           default=65400,
                           help="Box discovery interval (in seconds)")
     estimate.add_argument(
         "--periodic-interval",
         type=int,
         default=300,
         help="Periodic discovery interval (in seconds)",
     )
     # Parse Job Field
     reschedule = subparsers.add_parser("reschedule",
                                        help="Shift Jobs to interval")
     reschedule.add_argument("--name", help="Job name in scheduler")
     reschedule.add_argument("--start",
                             type=self.valid_date,
                             help="Start interval for place")
     reschedule.add_argument("--end",
                             type=self.valid_date,
                             help="End interval for place")
     reschedule.add_argument("--force",
                             default=False,
                             action="store_true",
                             help="Really do reschedule")
     reschedule.add_argument("key",
                             nargs=argparse.REMAINDER,
                             help="List of job key")
     parser.add_argument("infile",
                         nargs="?",
                         type=argparse.FileType("r"),
                         default=sys.stdin)
Example #21
File: service.py Project: nbashev/noc
class CorrelatorService(TornadoService):
    name = "correlator"
    pooled = True
    use_mongo = True
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super().__init__()
        self.version = version.version
        self.rules = {}  # event_class -> [Rule]
        self.back_rules = {}  # event_class -> [Rule]
        self.triggers = {}  # alarm_class -> [Trigger1, .. , TriggerN]
        self.rca_forward = {}  # alarm_class -> [RCA condition, ..., RCA condition]
        self.rca_reverse = defaultdict(set)  # alarm_class -> set([alarm_class])
        #
        self.slot_number = 0
        self.total_slots = 0
        self.is_distributed = False
        # Scheduler
        self.scheduler: Optional[Scheduler] = None
        # Locks
        self.topo_rca_lock: Optional[RCALock] = None

    async def on_activate(self):
        self.slot_number, self.total_slots = await self.acquire_slot()
        self.is_distributed = self.total_slots > 1
        # Prepare scheduler
        if self.is_distributed:
            self.logger.info("Enabling distributed mode: Slot %d/%d",
                             self.slot_number, self.total_slots)
            ifilter = {"shard": {"$mod": [self.total_slots, self.slot_number]}}
        else:
            self.logger.info("Enabling standalone mode")
            ifilter = None
        self.scheduler = Scheduler(
            self.name,
            pool=config.pool,
            reset_running=True,
            max_threads=config.correlator.max_threads,
            # @fixme have to be configured ?
            submit_threshold=100,
            max_chunk=100,
            filter=ifilter,
        )
        self.scheduler.correlator = self
        self.scheduler.run()
        # Subscribe to the stream; moved to a separate task so that on_activate can return
        self.loop.create_task(
            self.subscribe_stream("dispose.%s" % config.pool, self.slot_number,
                                  self.on_dispose_event))

    def on_start(self):
        """
        Load rules from database just after loading config
        """
        super().on_start()
        connect()  # use_mongo connects only after on_start, so connect explicitly here
        self.load_rules()
        self.load_triggers()
        self.load_rca_rules()

    def load_rules(self):
        """
        Load rules from database
        """
        self.logger.debug("Loading rules")
        self.rules = {}
        self.back_rules = {}
        nr = 0
        nbr = 0
        for c in EventClass.objects.all():
            if c.disposition:
                r = []
                for dr in c.disposition:
                    rule = Rule(c, dr)
                    r += [rule]
                    nr += 1
                    if dr.combo_condition != "none" and dr.combo_window:
                        for cc in dr.combo_event_classes:
                            try:
                                self.back_rules[cc.id] += [dr]
                            except KeyError:
                                self.back_rules[cc.id] = [dr]
                            nbr += 1
                self.rules[c.id] = r
        self.logger.debug("%d rules are loaded. %d combos" % (nr, nbr))

    def load_triggers(self):
        self.logger.info("Loading triggers")
        self.triggers = {}
        n = 0
        cn = 0
        ec = [(c.name, c.id) for c in AlarmClass.objects.all()]
        for t in AlarmTrigger.objects.filter(is_enabled=True):
            self.logger.debug("Trigger '%s' for classes:" % t.name)
            for c_name, c_id in ec:
                if re.search(t.alarm_class_re, c_name, re.IGNORECASE):
                    try:
                        self.triggers[c_id] += [Trigger(t)]
                    except KeyError:
                        self.triggers[c_id] = [Trigger(t)]
                    cn += 1
                    self.logger.debug("    %s" % c_name)
            n += 1
        self.logger.info("%d triggers has been loaded to %d classes" % (n, cn))

    def load_rca_rules(self):
        """
        Load root cause analysis rules
        """
        self.logger.info("Loading RCA Rules")
        n = 0
        self.rca_forward = {}
        self.rca_reverse = {}
        for a in AlarmClass.objects.filter(root_cause__0__exists=True):
            if not a.root_cause:
                continue
            self.rca_forward[a.id] = []
            for c in a.root_cause:
                rc = RCACondition(a, c)
                self.rca_forward[a.id] += [rc]
                if rc.root.id not in self.rca_reverse:
                    self.rca_reverse[rc.root.id] = []
                self.rca_reverse[rc.root.id] += [rc]
                n += 1
        self.logger.info("%d RCA Rules have been loaded" % n)

    def mark_as_failed(self, event):
        """
        Write error log and mark event as failed
        """
        self.logger.error("Failed to process event %s" % str(event.id))
        # Prepare traceback
        t, v, tb = sys.exc_info()
        now = datetime.datetime.now()
        r = ["UNHANDLED EXCEPTION (%s)" % str(now)]
        r += [str(t), str(v)]
        r += [format_frames(get_traceback_frames(tb))]
        r = "\n".join(r)
        event.mark_as_failed(version=self.version, traceback=r)

    def set_root_cause(self, a):
        """
        Search for root cause and set, if found
        :returns: Boolean. True, if root cause set
        """
        for rc in self.rca_forward[a.alarm_class.id]:
            # Check condition
            if not rc.check_condition(a):
                continue
            # Check match condition
            q = rc.get_match_condition(a)
            root = ActiveAlarm.objects.filter(**q).first()
            if root:
                # Root cause found
                self.logger.info("%s is root cause for %s (Rule: %s)", root.id,
                                 a.id, rc.name)
                metrics["alarm_correlated_rule"] += 1
                a.set_root(root, rca_type=RCA_RULE)
                return True
        return False

    def set_reverse_root_cause(self, a):
        """
        Set *a* as root cause for existing alarms
        :param a:
        :return:
        """
        found = False
        for rc in self.rca_reverse[a.alarm_class.id]:
            # Check reverse match condition
            q = rc.get_reverse_match_condition(a)
            for ca in ActiveAlarm.objects.filter(**q):
                # Check condition
                if not rc.check_condition(ca):
                    continue
                # Try to set root cause
                qq = rc.get_match_condition(ca, id=a.id)
                rr = ActiveAlarm.objects.filter(**qq).first()
                if rr:
                    # Reverse root cause found
                    self.logger.info(
                        "%s is root cause for %s (Reverse rule: %s)", a.id,
                        ca.id, rc.name)
                    metrics["alarm_correlated_rule"] += 1
                    ca.set_root(a, rca_type=RCA_RULE)
                    found = True
        return found

    async def raise_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info("Empty managed object, ignoring")
            return
        # @todo: Make configurable
        if not managed_object.is_managed:
            self.logger.info(
                "Managed object is not managed. Do not raise alarm")
            return
        if e.managed_object.id != managed_object.id:
            metrics["alarm_change_mo"] += 1
            self.logger.info("Changing managed object to %s",
                             managed_object.name)
        discriminator, vars = r.get_vars(e)
        if r.unique:
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if not a:
                # Try to reopen alarm
                a = ArchivedAlarm.objects.filter(
                    managed_object=managed_object.id,
                    discriminator=discriminator,
                    control_time__gte=e.timestamp,
                ).first()
                if a:
                    # Reopen alarm
                    self.logger.info(
                        "[%s|%s|%s] %s reopens alarm %s(%s)",
                        e.id,
                        managed_object.name,
                        managed_object.address,
                        e.event_class.name,
                        a.alarm_class.name,
                        a.id,
                    )
                    a = a.reopen("Reopened by disposition rule '%s'" %
                                 r.u_name)
                    metrics["alarm_reopen"] += 1
            if a:
                # Active alarm found, refresh
                self.logger.info(
                    "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                # Contribute event to alarm
                e.contribute_to_alarm(a)
                if e.timestamp < a.timestamp:
                    # Set to earlier date
                    a.timestamp = e.timestamp
                    a.save()
                elif e.timestamp > a.last_update:
                    # Refresh last update
                    a.last_update = e.timestamp
                    a.save()
                metrics["alarm_contribute"] += 1
                return
        # Calculate alarm coverage
        summary = ServiceSummary.get_object_summary(managed_object)
        summary["object"] = {managed_object.object_profile.id: 1}
        #
        severity = max(ServiceSummary.get_severity(summary), 1)
        self.logger.info(
            "[%s|%s|%s] %s: Calculated alarm severity is: %s",
            e.id,
            managed_object.name,
            managed_object.address,
            r.u_name,
            severity,
        )
        # Create new alarm
        direct_services = SummaryItem.dict_to_items(summary["service"])
        direct_subscribers = SummaryItem.dict_to_items(summary["subscriber"])
        a = ActiveAlarm(
            timestamp=e.timestamp,
            last_update=e.timestamp,
            managed_object=managed_object.id,
            alarm_class=r.alarm_class,
            severity=severity,
            vars=vars,
            discriminator=discriminator,
            direct_services=direct_services,
            direct_subscribers=direct_subscribers,
            total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
            total_services=direct_services,
            total_subscribers=direct_subscribers,
            log=[
                AlarmLog(
                    timestamp=datetime.datetime.now(),
                    from_status="A",
                    to_status="A",
                    message="Alarm risen from event %s(%s) by rule '%s'" %
                    (str(e.id), str(e.event_class.name), r.u_name),
                )
            ],
            opening_event=e.id,
        )
        a.save()
        e.contribute_to_alarm(a)
        self.logger.info(
            "[%s|%s|%s] %s raises alarm %s(%s): %r",
            e.id,
            managed_object.name,
            managed_object.address,
            e.event_class.name,
            a.alarm_class.name,
            a.id,
            a.vars,
        )
        metrics["alarm_raise"] += 1
        await self.correlate(r, a)
        # Notify about new alarm
        if not a.root:
            a.managed_object.event(
                a.managed_object.EV_ALARM_RISEN,
                {
                    "alarm": a,
                    "subject": a.subject,
                    "body": a.body,
                    "symptoms": a.alarm_class.symptoms,
                    "recommended_actions": a.alarm_class.recommended_actions,
                    "probable_causes": a.alarm_class.probable_causes,
                },
                delay=a.alarm_class.get_notification_delay(),
            )
        # Gather diagnostics when necessary
        AlarmDiagnosticConfig.on_raise(a)
        # Watch for escalations, when necessary
        if config.correlator.auto_escalation and not a.root:
            AlarmEscalation.watch_escalations(a)

    async def correlate(self, r, a):
        # Topology RCA
        if a.alarm_class.topology_rca:
            await self.topology_rca(a)
        # Rule-based RCA
        if a.alarm_class.id in self.rca_forward:
            # Check alarm is a consequence of existing one
            self.set_root_cause(a)
        if a.alarm_class.id in self.rca_reverse:
            # Check alarm is the root cause for existing ones
            self.set_reverse_root_cause(a)
        # Call handlers
        for h in a.alarm_class.get_handlers():
            try:
                has_root = bool(a.root)
                h(a)
                if not has_root and a.root:
                    self.logger.info(
                        "[%s|%s|%s] Set root to %s (handler %s)",
                        a.id,
                        a.managed_object.name,
                        a.managed_object.address,
                        a.root,
                        h,
                    )
            except Exception:  # noqa -- anything can happen inside a handler
                error_report()
                metrics["error", ("type", "alarm_handler")] += 1
        # Call triggers if necessary
        if r.alarm_class.id in self.triggers:
            for t in self.triggers[r.alarm_class.id]:
                try:
                    t.call(a)
                except:  # noqa -- anything can happen inside a trigger
                    error_report()
        #
        if not a.severity:
            # Alarm severity has been reset to 0 by handlers
            # Silently drop alarm
            self.logger.debug("Alarm severity is 0, dropping")
            a.delete()
            metrics["alarm_drop"] += 1
            return

    def clear_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info(
                "[%s|Unknown|Unknown] Referred to unknown managed object, ignoring",
                e.id)
            metrics["unknown_object"] += 1
            return
        if r.unique:
            discriminator, vars = r.get_vars(e)
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if a:
                self.logger.info(
                    "[%s|%s|%s] %s clears alarm %s(%s)",
                    e.id,
                    managed_object.name,
                    managed_object.address,
                    e.event_class.name,
                    a.alarm_class.name,
                    a.id,
                )
                e.contribute_to_alarm(a)
                a.closing_event = e.id
                a.last_update = max(a.last_update, e.timestamp)
                a.clear_alarm("Cleared by disposition rule '%s'" % r.u_name,
                              ts=e.timestamp)
                metrics["alarm_clear"] += 1

    def get_delayed_event(self, r, e):
        """
        Check whether all delayed conditions are met

        :param r: Delayed rule
        :param e: Event which can trigger delayed rule
        """
        # @todo: Rewrite to scheduler
        discriminator, vars = r.get_vars(e)
        ws = e.timestamp - datetime.timedelta(seconds=r.combo_window)
        de = ActiveEvent.objects.filter(
            managed_object=e.managed_object_id,
            event_class=r.event_class,
            discriminator=discriminator,
            timestamp__gte=ws,
        ).first()
        if not de:
            # No starting event
            return None
        # Probable starting event found, get all interesting following event
        # classes
        fe = [
            ee.event_class.id for ee in ActiveEvent.objects.filter(
                managed_object=e.managed_object_id,
                event_class__in=r.combo_event_classes,
                discriminator=discriminator,
                timestamp__gte=ws,
            ).order_by("timestamp")
        ]
        if r.combo_condition == "sequence":
            # Exact match
            if fe == self.combo_event_classes:
                return de
        elif r.combo_condition == "all":
            # All present
            if not any([c for c in r.combo_event_classes if c not in fe]):
                return de
        elif r.combo_condition == "any":
            # Any found
            if fe:
                return de
        return None

    def eval_expression(self, expression, **kwargs):
        """
        Evaluate expression in given context
        """
        env = {"re": re, "utils": utils}
        env.update(kwargs)
        return eval(expression, {}, env)

    async def on_dispose_event(self, msg: Message) -> None:
        """
        Called on new dispose message
        """
        data = orjson.loads(msg.value)
        event_id = data["event_id"]
        hint = data["event"]
        self.logger.info("[%s] Receiving message", event_id)
        metrics["alarm_dispose"] += 1
        try:
            event = ActiveEvent.from_json(hint)
            event.timestamp = event.timestamp.replace(tzinfo=None)
            await self.dispose_event(event)
        except Exception:
            metrics["alarm_dispose_error"] += 1
            error_report()
        finally:
            if self.topo_rca_lock:
                # Release pending RCA Lock
                await self.topo_rca_lock.release()
                self.topo_rca_lock = None

    async def dispose_event(self, e):
        """
        Dispose event according to disposition rule
        """
        event_id = str(e.id)
        self.logger.info("[%s] Disposing", event_id)
        drc = self.rules.get(e.event_class.id)
        if not drc:
            self.logger.info(
                "[%s] No disposition rules for class %s, skipping", event_id,
                e.event_class.name)
            return
        # Apply disposition rules
        for r in drc:
            if self.eval_expression(r.condition, event=e):
                # Process action
                if r.action == "drop":
                    self.logger.info("[%s] Dropped by action", event_id)
                    e.delete()
                    return
                elif r.action == "ignore":
                    self.logger.info("[%s] Ignored by action", event_id)
                    return
                elif r.action == "raise" and r.combo_condition == "none":
                    await self.raise_alarm(r, e)
                elif r.action == "clear" and r.combo_condition == "none":
                    self.clear_alarm(r, e)
                if r.action in ("raise", "clear"):
                    # Write discriminator if can trigger delayed event
                    if r.unique and r.event_class.id in self.back_rules:
                        discriminator, vars = r.get_vars(e)
                        e.discriminator = discriminator
                        e.save()
                    # Process delayed combo conditions
                    if e.event_class.id in self.back_rules:
                        for br in self.back_rules[e.event_class.id]:
                            de = self.get_delayed_event(br, e)
                            if de:
                                if br.action == "raise":
                                    await self.raise_alarm(br, de)
                                elif br.action == "clear":
                                    self.clear_alarm(br, de)
                if r.stop_disposition:
                    break
        self.logger.info("[%s] Disposition complete", event_id)

    async def topology_rca(self, alarm):
        """
        Topology-based RCA
        :param alarm:
        :return:
        """
        def can_correlate(a1, a2):
            """
            Check if alarms can be correlated together (within the correlation window)
            :param a1:
            :param a2:
            :return:
            """
            return (not config.correlator.topology_rca_window
                    or (a1.timestamp - a2.timestamp).total_seconds() <=
                    config.correlator.topology_rca_window)

        def all_uplinks_failed(a1):
            """
            Check if all uplinks for the alarm have failed
            :param a1:
            :return:
            """
            if not a1.uplinks:
                return False
            return sum(1 for mo in a1.uplinks
                       if mo in neighbor_alarms) == len(a1.uplinks)

        def get_root(a1) -> Optional[ActiveAlarm]:
            """
            Get the root cause among failed uplinks,
            assuming all uplinks have failed.
            Uplinks are ordered by path length;
            return the first applicable alarm.

            :param a1:
            :return:
            """
            for u in a1.uplinks:
                na = neighbor_alarms[u]
                if can_correlate(a1, na):
                    return na
            return None

        def get_neighboring_alarms(ca: ActiveAlarm) -> Dict[int, ActiveAlarm]:
            r = {
                na.managed_object.id: na
                for na in ActiveAlarm.objects.filter(
                    alarm_class=ca.alarm_class.id,
                    rca_neighbors__in=[ca.managed_object.id])
            }
            # Add current alarm to correlate downlink alarms properly
            r[alarm.managed_object.id] = ca
            return r

        def iter_downlink_alarms(a1):
            """
            Yield all downlink alarms
            :param a1:
            :return:
            """
            mo = a1.managed_object.id
            for na in neighbor_alarms.values():
                if na.uplinks and mo in na.uplinks:
                    yield na

        def correlate_uplinks(ca: ActiveAlarm) -> bool:
            """
            Correlate with uplink alarms if all uplinks are faulty.
            :param ca:
            :return:
            """
            if not all_uplinks_failed(ca):
                return False
            self.logger.info("[%s] All uplinks are faulty. Correlating", ca.id)
            ra = get_root(ca)
            if not ra:
                return False
            self.logger.info("[%s] Set root to %s", ca.id, ra.id)
            ca.set_root(ra, rca_type=RCA_TOPOLOGY)
            metrics["alarm_correlated_topology"] += 1
            return True

        def correlate_merge_downlinks(ca: ActiveAlarm) -> bool:
            """
            Downlink merge correlation
            :param ca:
            :return:
            """
            if not ca.uplinks or not ca.rca_neighbors:
                return False
            dlm_neighbors = {
                mo: w
                for mo, w in zip(ca.rca_neighbors, ca.dlm_windows) if w > 0
            }
            dlm_candidates = set(neighbor_alarms) & set(dlm_neighbors)
            if not dlm_candidates:
                return False
            # Get possible candidates
            t0 = ca.timestamp
            candidates = list(
                sorted(
                    (neighbor_alarms[mo] for mo in dlm_candidates
                     if (t0 - neighbor_alarms[mo].timestamp
                         ).total_seconds() <= dlm_neighbors[mo]),
                    key=operator.attrgetter("timestamp"),
                ))
            if not candidates:
                return False
            ra = candidates[0]
            self.logger.info("[%s] Set root to %s (downlink merge)", ca.id,
                             ra.id)
            ca.set_root(ra, rca_type=RCA_DOWNLINK_MERGE)
            metrics["alarm_correlated_topology"] += 1
            return True

        self.logger.debug("[%s] Topology RCA", alarm.id)
        # Acquire lock
        if self.is_distributed:
            # Set lock until the end of dispose
            mo = alarm.managed_object
            self.topo_rca_lock = RCALock(mo.data.rca_neighbors + [mo.id])
            await self.topo_rca_lock.acquire()
        # Get neighboring alarms
        neighbor_alarms = get_neighboring_alarms(alarm)
        # Correlate current alarm
        correlate_uplinks(alarm) or correlate_merge_downlinks(alarm)
        # Correlate all downlink alarms
        for a in iter_downlink_alarms(alarm):
            correlate_uplinks(a)
        self.logger.debug("[%s] Correlation completed", alarm.id)
Example #22
 def get_scheduler(cls):
     return Scheduler(cls.SCHEDULER)
Example #23
class CorrelatorService(Service):
    name = "correlator"
    pooled = True
    leader_lock_name = "correlator-%(pool)s"
    process_name = "noc-%(name).10s-%(pool).5s"

    def __init__(self):
        super(CorrelatorService, self).__init__()
        self.version = version.version
        self.rules = {}  # event_class -> [Rule]
        self.back_rules = {}  # event_class -> [Rule]
        self.triggers = {}  # alarm_class -> [Trigger1, .. , TriggerN]
        self.rca_forward = {}  # alarm_class -> [RCA condition, ..., RCA condition]
        self.rca_reverse = defaultdict(set)  # alarm_class -> set([alarm_class])
        self.scheduler = None
        self.rca_lock = Lock()
        self.topology_rca_lock = Lock()

    def on_activate(self):
        self.scheduler = Scheduler(
            self.name,
            pool=config.pool,
            reset_running=True,
            max_threads=config.correlator.max_threads,
            ioloop=self.ioloop,
            # @fixme have to be configured ?
            submit_threshold=100,
            max_chunk=100)
        self.scheduler.correlator = self
        self.subscribe("correlator.dispose.%s" % config.pool,
                       "dispose",
                       self.on_dispose_event,
                       max_in_flight=config.correlator.max_threads)
        self.scheduler.run()

    def on_start(self):
        """
        Load rules from database just after loading config
        """
        super(CorrelatorService, self).on_start()
        self.load_rules()
        self.load_triggers()
        self.load_rca_rules()

    def load_rules(self):
        """
        Load rules from database
        """
        self.logger.debug("Loading rules")
        self.rules = {}
        self.back_rules = {}
        nr = 0
        nbr = 0
        for c in EventClass.objects.all():
            if c.disposition:
                r = []
                for dr in c.disposition:
                    rule = Rule(c, dr)
                    r += [rule]
                    nr += 1
                    if dr.combo_condition != "none" and dr.combo_window:
                        for cc in dr.combo_event_classes:
                            try:
                                self.back_rules[cc.id] += [dr]
                            except KeyError:
                                self.back_rules[cc.id] = [dr]
                            nbr += 1
                self.rules[c.id] = r
        self.logger.debug("%d rules are loaded. %d combos" % (nr, nbr))

    def load_triggers(self):
        self.logger.info("Loading triggers")
        self.triggers = {}
        n = 0
        cn = 0
        ec = [(c.name, c.id) for c in AlarmClass.objects.all()]
        for t in AlarmTrigger.objects.filter(is_enabled=True):
            self.logger.debug("Trigger '%s' for classes:" % t.name)
            for c_name, c_id in ec:
                if re.search(t.alarm_class_re, c_name, re.IGNORECASE):
                    try:
                        self.triggers[c_id] += [Trigger(t)]
                    except KeyError:
                        self.triggers[c_id] = [Trigger(t)]
                    cn += 1
                    self.logger.debug("    %s" % c_name)
            n += 1
        self.logger.info("%d triggers has been loaded to %d classes" % (n, cn))

    def load_rca_rules(self):
        """
        Load root cause analysis rules
        """
        self.logger.info("Loading RCA Rules")
        n = 0
        self.rca_forward = {}
        self.rca_reverse = {}
        for a in AlarmClass.objects.filter(root_cause__0__exists=True):
            if not a.root_cause:
                continue
            self.rca_forward[a.id] = []
            for c in a.root_cause:
                rc = RCACondition(a, c)
                self.rca_forward[a.id] += [rc]
                if rc.root.id not in self.rca_reverse:
                    self.rca_reverse[rc.root.id] = []
                self.rca_reverse[rc.root.id] += [rc]
                n += 1
        self.logger.info("%d RCA Rules have been loaded" % n)

    def mark_as_failed(self, event):
        """
        Write error log and mark event as failed
        """
        self.logger.error("Failed to process event %s" % str(event.id))
        # Prepare traceback
        t, v, tb = sys.exc_info()
        now = datetime.datetime.now()
        r = ["UNHANDLED EXCEPTION (%s)" % str(now)]
        r += [str(t), str(v)]
        r += [format_frames(get_traceback_frames(tb))]
        r = "\n".join(r)
        event.mark_as_failed(version=self.version, traceback=r)

    def set_root_cause(self, a):
        """
        Search for root cause and set, if found
        :returns: Boolean. True, if root cause set
        """
        for rc in self.rca_forward[a.alarm_class.id]:
            # Check condition
            if not rc.check_condition(a):
                continue
            # Check match condition
            q = rc.get_match_condition(a)
            root = ActiveAlarm.objects.filter(**q).first()
            if root:
                # Root cause found
                self.logger.info("%s is root cause for %s (Rule: %s)", root.id,
                                 a.id, rc.name)
                metrics["alarm_correlated_rule"] += 1
                a.set_root(root)
                return True
        return False

    def set_reverse_root_cause(self, a):
        """
        Set *a* as root cause for existing alarms
        :param a:
        :return:
        """
        found = False
        for rc in self.rca_reverse[a.alarm_class.id]:
            # Check reverse match condition
            q = rc.get_reverse_match_condition(a)
            for ca in ActiveAlarm.objects.filter(**q):
                # Check condition
                if not rc.check_condition(ca):
                    continue
                # Try to set root cause
                qq = rc.get_match_condition(ca, id=a.id)
                rr = ActiveAlarm.objects.filter(**qq).first()
                if rr:
                    # Reverse root cause found
                    self.logger.info(
                        "%s is root cause for %s (Reverse rule: %s)", a.id,
                        ca.id, rc.name)
                    metrics["alarm_correlated_rule"] += 1
                    ca.set_root(a)
                    found = True
        return found

    def raise_alarm(self, r, e):
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info("Empty managed object, ignoring")
            return
        # @todo: Make configurable
        if not managed_object.is_managed:
            self.logger.info(
                "Managed object is not managed. Do not raise alarm")
            return
        if e.managed_object.id != managed_object.id:
            metrics["alarm_change_mo"] += 1
            self.logger.info("Changing managed object to %s",
                             managed_object.name)
        discriminator, vars = r.get_vars(e)
        if r.unique:
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if not a:
                # Try to reopen alarm
                a = ArchivedAlarm.objects.filter(
                    managed_object=managed_object.id,
                    discriminator=discriminator,
                    control_time__gte=e.timestamp).first()
                if a:
                    # Reopen alarm
                    self.logger.info("[%s|%s|%s] %s reopens alarm %s(%s)",
                                     e.id, managed_object.name,
                                     managed_object.address,
                                     e.event_class.name, a.alarm_class.name,
                                     a.id)
                    a = a.reopen("Reopened by disposition rule '%s'" %
                                 r.u_name)
                    metrics["alarm_reopen"] += 1
            if a:
                # Active alarm found, refresh
                self.logger.info(
                    "[%s|%s|%s] Contributing event %s to active alarm %s(%s)",
                    e.id, managed_object.name, managed_object.address,
                    e.event_class.name, a.alarm_class.name, a.id)
                # Contribute event to alarm
                e.contribute_to_alarm(a)
                if e.timestamp < a.timestamp:
                    # Set to earlier date
                    a.timestamp = e.timestamp
                    a.save()
                elif e.timestamp > a.last_update:
                    # Refresh last update
                    a.last_update = e.timestamp
                    a.save()
                metrics["alarm_contribute"] += 1
                return
        # Calculate alarm coverage
        summary = ServiceSummary.get_object_summary(managed_object)
        summary["object"] = {managed_object.object_profile.id: 1}
        #
        severity = max(ServiceSummary.get_severity(summary), 1)
        self.logger.info("[%s|%s|%s] %s: Calculated alarm severity is: %s",
                         e.id, managed_object.name, managed_object.address,
                         r.u_name, severity)
        # Create new alarm
        a = ActiveAlarm(
            timestamp=e.timestamp,
            last_update=e.timestamp,
            managed_object=managed_object.id,
            alarm_class=r.alarm_class,
            severity=severity,
            vars=vars,
            discriminator=discriminator,
            direct_services=SummaryItem.dict_to_items(summary["service"]),
            direct_subscribers=SummaryItem.dict_to_items(
                summary["subscriber"]),
            total_objects=ObjectSummaryItem.dict_to_items(summary["object"]),
            total_services=SummaryItem.dict_to_items(summary["service"]),
            total_subscribers=SummaryItem.dict_to_items(summary["subscriber"]),
            log=[
                AlarmLog(timestamp=datetime.datetime.now(),
                         from_status="A",
                         to_status="A",
                         message="Alarm risen from event %s(%s) by rule '%s'" %
                         (str(e.id), str(e.event_class.name), r.u_name))
            ],
            opening_event=e.id)
        a.save()
        e.contribute_to_alarm(a)
        self.logger.info("[%s|%s|%s] %s raises alarm %s(%s): %r", e.id,
                         managed_object.name, managed_object.address,
                         e.event_class.name, a.alarm_class.name, a.id, a.vars)
        metrics["alarm_raise"] += 1
        self.correlate(r, a)
        # Notify about new alarm
        if not a.root:
            a.managed_object.event(
                a.managed_object.EV_ALARM_RISEN, {
                    "alarm": a,
                    "subject": a.subject,
                    "body": a.body,
                    "symptoms": a.alarm_class.symptoms,
                    "recommended_actions": a.alarm_class.recommended_actions,
                    "probable_causes": a.alarm_class.probable_causes
                },
                delay=a.alarm_class.get_notification_delay())
        # Gather diagnostics when necessary
        AlarmDiagnosticConfig.on_raise(a)
        # Watch for escalations, when necessary
        if config.correlator.auto_escalation and not a.root:
            AlarmEscalation.watch_escalations(a)

    def correlate(self, r, a):
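        """
        Post-process a newly raised alarm.

        :param r: Disposition rule which raised the alarm
        :param a: ActiveAlarm instance

        Applies topology-based RCA (when enabled for the alarm class),
        rule-based RCA, alarm class handlers and triggers. If handlers
        reset the severity to 0, the alarm is silently dropped.
        """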
        # Topology RCA
        if a.alarm_class.topology_rca:
            with self.topology_rca_lock:
                self.topology_rca(a)
        # Rule-based RCA
        with self.rca_lock:
            if a.alarm_class.id in self.rca_forward:
                # Check alarm is a consequence of existing one
                self.set_root_cause(a)
            if a.alarm_class.id in self.rca_reverse:
                # Check alarm is the root cause for existing ones
                self.set_reverse_root_cause(a)
        # Call handlers
        for h in a.alarm_class.get_handlers():
            try:
                has_root = bool(a.root)
                h(a)
                if not has_root and a.root:
                    self.logger.info("[%s|%s|%s] Set root to %s (handler %s)",
                                     a.id, a.managed_object.name,
                                     a.managed_object.address, a.root, h)
            except:  # noqa. A handler may raise anything
                error_report()
                metrics["error", ("type", "alarm_handler")] += 1
        # Call triggers if necessary
        if r.alarm_class.id in self.triggers:
            for t in self.triggers[r.alarm_class.id]:
                try:
                    t.call(a)
                except:  # noqa. A trigger may raise anything
                    error_report()
        #
        if not a.severity:
            # Alarm severity has been reset to 0 by handlers
            # Silently drop alarm
            self.logger.debug("Alarm severity is 0, dropping")
            a.delete()
            metrics["alarm_drop"] += 1
            return

    def clear_alarm(self, r, e):
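        """
        Clear an active alarm according to a disposition rule.

        :param r: Disposition rule with "clear" action
        :param e: Event triggering the clearing

        For unique rules the matching active alarm is looked up by
        managed object and discriminator, contributed the event and
        closed with the event as its closing event.
        """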
        managed_object = self.eval_expression(r.managed_object, event=e)
        if not managed_object:
            self.logger.info(
                "[%s|Unknown|Unknown] Referred to unknown managed object, ignoring",
                e.id)
            metrics["unknown_object"] += 1
            return
        if r.unique:
            discriminator, vars = r.get_vars(e)
            assert discriminator is not None
            a = ActiveAlarm.objects.filter(
                managed_object=managed_object.id,
                discriminator=discriminator).first()
            if a:
                self.logger.info("[%s|%s|%s] %s clears alarm %s(%s)", e.id,
                                 managed_object.name, managed_object.address,
                                 e.event_class.name, a.alarm_class.name, a.id)
                e.contribute_to_alarm(a)
                a.closing_event = e.id
                a.last_update = max(a.last_update, e.timestamp)
                a.clear_alarm("Cleared by disposition rule '%s'" % r.u_name,
                              ts=e.timestamp)
                metrics["alarm_clear"] += 1

    def get_delayed_event(self, r, e):
        """
        Check whether all delayed conditions are met

        :param r: Delayed rule
        :param e: Event which can trigger delayed rule
        """
        # @todo: Rewrite to scheduler
        discriminator, vars = r.get_vars(e)
        ws = e.timestamp - datetime.timedelta(seconds=r.combo_window)
        de = ActiveEvent.objects.filter(managed_object=e.managed_object_id,
                                        event_class=r.event_class,
                                        discriminator=discriminator,
                                        timestamp__gte=ws).first()
        if not de:
            # No starting event
            return None
        # Possible starting event found; get all interesting following
        # event classes
        fe = [
            ee.event_class.id for ee in ActiveEvent.objects.filter(
                managed_object=e.managed_object_id,
                event_class__in=r.combo_event_classes,
                discriminator=discriminator,
                timestamp__gte=ws).order_by("timestamp")
        ]
        if r.combo_condition == "sequence":
            # Exact match
            if fe == r.combo_event_classes:
                return de
        elif r.combo_condition == "all":
            # All present
            if all(c in fe for c in r.combo_event_classes):
                return de
        elif r.combo_condition == "any":
            # Any found
            if fe:
                return de
        return None

    def eval_expression(self, expression, **kwargs):
        """
        Evaluate expression in given context
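
        The context exposes ``re``, ``utils`` and the passed keyword
        arguments as names, e.g. (as used by dispose_event):

            self.eval_expression(r.condition, event=e)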
        """
        env = {"re": re, "utils": utils}
        env.update(kwargs)
        return eval(expression, {}, env)

    def on_dispose_event(self, message, event_id, event=None, *args, **kwargs):
        """
        Called on new dispose message
        """
        self.logger.info("[%s] Receiving message", event_id)
        message.enable_async()
        self.get_executor("max").submit(self.dispose_worker, message, event_id,
                                        event)

    def dispose_worker(self, message, event_id, event_hint=None):
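        """
        Executor worker: resolve the event from the optional hint or
        by id, run disposition and report failures to metrics.
        Finishes the async message on the IOLoop when done.
        """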
        metrics["alarm_dispose"] += 1
        try:
            if event_hint:
                event = self.get_event_from_hint(event_hint)
            else:
                event = self.lookup_event(event_id)
            if event:
                self.dispose_event(event)
        except Exception:
            metrics["alarm_dispose_error"] += 1
            error_report()
        self.ioloop.add_callback(message.finish)

    def lookup_event(self, event_id):
        """
        Look up an event by id.
        Benefits from the cache already warmed by the classifier.
        :param event_id:
        :return: ActiveEvent instance or None
        """
        self.logger.info("[%s] Lookup event", event_id)
        e = ActiveEvent.get_by_id(event_id)
        if not e:
            self.logger.info("[%s] Event not found, skipping", event_id)
            metrics["event_lookup_failed"] += 1
        metrics["event_lookups"] += 1
        return e

    def get_event_from_hint(self, hint):
        """
        Get ActiveEvent from JSON hint
        :param hint:
        :return:
        """
        metrics["event_hints"] += 1
        e = ActiveEvent.from_json(hint)
        # Prevent TypeError: can't compare offset-naive and offset-aware datetimes
        # when calculating alarm timestamp
        e.timestamp = e.timestamp.replace(tzinfo=None)
        return e

    def dispose_event(self, e):
        """
        Dispose event according to disposition rule
        """
        event_id = str(e.id)
        self.logger.info("[%s] Disposing", event_id)
        drc = self.rules.get(e.event_class.id)
        if not drc:
            self.logger.info(
                "[%s] No disposition rules for class %s, skipping", event_id,
                e.event_class.name)
            return
        # Apply disposition rules
        for r in drc:
            if self.eval_expression(r.condition, event=e):
                # Process action
                if r.action == "drop":
                    self.logger.info("[%s] Dropped by action", event_id)
                    e.delete()
                    return
                elif r.action == "ignore":
                    self.logger.info("[%s] Ignored by action", event_id)
                    return
                elif r.action == "raise" and r.combo_condition == "none":
                    self.raise_alarm(r, e)
                elif r.action == "clear" and r.combo_condition == "none":
                    self.clear_alarm(r, e)
                if r.action in ("raise", "clear"):
                    # Write discriminator if can trigger delayed event
                    if r.unique and r.event_class.id in self.back_rules:
                        discriminator, vars = r.get_vars(e)
                        e.discriminator = discriminator
                        e.save()
                    # Process delayed combo conditions
                    if e.event_class.id in self.back_rules:
                        for br in self.back_rules[e.event_class.id]:
                            de = self.get_delayed_event(br, e)
                            if de:
                                if br.action == "raise":
                                    self.raise_alarm(br, de)
                                elif br.action == "clear":
                                    self.clear_alarm(br, de)
                if r.stop_disposition:
                    break
        self.logger.info("[%s] Disposition complete", event_id)

    def topology_rca(self, alarm, seen=None):
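        """
        Topology-based root cause analysis.

        :param alarm: Alarm to correlate
        :param seen: Set of already processed alarm ids, guarding
            against re-entry during recursion

        When all uplinks of the alarm's managed object carry alarms of
        the same class within the configured time window, the alarm is
        made a consequence of the first applicable uplink alarm, then
        the neighbors' alarms are correlated recursively.
        """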
        def can_correlate(a1, a2):
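            # Correlate when the window is disabled (0) or the alarms
            # are close enough in time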
            return (not config.correlator.topology_rca_window
                    or total_seconds(a1.timestamp - a2.timestamp) <=
                    config.correlator.topology_rca_window)

        self.logger.debug("[%s] Topology RCA", alarm.id)
        seen = seen or set()
        if alarm.root or alarm.id in seen:
            self.logger.debug("[%s] Already correlated", alarm.id)
            return  # Already correlated
        seen.add(alarm.id)
        # Get neighboring alarms
        na = {}
        # Downlinks
        q = Q(alarm_class=alarm.alarm_class.id,
              uplinks=alarm.managed_object.id)
        # Uplinks
        # @todo: Try to use $graphLookup to find affinity alarms
        if alarm.uplinks:
            q |= Q(alarm_class=alarm.alarm_class.id,
                   managed_object__in=list(alarm.uplinks))
        for a in ActiveAlarm.objects.filter(q):
            na[a.managed_object.id] = a
        # Correlate with uplinks
        if alarm.uplinks and all(o in na for o in alarm.uplinks):
            # All uplinks are faulty
            # uplinks are ordered according to path length
            # Correlate with first applicable
            self.logger.info("[%s] All uplinks are faulty. Correlating",
                             alarm.id)
            for u in alarm.uplinks:
                a = na[u]
                if can_correlate(alarm, a):
                    self.logger.info("[%s] Set root to %s", alarm.id, a.id)
                    alarm.set_root(a)
                    metrics["alarm_correlated_topology"] += 1
                    break
        # Correlate neighbors' alarms
        for d in na:
            self.topology_rca(na[d], seen)
        self.logger.debug("[%s] Correlation completed", alarm.id)