예제 #1
0
 def get_availability(self, days):
     """
     Compute availability for every object that had an outage
     during the last `days` days.

     :param days: Length of the reporting window, in days
     :returns: dict of outage object -> availability in percent.
         Only objects with at least one matching outage are present
     """
     window = datetime.timedelta(days=days)
     window_end = datetime.datetime.now()
     window_start = window_end - window
     # Outages started or stopped inside the window, or still open
     query = (Q(start__gte=window_start) | Q(stop__gte=window_start)
              | Q(stop__exists=False))
     downtime = defaultdict(int)
     for outage in Outage.objects.filter(query):
         # Clip the outage to the window; open outages last until now
         since = max(outage.start, window_start)
         until = outage.stop or window_end
         downtime[outage.object] += total_seconds(until - since)
     window_seconds = total_seconds(window)
     # Convert accumulated downtime to availability percents
     return {
         obj: (window_seconds - down) * 100.0 / window_seconds
         for obj, down in downtime.items()
     }
예제 #2
0
    def get_data(self, duration, **kwargs):
        """
        Build the availability report for the last `duration` seconds.

        :param duration: Reporting window length in seconds
            (any value accepted by int())
        :param kwargs: Accepted and ignored
        :returns: Result of self.from_dataset() with one row per managed
            object that has outages in the window, longest downtime first
        """
        now = datetime.datetime.now()
        d = datetime.timedelta(seconds=int(duration))
        b = now - d
        outages = defaultdict(list)
        otime = defaultdict(int)
        # Outages started or stopped inside the window, or still open
        q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
        for o in Outage.objects.filter(q):
            # Clip the outage to the window; open outages last until now
            start = max(o.start, b)
            stop = o.stop if o.stop else now
            outages[o.object].append(o)
            # Accumulate: an object may have several outages in the window,
            # plain assignment would keep only the last one
            otime[o.object] += total_seconds(stop - start)
        td = total_seconds(d)
        # Load managed objects in chunks to keep the id__in lists short
        mos = list(otime)
        chunk = 500
        mo = {}
        while mos:
            for o in ManagedObject.objects.filter(id__in=mos[:chunk]):
                mo[o.id] = o
            mos = mos[chunk:]
        r = []
        for o in sorted(otime, key=lambda x: -otime[x]):
            m = mo.get(o)
            if not m:
                continue  # Hanging Outage
            # Summed outages may exceed the window; cap the downtime
            # so availability never goes negative
            dt = min(td, otime[o])
            downtime = "%02d:%02d:%02d" % ((dt // 3600) % 24,
                                           (dt // 60) % 60, dt % 60)
            if dt >= 86400:
                downtime = "%dd %s" % (dt // 86400, downtime)
            # Zero-length window: avoid division by zero
            if td:
                avail = float(td - dt) * 100 / td
            else:
                avail = 0
            r.append((m.name, m.profile_name, m.platform, m.is_managed,
                      m.get_status(), downtime, avail, len(outages[o])))

        return self.from_dataset(title=self.title,
                                 columns=[
                                     "Object", "Profile", "Platform",
                                     TableColumn("Managed", format="bool"),
                                     TableColumn("Status", format="bool"),
                                     TableColumn("Downtime", align="right"),
                                     TableColumn("Availability",
                                                 align="right",
                                                 format="percent"),
                                     TableColumn("Downs",
                                                 align="right",
                                                 format="integer")
                                 ],
                                 data=r,
                                 enumerate=True)
예제 #3
0
 def watch_escalations(cls, alarm):
     """
     Schedule escalation jobs for the alarm.

     Walks every escalation returned by cls.get_class_escalations() for
     the alarm's class and, for each escalation item matching the alarm,
     submits a call of the escalator job via call_later().

     :param alarm: Alarm to watch escalations for
     """
     current_time = datetime.datetime.now()
     for escalation in cls.get_class_escalations(alarm.alarm_class):
         for item in escalation.escalations:
             # Item is bound to an administrative domain
             # which is not on the alarm's path
             if (item.administrative_domain and
                     item.administrative_domain.id not in alarm.adm_path):
                 continue
             # Alarm is below the item's minimal severity
             if item.min_severity and alarm.severity < item.min_severity:
                 continue
             # Managed object is outside of the item's selector
             if item.selector and not SelectorCache.is_in_selector(
                     alarm.managed_object, item.selector):
                 continue
             logger.debug("[%s] Watch for %s after %s seconds", alarm.id,
                          escalation.name, item.delay)
             fire_at = alarm.timestamp + datetime.timedelta(
                 seconds=item.delay)
             # Escalations which are already due are submitted without delay
             wait = (total_seconds(fire_at - current_time)
                     if fire_at > current_time else None)
             call_later("noc.services.escalator.escalation.escalate",
                        scheduler="escalator",
                        pool=alarm.managed_object.escalator_shard,
                        delay=wait,
                        max_runs=escalation.max_escalation_retries,
                        alarm_id=alarm.id,
                        escalation_id=escalation.id,
                        escalation_delay=item.delay)
예제 #4
0
파일: views.py 프로젝트: skripkar/noc
 def get_availability(start_date, stop_date, skip_zero_avail=False):
     """
     Compute availability of objects between two timestamps.

     :param start_date: Beginning of the reporting window
     :param stop_date: End of the reporting window
     :param skip_zero_avail: When set, ignore outages covering
         the whole window
     :returns: dict of outage object ->
         (availability in percent, total downtime in whole seconds,
         number of outages)
     """
     window_seconds = total_seconds(stop_date - start_date)
     # Outages which began before the window end and either began or
     # ended at/after the window start, or are still open
     overlapping = (Q(start__gte=start_date) | Q(stop__gte=start_date)
                    | Q(stop__exists=False)) & Q(start__lt=stop_date)
     durations = defaultdict(list)
     for outage in Outage.objects.filter(overlapping):
         # Clip the outage to the window boundaries
         since = max(outage.start, start_date)
         if outage.stop and outage.stop < stop_date:
             until = outage.stop
         else:
             until = stop_date
         seconds = total_seconds(until - since)
         if skip_zero_avail and seconds == window_seconds:
             continue
         durations[outage.object].append(seconds)
     # Normalize to percents
     result = {}
     for obj, spans in durations.items():
         down = sum(spans)
         result[obj] = ((window_seconds - down) * 100.0 / window_seconds,
                        int(down), len(spans))
     return result
예제 #5
0
 def get_interval(self):
     """
     Pick the interval to use from self.schedule["interval"].

     A plain integer is treated as a single unconditional interval.
     Otherwise the schedule is a sequence of (time limit, interval) pairs
     and the first pair whose limit is None or greater than the time
     elapsed since self.schedule["scheduled"] wins.

     :returns: Selected interval, or None when no pair matches
     """
     configured = self.schedule["interval"]
     if isinstance(configured, (int, long)):
         # Migrate IntervalJob to MultiIntervalJob
         ranges, elapsed = [(None, configured)], 0
     else:
         ranges = configured
         elapsed = total_seconds(
             datetime.datetime.now() - self.schedule["scheduled"])
     # Find the first appropriate time range
     return next((value for limit, value in ranges
                  if limit is None or limit > elapsed), None)
예제 #6
0
 def forwards(self):
     """
     Set `last_value` on every noc.fm.uptimes document to the number of
     seconds between its `start` and `last` fields, using one unordered
     bulk operation.
     """
     uptimes = get_db().noc.fm.uptimes
     batch = uptimes.initialize_unordered_bulk_op()
     queued = 0
     for doc in uptimes.find({}):
         seconds = float(total_seconds(doc["last"] - doc["start"]))
         batch.find({"_id": doc["_id"]}).update(
             {"$set": {"last_value": seconds}})
         queued += 1
     # Nothing is executed when the collection is empty
     if queued:
         batch.execute()
예제 #7
0
 def forwards(self):
     """
     Set `last_value` on every noc.fm.uptimes document to the number of
     seconds between its `start` and `last` fields.

     All updates are sent in a single bulk_write() call. A BulkWriteError
     is reported on stdout and not re-raised.
     """
     db = get_db()
     bulk = [
         UpdateOne({"_id": d["_id"]}, {
             "$set": {
                 "last_value": float(total_seconds(d["last"] - d["start"]))
             }
         })
         for d in db.noc.fm.uptimes.find({})
     ]
     # Nothing to write for an empty collection
     if bulk:
         print("Commiting changes to database")
         try:
             db.noc.fm.uptimes.bulk_write(bulk)
             print("Database has been synced")
         except BulkWriteError as e:
             # Interpolate the details into the message; passing them as
             # a second print() argument leaves the '%s' unexpanded
             print("Bulk write error: '%s'" % (e.details,))
             print("Stopping check")
예제 #8
0
파일: job.py 프로젝트: skripkar/noc
    def run(self):
        """
        Execute the job once and schedule its next run.

        Steps: drop the job when retries are exceeded, dereference it,
        call the handler when the job can run, log the outcome and
        pass the resulting status to the scheduler.

        The body contains a `yield`, so this is a generator function;
        presumably it is wrapped as a tornado coroutine by a decorator
        outside of this view -- TODO confirm.
        """
        with Span(server=self.scheduler.name,
                  service=self.attrs[self.ATTR_CLASS],
                  sample=self.attrs.get(self.ATTR_SAMPLE, 0),
                  in_label=self.attrs.get(self.ATTR_KEY, "")):
            self.start_time = perf_counter()
            if self.is_retries_exceeded():
                self.logger.info("[%s|%s] Retries exceeded. Remove job",
                                 self.name, self.attrs[Job.ATTR_ID])
                self.remove_job()
                return
            # Lag is the time between the job's ATTR_TS timestamp and now
            self.logger.info(
                "[%s] Starting at %s (Lag %.2fms)", self.name,
                self.scheduler.scheduler_id,
                total_seconds(datetime.datetime.now() -
                              self.attrs[self.ATTR_TS]) * 1000.0)
            # Run handler
            # E_EXCEPTION is the default status; it is kept when
            # dereference()/can_run() raise (ds is None below)
            status = self.E_EXCEPTION
            # Set only by RetryAfter; selects retry scheduling below
            delay = None
            with Span(service="job.dereference"):
                try:
                    ds = self.dereference()
                    can_run = self.can_run()
                except Exception as e:
                    self.logger.error("Unknown error during dereference: %s",
                                      e)
                    ds = None
                    can_run = False

            if ds:
                with Span(service="job.run"):
                    if can_run:
                        try:
                            data = self.attrs.get(self.ATTR_DATA) or {}
                            result = self.handler(**data)
                            if tornado.gen.is_future(result):
                                # Wait for future
                                result = yield result
                            status = self.E_SUCCESS
                        except RetryAfter as e:
                            self.logger.info("Retry after %ss: %s", e.delay, e)
                            status = self.E_RETRY
                            delay = e.delay
                        except self.failed_exceptions:
                            status = self.E_FAILED
                        except Exception:
                            # Any other error is reported via error_report()
                            error_report()
                            status = self.E_EXCEPTION
                    else:
                        self.logger.info("Deferred")
                        status = self.E_DEFERRED
            # Falsy but not None: dereference() returned a negative result
            # without raising
            elif ds is not None:
                self.logger.info("Cannot dereference")
                status = self.E_DEREFERENCE
            self.duration = perf_counter() - self.start_time
            self.logger.info("Completed. Status: %s (%.2fms)",
                             self.STATUS_MAP.get(status, status),
                             self.duration * 1000)
            # Schedule next run
            if delay is None:
                with Span(service="job.schedule_next"):
                    self.schedule_next(status)
            else:
                with Span(service="job.schedule_retry"):
                    # Retry
                    # Context is passed on only when context_version is set
                    if self.context_version:
                        ctx = self.context or None
                        ctx_key = self.get_context_cache_key()
                    else:
                        ctx = None
                        ctx_key = None
                    self.scheduler.set_next_run(
                        self.attrs[self.ATTR_ID],
                        status=status,
                        ts=datetime.datetime.now() +
                        datetime.timedelta(seconds=delay),
                        duration=self.duration,
                        context_version=self.context_version,
                        context=ctx,
                        context_key=ctx_key)
예제 #9
0
 def duration(self):
     """
     Logged event duration: seconds between `start_timestamp`
     and `timestamp`
     """
     elapsed = self.timestamp - self.start_timestamp
     return total_seconds(elapsed)
예제 #10
0
파일: alarms.py 프로젝트: skripkar/noc
 def extract(self):
     """
     Push every record returned by self.iter_data() to the alarm stream,
     enriched with managed object attributes and the number of reboots
     around the alarm.

     Records whose managed object cannot be found are skipped.
     self.last_ts is updated to the clear timestamp of the last pushed
     record.

     :returns: Number of records pushed
     """

     def summary_total(doc, key):
         # Sum of "summary" values of the list stored under `key`
         return sum(item["summary"] for item in doc.get(key, []))

     # Reboot timestamps within the extraction window (extended backwards
     # by reboot_interval), grouped per object in ascending order
     pipeline = [{
         "$match": {
             "ts": {
                 "$gt": self.start - self.reboot_interval,
                 "$lte": self.stop
             }
         }
     }, {
         "$sort": {
             "ts": 1
         }
     }, {
         "$group": {
             "_id": "$object",
             "reboots": {
                 "$push": "$ts"
             }
         }
     }]
     # object -> [ts1, .., tsN]
     reboots = {
         doc["_id"]: doc["reboots"]
         for doc in Reboot._get_collection().aggregate(pipeline)
     }
     pushed = 0
     for d in self.iter_data():
         mo = ManagedObject.get_by_id(d["managed_object"])
         if not mo:
             continue
         raised = d["timestamp"]
         cleared = d["clear_timestamp"]
         # Reboots of the object from reboot_interval before the alarm
         # up to its clearing
         n_reboots = hits_in_range(reboots.get(d["managed_object"], []),
                                   raised - self.reboot_interval,
                                   cleared)
         self.alarm_stream.push(
             ts=raised,
             close_ts=cleared,
             # Never report a negative duration
             duration=max(0, int(total_seconds(cleared - raised))),
             alarm_id=str(d["_id"]),
             root=str(d.get("root") or ""),
             alarm_class=AlarmClass.get_by_id(d["alarm_class"]),
             severity=d["severity"],
             reopens=d.get("reopens") or 0,
             direct_services=summary_total(d, "direct_services"),
             direct_subscribers=summary_total(d, "direct_subscribers"),
             total_objects=summary_total(d, "total_objects"),
             total_services=summary_total(d, "total_services"),
             total_subscribers=summary_total(d, "total_subscribers"),
             escalation_ts=d.get("escalation_ts"),
             escalation_tt=d.get("escalation_tt"),
             managed_object=mo,
             pool=mo.pool,
             ip=mo.address,
             profile=mo.profile,
             object_profile=mo.object_profile,
             vendor=mo.vendor,
             platform=mo.platform,
             version=mo.version,
             administrative_domain=mo.administrative_domain,
             segment=mo.segment,
             container=mo.container,
             x=mo.x,
             y=mo.y,
             reboots=n_reboots,
             services=[{
                 "profile": ServiceProfile.get_by_id(ss["profile"]).bi_id,
                 "summary": ss["summary"]
             } for ss in d.get("direct_services", [])],
             subscribers=[{
                 "profile": SubscriberProfile.get_by_id(ss["profile"]).bi_id,
                 "summary": ss["summary"]
             } for ss in d.get("direct_subscribers", [])],
         )
         pushed += 1
         self.last_ts = cleared
     self.alarm_stream.finish()
     return pushed
예제 #11
0
 def can_correlate(a1, a2):
     """
     Check whether two alarms are close enough in time to be correlated.

     :param a1: First alarm
     :param a2: Second alarm
     :returns: True when config.correlator.topology_rca_window is not set,
         or when a1.timestamp - a2.timestamp does not exceed it (seconds)
     """
     window = config.correlator.topology_rca_window
     if not window:
         # No window configured: no time restriction
         return True
     return total_seconds(a1.timestamp - a2.timestamp) <= window
예제 #12
0
 def register(cls, managed_object, uptime):
     """
     Register uptime
     :param managed_object: Managed object reference
     :param uptime: Registered uptime in seconds
     """
     if not uptime:
         return
     object_id = managed_object.id
     now = datetime.datetime.now()
     delta = datetime.timedelta(seconds=uptime)
     logger.debug("[%s] Register uptime %s", managed_object.name, delta)
     coll = cls._get_collection()
     # Currently open uptime record, if any
     current = coll.find_one({"object": object_id, "stop": None})
     if not current:
         # First uptime
         logger.debug("[%s] First uptime from %s", managed_object.name, now)
         coll.insert({
             "object": object_id,
             "start": now - delta,
             "stop": None,
             "last": now,
             "last_value": uptime
         })
         return
     # A decreased uptime means either a reboot or a counter wrap
     rebooted = False
     if current["last_value"] > uptime:
         # Uptime growth if the counter had wrapped at FWRAP
         wrapped_delta = cls.FWRAP - current["last_value"] + uptime
         # Wall-clock time passed since the last registration
         elapsed = total_seconds(now - current["last"])
         # A wrap is accepted only when both deltas agree
         # within WPREC tolerance
         if abs(wrapped_delta - elapsed) > elapsed * cls.WPREC:
             rebooted = True
         else:
             logger.debug("Counter wrap detected")
     if not rebooted:
         logger.debug("[%s] Refreshing existing uptime (%s - %s)",
                      managed_object.name, current["start"], now)
         coll.update({"_id": current["_id"]},
                     {"$set": {
                         "last": now,
                         "last_value": uptime
                     }})
         return
     # Reboot registered: close the existing uptime just before boot time
     boot_ts = now - delta
     closed_at = boot_ts - cls.SEC
     logger.debug("[%s] Closing uptime (%s - %s, delta %s)",
                  managed_object.name, current["start"], closed_at,
                  delta)
     coll.update({"_id": current["_id"]}, {"$set": {"stop": closed_at}})
     # Start new uptime
     logger.debug("[%s] Starting new uptime from %s",
                  managed_object.name, boot_ts)
     coll.insert({
         "object": object_id,
         "start": boot_ts,
         "stop": None,
         "last": now,
         "last_value": uptime
     })
     Reboot.register(managed_object, boot_ts, current["last"])
예제 #13
0
    def get_data(self,
                 request,
                 duration,
                 from_date=None,
                 to_date=None,
                 **kwargs):
        """
        Build the availability report either for the last `duration`
        seconds or for an explicit date range.

        :param request: Request object; request.user limits the visible
            managed objects for non-superusers
        :param duration: Reporting window length in seconds; zero selects
            the from_date/to_date range instead
        :param from_date: Window start as "%d.%m.%Y" string
        :param to_date: Window end as "%d.%m.%Y" string; defaults to now
        :param kwargs: Accepted and ignored
        :returns: Result of self.from_dataset() with one row per managed
            object that has outages in the window, longest downtime first
        """
        now = datetime.datetime.now()
        # NOTE(review): without from_date the caller's duration is always
        # replaced by 1 second -- confirm this is intended
        if not from_date:
            duration = 1
        if int(duration):
            self.logger.info("Use duration\n")
            d = datetime.timedelta(seconds=int(duration))
            b = now - d
            e = now
            q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
        else:
            b = datetime.datetime.strptime(from_date, "%d.%m.%Y")
            q = Q(start__gte=b) | Q(stop__gte=b) | Q(stop__exists=False)
            if to_date:
                if from_date == to_date:
                    # Single-day report: cover the whole day
                    t1 = datetime.datetime.strptime(
                        to_date, "%d.%m.%Y") + datetime.timedelta(1)
                else:
                    t1 = datetime.datetime.strptime(to_date, "%d.%m.%Y")
            else:
                t1 = now
            q &= Q(start__lte=t1) | Q(stop__lte=t1)
            d = datetime.timedelta(seconds=int((t1 - b).total_seconds()))
            e = t1
        outages = defaultdict(list)
        otime = defaultdict(int)
        for o in Outage.objects.filter(q):
            # Clip the outage to the window on both sides: downtime after
            # the window end must not be attributed to the window
            start = max(o.start, b)
            stop = min(o.stop, e) if o.stop else e
            outages[o.object].append(o)
            otime[o.object] += total_seconds(stop - start)
        td = total_seconds(d)
        if not request.user.is_superuser:
            # Drop objects outside of the user's administrative domains
            for hidden in ManagedObject.objects.exclude(
                    administrative_domain__in=UserAccess.get_domains(
                        request.user)):
                otime.pop(hidden.id, None)
        # Load managed objects in chunks to keep the id__in lists short
        mos = list(otime)
        chunk = 500
        mo = {}
        while mos:
            for o in ManagedObject.objects.filter(id__in=mos[:chunk]):
                mo[o.id] = o
            mos = mos[chunk:]
        r = []
        for o in sorted(otime, key=lambda x: -otime[x]):
            m = mo.get(o)
            if not m:
                continue  # Hanging Outage
            # Summed outages may exceed the window; cap the downtime
            dt = min(td, otime[o])
            downtime = "%02d:%02d:%02d" % ((dt // 3600) % 24,
                                           (dt // 60) % 60, dt % 60)
            if dt >= 86400:
                downtime = "%dd %s" % (dt // 86400, downtime)
            # Zero-length window: avoid division by zero
            if td:
                avail = float(td - dt) * 100 / td
            else:
                avail = 0
            r.append((m.name, m.address,
                      m.profile.name, m.platform.name if m.platform else "",
                      _("Yes") if m.is_managed else _("No"),
                      _("Yes") if m.get_status() else _("No"), downtime,
                      avail, len(outages[o])))

        return self.from_dataset(title=self.title,
                                 columns=[
                                     _("Managed Object"),
                                     _("Address"),
                                     _("Profile"),
                                     _("Platform"),
                                     TableColumn(_("Managed"), align="right"),
                                     TableColumn(_("Status"), align="right"),
                                     TableColumn(_("Downtime"), align="right"),
                                     TableColumn(_("Availability"),
                                                 align="right",
                                                 format="percent"),
                                     TableColumn(_("Downs"),
                                                 align="right",
                                                 format="integer")
                                 ],
                                 data=r,
                                 enumerate=True)
예제 #14
0
    def process_mrtasks(self):
        """
        Process Map/Reduce tasks

        Picks every waiting map task whose next_try has come, fails the
        ones which cannot be run (unmanaged object, missing or expired
        reduce task), applies per-SAE and per-shard rate limits and
        launches the map script for the rest. Script results are stored
        by the map_callback closure.
        """
        def map_callback(mt_id, result=None, error=None):
            # Store the script result or reschedule the task on
            # non-fatal errors
            try:
                mt = MapTask.objects.get(id=mt_id)
            except MapTask.DoesNotExist:
                self.logger.error("Late answer for map task %d is ignored" %
                                  mt_id)
                return
            if error:
                # Process non-fatal reasons
                TIMEOUTS = {
                    ERR_ACTIVATOR_NOT_AVAILABLE: 10,
                    ERR_OVERLOAD: 10,
                    ERR_DOWN: 30,
                }
                if error.code in TIMEOUTS:
                    # Any of non-fatal reasons require retry
                    timeout = TIMEOUTS[error.code]
                    variation = 2
                    # Spread retries around the base timeout. The jitter
                    # is added to the timeout, not substituted for it,
                    # otherwise next_try lands at about "now" or even in
                    # the past. Floor division keeps randint() arguments
                    # integer
                    timeout += random.randint(-timeout // variation,
                                              timeout // variation)
                    next_try = (datetime.datetime.now() +
                                datetime.timedelta(seconds=timeout))
                    # Overload and unavailable activator do not consume
                    # retries
                    if error.code in (ERR_OVERLOAD,
                                      ERR_ACTIVATOR_NOT_AVAILABLE):
                        next_retries = mt.retries_left
                    else:
                        next_retries = mt.retries_left - 1
                    if mt.retries_left and (not mt.task
                                            or next_try < mt.task.stop_time):
                        # Check we're still in task time and have retries left
                        self.log_mrt(logging.INFO, task=mt, status="retry")
                        mt.next_try = next_try
                        mt.retries_left = next_retries
                        mt.status = "W"
                        mt.save()
                        return
                mt.status = "F"
                mt.script_result = dict(code=error.code, text=error.text)
                self.log_mrt(logging.INFO,
                             task=mt,
                             status="failed",
                             code=error.code,
                             error=error.text)
            else:
                mt.status = "C"
                mt.script_result = result
                self.log_mrt(logging.INFO, task=mt, status="completed")
            mt.save()

        # Additional stack frame to store mt_id in a closure
        def exec_script(mt):
            kwargs = {}
            if mt.script_params:
                kwargs = mt.script_params
            self.log_mrt(logging.INFO, task=mt, status="running", args=kwargs)
            self.script(mt.managed_object,
                        mt.map_script,
                        lambda result=None, error=None: map_callback(
                            mt.id, result, error),
                        timeout=mt.script_timeout,
                        **kwargs)

        def fail_task(mt, code, text):
            # Mark the map task as failed with given error code and text
            mt.status = "F"
            mt.script_result = dict(code=code, text=text)
            try:
                mt.save()
            except Exception:
                pass  # Can raise integrity error if MRT is gone
            self.log_mrt(logging.INFO,
                         task=mt,
                         status="failed",
                         code=code,
                         error=text)

        t = datetime.datetime.now()
        # self.logger.debug("Processing MRT schedules")
        # Reset rates
        sae_mrt_rate = 0
        shard_mrt_rate = {}  # shard_id -> count
        throttled_shards = set()  # shard_id
        self.blocked_pools = set()  # Reset block status
        # Run tasks
        qs = {"status": "W", "next_try__lte": t}
        if not self.single_shard:
            qs["managed_object__activator__shard__is_active"] = True
            qs["managed_object__activator__shard__name__in"] = self.shards
        for mt in MapTask.objects.filter(**qs)\
                .order_by("next_try")\
                .select_related("activator", "managed_object")\
                .select_for_update():
            # Check object is managed
            if not mt.managed_object.is_managed:
                fail_task(mt, ERR_OBJECT_NOT_MANAGED, "Object is not managed")
                continue
            # Check reduce task still valid
            is_valid_reduce = True
            try:
                mt.task
            except ReduceTask.DoesNotExist:
                is_valid_reduce = False
            # Check for task timeouts
            if not is_valid_reduce or (mt.task and mt.task.stop_time < t):
                fail_task(mt, ERR_TIMEOUT, text="Timed out")
                continue
            # Check blocked pools
            if mt.managed_object.activator.name in self.blocked_pools:
                # Silently skip task until next round
                self.logger.debug("Delaying task to the blocked pool '%s'" %
                                  mt.managed_object.activator.name)
                continue
            # Check for global rate limit
            if self.max_mrt_rate_per_sae:
                if sae_mrt_rate > self.max_mrt_rate_per_sae:
                    self.log_mrt(logging.INFO,
                                 task=mt,
                                 status="throttled",
                                 msg="Per-SAE rate limit exceeded "
                                 "(%d)" % self.max_mrt_rate_per_sae)
                    break
                sae_mrt_rate += 1
            # Check for shard rate limit
            if self.max_mrt_rate_per_shard:
                s_id = mt.managed_object.activator.shard.id
                if s_id in throttled_shards:
                    # Shard is throttled, do not log
                    continue
                sr = shard_mrt_rate.get(s_id, 0) + 1
                if sr > self.max_mrt_rate_per_shard:
                    # Log and throttle shard
                    self.log_mrt(logging.INFO,
                                 task=mt,
                                 status="throttled",
                                 msg="Per-shard rate limit exceeded "
                                 "(%d)" % self.max_mrt_rate_per_shard)
                    throttled_shards.add(s_id)
                    # The task which hit the limit is throttled as well:
                    # leave it waiting until the next round
                    continue
                shard_mrt_rate[s_id] = sr
            mt.status = "R"
            mt.save()
            exec_script(mt)
        dt = total_seconds(datetime.datetime.now() - t)
        # self.logger.debug("MRT Schedules processed in %ss" % dt)
        if dt > self.mrt_schedule_interval:
            self.logger.error(
                "SAE is overloaded by MRT scheduling (took %ss)" % dt)