Example #1
File: alarm.py  Project: skripkar/noc
 def filter_alarmclass(cls, *args):
     ids = [
         str(AlarmClass.get_by_name(a).id) for a in args
         if AlarmClass.get_by_name(a)
     ]
     if len(ids) == 1:
         return {"%s.alarmclass" % cls.F_META: ids[0]}
     else:
         return {"%s.alarmclass" % cls.F_META: {"$in": ids}}
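
A minimal usage sketch of the filter above; the owning document class (EventDoc here) and its F_META prefix are hypothetical stand-ins:

# Hypothetical: EventDoc stands in for whatever class defines filter_alarmclass
# and F_META. Unknown alarm class names are silently dropped by the filter.
q = {"managed_object": 42}
q.update(EventDoc.filter_alarmclass("NOC | PM | High Error",
                                    "NOC | PM | High Warning"))
# q now also contains {"<F_META>.alarmclass": {"$in": ["<id1>", "<id2>"]}}
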
Example #2
    def update_alarms(self):
        from noc.fm.models.alarmseverity import AlarmSeverity
        from noc.fm.models.alarmclass import AlarmClass

        prev_status = self.context.get("umbrella_settings", False)
        current_status = self.can_update_alarms()
        self.context["umbrella_settings"] = current_status

        if not prev_status and not current_status:
            return
        self.logger.info("Updating alarm statuses")
        umbrella_cls = AlarmClass.get_by_name(self.umbrella_cls)
        if not umbrella_cls:
            self.logger.info(
                "No umbrella alarm class. Alarm statuses not updated")
            return
        details = []
        if current_status:
            fatal_weight = self.get_fatal_alarm_weight()
            weight = self.get_alarm_weight()
            for p in self.problems:
                if not p["alarm_class"]:
                    continue
                ac = AlarmClass.get_by_name(p["alarm_class"])
                if not ac:
                    self.logger.info("Unknown alarm class %s. Skipping",
                                     p["alarm_class"])
                    continue
                details += [{
                    "alarm_class": ac,
                    "path": p["path"],
                    "severity": AlarmSeverity.severity_for_weight(
                        fatal_weight if p["fatal"] else weight
                    ),
                    "vars": {
                        "path": p["path"],
                        "message": p["message"]
                    }
                }]
        else:
            # Clean up all open alarms as they have been disabled
            details = []
        self.update_umbrella(umbrella_cls, details)
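
For reference, a sketch of a single self.problems entry consumed by update_alarms(); only the keys read in the loop above are shown, and the values are illustrative:

# Illustrative problem entry; keys match those read in update_alarms() above.
problem = {
    "alarm_class": "NOC | PM | Out of Thresholds",  # resolved via AlarmClass.get_by_name()
    "path": "Interface | Load | In | Gi0/1",
    "message": "Value out of thresholds",
    "fatal": False,  # when True, fatal_weight drives the severity
}
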
Example #3
 def get_umbrella_alarm_cfg(self,
                            alarm_class=None,
                            path=None,
                            message=None,
                            fatal=False,
                            **kwargs):
     """
     Build umbrella alarm configuration entry
     :param alarm_class: Alarm class instance or name
     :param path: Additional path (list of components, joined with " | ")
     :param message: Text message
     :param fatal: True if problem is fatal and all following checks
         must be disabled
     :param kwargs: Dict containing optional variables
     :return: Alarm configuration dict (alarm_class, path, vars)
     """
     alarm_cfg = {
         "alarm_class": AlarmClass.get_by_name(alarm_class),
         "path": " | ".join(path),
         "vars": kwargs,
     }
     alarm_cfg["vars"]["message"] = message
     alarm_cfg["vars"]["path"] = path
     return alarm_cfg
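
A hedged call sketch (the calling object and values are illustrative) showing the dict get_umbrella_alarm_cfg returns:

# Illustrative call; path is joined with " | " for the alarm path, while
# vars keeps the message and the original path value.
cfg = job.get_umbrella_alarm_cfg(
    alarm_class="NOC | PM | High Error",
    path=["metrics", "Gi0/1"],
    message="Value above threshold",
)
# cfg["path"] == "metrics | Gi0/1"
# cfg["vars"] == {"message": "Value above threshold", "path": ["metrics", "Gi0/1"]}
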
Example #4
 def get_ac_cm_violations():
     return AlarmClass.get_by_name("Config | Policy Violations")
Example #5
 def get_ac_pm_thresholds():
     return AlarmClass.get_by_name("NOC | PM | Out of Thresholds")
Example #6
class MetricsCheck(DiscoveryCheck):
    """
    Metrics discovery
    """
    name = "metrics"
    required_script = "get_metrics"

    _object_profile_metrics = cachetools.TTLCache(1000, 60)
    _interface_profile_metrics = cachetools.TTLCache(1000, 60)
    _slaprofile_metrics = cachetools.TTLCache(1000, 60)

    S_OK = 0
    S_WARN = 1
    S_ERROR = 2

    SMAP = {
        S_OK: "ok",
        S_WARN: "warn",
        S_ERROR: "error"
    }

    SEV_MAP = {
        S_WARN: 2000,
        S_ERROR: 3000
    }

    AC_PM_THRESHOLDS = AlarmClass.get_by_name("NOC | PM | Out of Thresholds")
    AC_PM_LOW_ERROR = AlarmClass.get_by_name("NOC | PM | Low Error")
    AC_PM_HIGH_ERROR = AlarmClass.get_by_name("NOC | PM | High Error")
    AC_PM_LOW_WARN = AlarmClass.get_by_name("NOC | PM | Low Warning")
    AC_PM_HIGH_WARN = AlarmClass.get_by_name("NOC | PM | High Warning")

    SLA_CAPS = [
        "Cisco | IP | SLA | Probes"
    ]

    def __init__(self, *args, **kwargs):
        super(MetricsCheck, self).__init__(*args, **kwargs)
        self.id_count = itertools.count()
        self.id_metrics = {}

    @classmethod
    @cachetools.cachedmethod(
        operator.attrgetter("_object_profile_metrics"),
        lock=lambda _: metrics_lock
    )
    def get_object_profile_metrics(cls, p_id):
        r = {}
        opr = ManagedObjectProfile.get_by_id(id=p_id)
        if not opr:
            return r
        for m in opr.metrics:
            mt_id = m.get("metric_type")
            if not mt_id:
                continue
            mt = MetricType.get_by_id(mt_id)
            if not mt:
                continue
            le = m.get("low_error")
            lw = m.get("low_warn")
            he = m.get("high_error")
            hw = m.get("high_warn")
            lew = AlarmSeverity.severity_for_weight(int(m.get("low_error_weight", 10)))
            lww = AlarmSeverity.severity_for_weight(int(m.get("low_warn_weight", 1)))
            hew = AlarmSeverity.severity_for_weight(int(m.get("high_error_weight", 1)))
            hww = AlarmSeverity.severity_for_weight(int(m.get("high_warn_weight", 10)))
            threshold_profile = None
            if m.get("threshold_profile"):
                threshold_profile = ThresholdProfile.get_by_id(m.get("threshold_profile"))
            r[mt.name] = MetricConfig(
                mt,
                m.get("enable_box", True),
                m.get("enable_periodic", True),
                m.get("is_stored", True),
                m.get("window_type", "m"),
                int(m.get("window", 1)),
                m.get("window_function", "last"),
                m.get("window_config"),
                m.get("window_related", False),
                int(le) if le is not None else None,
                int(lw) if lw is not None else None,
                int(hw) if hw is not None else None,
                int(he) if he is not None else None,
                lew, lww, hww, hew,
                threshold_profile,
                le is not None or lw is not None or he is not None or hw is not None
            )
        return r

    @staticmethod
    def quote_path(path):
        """
        Convert path list to ClickHouse format
        :param path: List of path components
        :return: ClickHouse array literal string, e.g. ['a','b']
        """
        return "[%s]" % ",".join("'%s'" % p for p in path)

    @staticmethod
    def config_from_settings(m):
        """
        Returns MetricConfig from .metrics field
        :param m: Metric settings item from a profile .metrics field
        :return: MetricConfig
        """
        return MetricConfig(
            m.metric_type,
            m.enable_box, m.enable_periodic,
            m.is_stored,
            m.window_type, m.window, m.window_function,
            m.window_config, m.window_related,
            m.low_error, m.low_warn, m.high_warn, m.high_error,
            AlarmSeverity.severity_for_weight(m.low_error_weight),
            AlarmSeverity.severity_for_weight(m.low_warn_weight),
            AlarmSeverity.severity_for_weight(m.high_warn_weight),
            AlarmSeverity.severity_for_weight(m.high_error_weight),
            m.threshold_profile,
            m.low_error is not None or m.low_warn is not None or m.high_warn is not None or m.high_error is not None
        )

    @classmethod
    @cachetools.cachedmethod(
        operator.attrgetter("_interface_profile_metrics"),
        lock=lambda _: metrics_lock
    )
    def get_interface_profile_metrics(cls, p_id):
        r = {}
        ipr = InterfaceProfile.get_by_id(id=p_id)
        if not ipr:
            return r
        for m in ipr.metrics:
            r[m.metric_type.name] = cls.config_from_settings(m)
        return r

    @classmethod
    @cachetools.cachedmethod(
        operator.attrgetter("_slaprofile_metrics"),
        lock=lambda _: metrics_lock)
    def get_slaprofile_metrics(cls, p_id):
        r = {}
        spr = SLAProfile.get_by_id(p_id)
        if not spr:
            return r
        for m in spr.metrics:
            r[m.metric_type.name] = cls.config_from_settings(m)
        return r

    def get_object_metrics(self):
        """
        Populate metrics list with object metrics
        :return: List of metric requests for the get_metrics script
        """
        metrics = []
        o_metrics = self.get_object_profile_metrics(self.object.object_profile.id)
        self.logger.debug("Object metrics: %s", o_metrics)
        for metric in o_metrics:
            if ((self.is_box and not o_metrics[metric].enable_box) or
                    (self.is_periodic and not o_metrics[metric].enable_periodic)):
                continue
            m_id = next(self.id_count)
            metrics += [{
                "id": m_id,
                "metric": metric
            }]
            self.id_metrics[m_id] = o_metrics[metric]
        if not metrics:
            self.logger.info("Object metrics are not configured. Skipping")
        return metrics

    def get_subinterfaces(self):
        subs = defaultdict(list)  # interface id -> [{"name":, "ifindex":}]
        for si in SubInterface._get_collection().with_options(
            read_preference=ReadPreference.SECONDARY_PREFERRED
        ).find({
            "managed_object": self.object.id
        }, {
            "name": 1,
            "interface": 1,
            "ifindex": 1
        }):
            subs[si["interface"]] += [{
                "name": si["name"],
                "ifindex": si.get("ifindex")
            }]
        return subs

    def get_interface_metrics(self):
        """
        Populate metrics list with interface metrics
        :return:
        """
        subs = None
        metrics = []
        for i in Interface._get_collection().with_options(
                read_preference=ReadPreference.SECONDARY_PREFERRED
        ).find({
            "managed_object": self.object.id,
            "type": "physical"
        }, {
            "_id": 1,
            "name": 1,
            "ifindex": 1,
            "profile": 1
        }):
            ipr = self.get_interface_profile_metrics(i["profile"])
            self.logger.debug("Interface %s. ipr=%s", i["name"], ipr)
            if not ipr:
                continue  # No metrics configured
            i_profile = InterfaceProfile.get_by_id(i["profile"])
            if i_profile.allow_subinterface_metrics and subs is None:
                # Resolve subinterfaces
                subs = self.get_subinterfaces()
            ifindex = i.get("ifindex")
            for metric in ipr:
                if ((self.is_box and not ipr[metric].enable_box) or
                        (self.is_periodic and not ipr[metric].enable_periodic)):
                    continue
                m_id = next(self.id_count)
                m = {
                    "id": m_id,
                    "metric": metric,
                    "path": ["", "", "", i["name"]]
                }
                if ifindex is not None:
                    m["ifindex"] = ifindex
                metrics += [m]
                self.id_metrics[m_id] = ipr[metric]
                if i_profile.allow_subinterface_metrics:
                    for si in subs[i["_id"]]:
                        m_id = next(self.id_count)
                        m = {
                            "id": m_id,
                            "metric": metric,
                            "path": ["", "", "", i["name"], si["name"]]
                        }
                        if si["ifindex"] is not None:
                            m["ifindex"] = si["ifindex"]
                        metrics += [m]
                        self.id_metrics[m_id] = ipr[metric]
        if not metrics:
            self.logger.info("Interface metrics are not configured. Skipping")
        return metrics

    def get_sla_metrics(self):
        if not self.has_any_capability(self.SLA_CAPS):
            self.logger.info("SLA not configured, skipping SLA metrics")
            return []
        metrics = []
        for p in SLAProbe._get_collection().with_options(
            read_preference=ReadPreference.SECONDARY_PREFERRED
        ).find({
            "managed_object": self.object.id
        }, {
            "name": 1,
            "group": 1,
            "profile": 1,
            "type": 1
        }):
            if not p.get("profile"):
                self.logger.debug("Probe %s has no profile. Skipping", p["name"])
                continue
            pm = self.get_slaprofile_metrics(p["profile"])
            if not pm:
                self.logger.debug(
                    "Probe %s has profile '%s' with no configured metrics. "
                    "Skipping", p["name"], p["profile"]
                )
                continue
            for metric in pm:
                if ((self.is_box and not pm[metric].enable_box) or
                        (self.is_periodic and not pm[metric].enable_periodic)):
                    continue
                m_id = next(self.id_count)
                metrics += [{
                    "id": m_id,
                    "metric": metric,
                    "path": [p.get("group", ""), p["name"]],
                    "sla_type": p["type"]
                }]
                self.id_metrics[m_id] = pm[metric]
        if not metrics:
            self.logger.info("SLA metrics are not configured. Skipping")
        return metrics

    def process_result(self, result):
        """
        Process IGetMetrics result
        :param result: List of MData measurements returned by get_metrics
        :return: (n_metrics, data, alarms) tuple
        """
        # Restore last counter state
        if self.has_artefact("reboot"):
            self.logger.info(
                "Resetting counter context due to detected reboot"
            )
            self.job.context["counters"] = {}
        counters = self.job.context["counters"]
        alarms = []
        data = defaultdict(dict)
        n_metrics = 0
        mo_id = self.object.bi_id
        ts_cache = {}  # timestamp -> (date, ts)
        #
        for m in result:
            path = m.path
            cfg = self.id_metrics.get(m.id)
            if m.type in MT_COUNTER_DELTA:
                # Counter type
                if path:
                    key = "%x|%s" % (
                        cfg.metric_type.bi_id,
                        "|".join(str(p) for p in path)
                    )
                else:
                    key = "%x" % cfg.metric_type.bi_id
                # Restore old value and save new
                r = counters.get(key)
                counters[key] = (m.ts, m.value)
                if r is None:
                    # No stored state
                    self.logger.debug(
                        "[%s] COUNTER value is not found. "
                        "Storing and waiting for a new result",
                        m.label
                    )
                    continue
                # Calculate counter
                self.logger.debug(
                    "[%s] Old value: %s@%s, new value: %s@%s.",
                    m.label, r[1], r[0], m.value, m.ts
                )
                if m.type == MT_COUNTER:
                    cv = self.convert_counter(m, r)
                else:
                    cv = self.convert_delta(m, r)
                if cv is None:
                    # Counter stepback or other errors
                    # Remove broken value
                    self.logger.debug(
                        "[%s] Counter stepback from %s@%s to %s@%s: Skipping",
                        m.label, r[1], r[0], m.value, m.ts
                    )
                    del counters[key]
                    continue
                m.value = cv
                m.abs_value = cv * m.scale
            elif m.type == MT_BOOL:
                # Convert boolean type
                m.abs_value = 1 if m.value else 0
            else:
                # Gauge
                m.abs_value = m.value * m.scale

            self.logger.debug(
                "[%s] Measured value: %s. Scale: %s. Resulting value: %s",
                m.label, m.value, m.scale, m.abs_value
            )
            # Schedule to store
            if cfg.is_stored:
                tsc = ts_cache.get(m.ts)
                if not tsc:
                    lt = time.localtime(m.ts // 1000000000)
                    tsc = (
                        time.strftime("%Y-%m-%d", lt),
                        time.strftime("%Y-%m-%d %H:%M:%S", lt)
                    )
                    ts_cache[m.ts] = tsc
                if path:
                    pk = "%s\t%s\t%d\t%s" % (
                        tsc[0], tsc[1], mo_id,
                        self.quote_path(path)
                    )
                    table = "%s.date.ts.managed_object.path" % cfg.metric_type.scope.table_name
                else:
                    pk = "%s\t%s\t%d" % (tsc[0], tsc[1], mo_id)
                    table = "%s.date.ts.managed_object" % cfg.metric_type.scope.table_name
                field = cfg.metric_type.field_name
                try:
                    data[table, pk][field] = cfg.metric_type.clean_value(m.abs_value)
                except ValueError as e:
                    self.logger.info(
                        "[%s] Cannot clean value %s: %s",
                        m.label, m.abs_value, e
                    )
                    continue
                n_metrics += 1
            if cfg.process_thresholds and m.abs_value is not None:
                alarms += self.process_thresholds(m, cfg)
        return n_metrics, data, alarms

    def handler(self):
        self.logger.info("Collecting metrics")
        # Build get_metrics input parameters
        metrics = self.get_object_metrics()
        metrics += self.get_interface_metrics()
        metrics += self.get_sla_metrics()
        if not metrics:
            self.logger.info("No metrics configured. Skipping")
            return
        # Collect metrics
        self.logger.debug("Collecting metrics: %s", metrics)

        result = [
            MData(**r)
            for r in self.object.scripts.get_metrics(metrics=metrics)
        ]
        if not result:
            self.logger.info("No metrics found")
            return
        # Process results
        n_metrics, data, alarms = self.process_result(result)
        # Send metrics
        if n_metrics:
            self.logger.info("Spooling %d metrics", n_metrics)
            self.send_metrics(data)
        # Set up threshold alarms
        self.logger.info("%d alarms detected", len(alarms))
        self.job.update_umbrella(
            self.AC_PM_THRESHOLDS,
            alarms
        )

    def convert_delta(self, m, r):
        """
        Calculate value from delta, gently handling overflows
        :param m: MData
        :param r: Old state (ts, value)
        """
        if m.value < r[1]:
            # Counter decreased, either due to wrap or stepback
            if r[1] <= MAX31:
                mc = MAX31
            elif r[1] <= MAX32:
                mc = MAX32
            else:
                mc = MAX64
            # Direct distance
            d_direct = r[1] - m.value
            # Wrap distance
            d_wrap = m.value + (mc - r[1])
            if d_direct < d_wrap:
                # Possible counter stepback
                # Skip value
                self.logger.debug(
                    "[%s] Counter stepback: %s -> %s",
                    m.label, r[1], m.value
                )
                return None
            else:
                # Counter wrap
                self.logger.debug(
                    "[%s] Counter wrap: %s -> %s",
                    m.label, r[1], m.value
                )
                return d_wrap
        else:
            return m.value - r[1]

    def convert_counter(self, m, r):
        """
        Calculate value from counter, gently handling overflows
        :param m: MData
        :param r: Old state (ts, value)
        """
        dt = (float(m.ts) - float(r[0])) / NS
        delta = self.convert_delta(m, r)
        if delta is None:
            return delta
        return float(delta) / dt

    def get_window_function(self, m, cfg):
        """
        Apply the configured window function to the metric measurement
        :param m: MData with metric result
        :param cfg: MetricConfig
        :return: Window value or None if it cannot be calculated
        """
        # Build window state key
        if m.path:
            key = "%x|%s" % (
                cfg.metric_type.bi_id,
                "|".join(str(p) for p in m.path)
            )
        else:
            key = "%x" % cfg.metric_type.bi_id
        #
        states = self.job.context["metric_windows"]
        value = m.abs_value
        ts = m.ts // 1000000000
        # Do not store single-value windows
        drop_window = cfg.window_type == "m" and cfg.window == 1
        # Restore window
        if drop_window:
            window = [(ts, value)]
            window_full = True
            if key in states:
                del states[key]
        else:
            window = states.get(key, [])
            window += [(ts, value)]
            # Trim window according to policy
            if cfg.window_type == WT_MEASURES:
                # Leave fixed amount of measures
                window = window[-cfg.window:]
                window_full = len(window) == cfg.window
            elif cfg.window_type == WT_TIME:
                # Time-based window
                window_full = ts - window[0][0] >= cfg.window
                while ts - window[0][0] > cfg.window:
                    window.pop(0)
            else:
                self.logger.error(
                    "Cannot calculate thresholds for %s (%s): Invalid window type '%s'",
                    m.metric, m.path, cfg.window_type
                )
                return None
            # Store back to context
            states[key] = window
        if not window_full:
            self.logger.error(
                "Cannot calculate thresholds for %s (%s): Window is not filled",
                m.metric, m.path
            )
            return None
        # Process window function
        wf = get_window_function(cfg.window_function)
        if not wf:
            self.logger.error(
                "Cannot calculate thresholds for %s (%s): Invalid window function %s",
                m.metric, m.path, cfg.window_function
            )
            return None
        try:
            return wf(window, cfg.window_config)
        except ValueError as e:
            self.logger.error(
                "Cannot calculate thresholds for %s (%s): %s",
                m.metric, m.path, e
            )
            return None

    def process_thresholds(self, m, cfg):
        """
        Check thresholds
        :param m: MData with metric result
        :param cfg: MetricConfig
        :return: List of umbrella alarm details
        """
        w_value = self.get_window_function(m, cfg)
        alarms = []
        if w_value is None:
            return alarms
        # Check thresholds
        path = m.metric
        if m.path:
            path += " | ".join(m.path)
        alarm_cfg = None
        if cfg.low_error is not None and w_value <= cfg.low_error:
            alarm_cfg = {
                "alarm_class": self.AC_PM_LOW_ERROR,
                "path": path,
                "severity": cfg.low_error_severity,
                "vars": {
                    "path": path,
                    "metric": m.metric,
                    "value": w_value,
                    "threshold": cfg.low_error,
                    "window_type": cfg.window_type,
                    "window": cfg.window,
                    "window_function": cfg.window_function
                }
            }
        elif cfg.low_warn is not None and w_value <= cfg.low_warn:
            alarm_cfg = {
                "alarm_class": self.AC_PM_LOW_WARN,
                "path": path,
                "severity": cfg.low_warn_severity,
                "vars": {
                    "path": path,
                    "metric": m.metric,
                    "value": w_value,
                    "threshold": cfg.low_warn,
                    "window_type": cfg.window_type,
                    "window": cfg.window,
                    "window_function": cfg.window_function
                }
            }
        elif cfg.high_error is not None and w_value >= cfg.high_error:
            alarm_cfg = {
                "alarm_class": self.AC_PM_HIGH_ERROR,
                "path": path,
                "severity": cfg.high_error_severity,
                "vars": {
                    "path": path,
                    "metric": m.metric,
                    "value": w_value,
                    "threshold": cfg.high_error,
                    "window_type": cfg.window_type,
                    "window": cfg.window,
                    "window_function": cfg.window_function
                }
            }
        elif cfg.high_warn is not None and w_value >= cfg.high_warn:
            alarm_cfg = {
                "alarm_class": self.AC_PM_HIGH_WARN,
                "path": path,
                "severity": cfg.high_warn_severity,
                "vars": {
                    "path": path,
                    "metric": m.metric,
                    "value": w_value,
                    "threshold": cfg.high_warn,
                    "window_type": cfg.window_type,
                    "window": cfg.window,
                    "window_function": cfg.window_function
                }
            }
        if alarm_cfg is not None:
            alarms += [alarm_cfg]
            # Apply umbrella filter handler
            if cfg.threshold_profile and cfg.threshold_profile.umbrella_filter_handler:
                try:
                    handler = get_handler(cfg.threshold_profile.umbrella_filter_handler)
                    if handler:
                        alarms = [handler(self, a) for a in alarms]
                        # Remove filtered alarms
                        alarms = [a for a in alarms if a]
                except Exception as e:
                    self.logger.error("Exception when loading handler %s", e)
        return alarms

    def send_metrics(self, data):
        """
        Convert collected metrics to Service.register_metric format
        :param data: (table fields, pk) -> field -> value
        :return:
        """
        # Normalized data
        # fields -> records
        chains = defaultdict(list)
        # Normalize data
        for (fields, pk), values in six.iteritems(data):
            # Sorted list of fields
            f = sorted(values)
            record_fields = "%s.%s" % (fields, ".".join(f))
            if isinstance(record_fields, six.text_type):
                record_fields = record_fields.encode("utf-8")
            record = "%s\t%s" % (pk, "\t".join(str(values[fn]) for fn in f))
            if isinstance(record, six.text_type):
                record = record.encode("utf-8")
            chains[record_fields] += [
                record
            ]
        # Spool data
        for f in chains:
            self.service.register_metrics(f, chains[f])
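
To make the counter wrap handling in convert_delta() and convert_counter() above concrete, a small standalone sketch with explicit numbers (MAX32 here mirrors the 32-bit counter limit the class assumes):

# Standalone illustration of the wrap branch in convert_delta():
# the counter went from 4294967000 to 200, so the wrap distance wins.
MAX32 = 0xFFFFFFFF
old, new = 4294967000, 200
d_direct = old - new          # 4294966800: distance if the counter stepped back
d_wrap = new + (MAX32 - old)  # 495: distance if the counter wrapped
delta = None if d_direct < d_wrap else d_wrap
assert delta == 495
# convert_counter() would then divide delta by the elapsed time in seconds.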