Example #1
# Stdlib imports assumed by this snippet; compute, tdelta_to_str, notify_core,
# config and log are provided by the surrounding project (not shown here).
import uuid
from time import time


def check_condition(condition, datapoints):

    lbl = "%s:%s [%s]" % (condition.uuid, condition.rule_id, condition)

    # extract value from series and apply operator
    triggered, value = compute(condition.operator, condition.aggregate,
                               [val for val, timestamp in datapoints],
                               condition.value)

    # condition state changed
    if triggered != condition.state:
        condition.state = triggered
        condition.state_since = time()
        # if condition untriggered and no trigger notification previously sent,
        # set level to 1 so that we don't send OK to core (in case condition
        # uses custom reminder list where first notification happens later).
        if not triggered and condition.notification_level == 0:
            condition.notification_level = 1
        else:
            condition.notification_level = 0
        if triggered:
            # if condition just got triggered, issue a new incident_id
            condition.incident_id = uuid.uuid4().hex
        condition.save()

    # logs are gooood
    since_str = "always"
    if condition.state_since:
        since_str = tdelta_to_str(time() - condition.state_since)
        if since_str:
            since_str += " ago"
        else:
            since_str = "just now"
    msg = "%s is %s since %s (value=%s, level=%d)" % (
        lbl, condition.state, since_str, value, condition.notification_level)

    # notify core if necessary
    reminder_list = condition.reminder_list or config.REMINDER_LIST
    if condition.state and len(reminder_list) > condition.notification_level:
        duration = time() - condition.state_since
        next_notification = reminder_list[condition.notification_level]
        next_notification += condition.reminder_offset
        if duration < next_notification:
            log.info(msg)
            return
        try:
            notify_core(condition, value)
        except Exception as exc:
            # don't advance notification level if notification failed
            log.error("%s - FAILED to send WARNING: %r", msg, exc)
            return
        log.info("%s - sent WARNING", msg)
        condition.notification_level += 1
        condition.save()
    elif not condition.state and not condition.notification_level:
        try:
            notify_core(condition, value)
        except Exception as exc:
            # don't advance notification level if notification failed
            log.error("%s - FAILED to send OK: %r", msg, exc)
            return
        log.info("%s - sent OK", msg)
        condition.notification_level = 1
        condition.save()
    else:
        log.info(msg)
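
The compute() helper used above is not part of this listing. Below is a minimal, hedged sketch of what such a helper could look like, assuming only the 'gt'/'lt' operators and 'all'/'any'/'avg' aggregates that check_machine() in Example #2 validates; the project's real implementation may differ.

# Hedged sketch only, NOT the project's actual compute() implementation.
def compute(operator, aggregate, values, threshold):
    """Return (triggered, value) for a list of datapoint values."""
    if not values:
        raise ValueError("no datapoints to evaluate")
    if aggregate == 'avg':
        # reduce the series to its average before comparing
        values = [sum(values) / float(len(values))]
    if operator == 'gt':
        results = [val > threshold for val in values]
    elif operator == 'lt':
        results = [val < threshold for val in values]
    else:
        raise ValueError("unknown operator '%s'" % operator)
    # 'any' triggers if one datapoint matches, 'all' (and the single averaged
    # value) only if every compared value matches
    triggered = any(results) if aggregate == 'any' else all(results)
    # report the last value that was compared against the threshold
    return triggered, values[-1]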
Example #2
# Stdlib import assumed by this snippet; MultiHandler, ConditionNotFoundError,
# GraphiteError, tdelta_to_str, log and check_condition come from the
# surrounding project (not shown here).
from time import time


def check_machine(machine, rule_id=''):
    """Check all conditions for given machine with a single graphite query.

    If rule_id is specified, only that rule will be checked.

    """

    old_targets = {
        'cpu': 'cpu.total.nonidle',
        'load': 'load.shortterm',
        'ram': 'memory.nonfree_percent',
        'disk-read': 'disk.total.disk_octets.read',
        'disk-write': 'disk.total.disk_octets.write',
        'network-rx': 'interface.total.if_octets.rx',
        'network-tx': 'interface.total.if_octets.tx',
    }

    handler = MultiHandler(machine.uuid)

    # check if machine activated
    if not machine.activated:
        if handler.check_head():
            log.info("%s just got activated after %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            with machine.lock_n_load():
                machine.activated = True
                machine.save()
                for rule_id in machine.rules:
                    condition = machine.get_condition(rule_id)
                    condition.active_after = time() + 30
                    condition.save()
        else:
            log.info("%s not activated since %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
        return

    # gather all conditions
    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rule_id in rules:
        lbl = "%s/%s" % (machine.uuid, rule_id)
        try:
            condition = machine.get_condition(rule_id)
        except ConditionNotFoundError:
            log.warning(
                "%s condition not found, probably rule just got "
                "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        ## if "%(head)s." not in target:
        ## target = "%(head)s." + target
        if condition.operator not in ('gt', 'lt'):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = 'all'
            condition.save()
        if condition.aggregate not in ('all', 'any', 'avg'):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        if target not in conditions:
            conditions[target] = [condition]
        else:
            conditions[target].append(condition)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return

    try:
        data = handler.get_data(conditions.keys(), start='-90sec')
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return

    # check all conditions
    for item in data:
        target = item['_requested_target']
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s",
                        machine.uuid, target)
            continue
        datapoints = [(val, ts) for val, ts in item['datapoints']
                      if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule", machine.uuid,
                            condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)

    if conditions:
        for target in conditions:
            for cond in conditions[target]:
                if target == "nodata":
                    # if nodata rule didn't return any datapoints, the whisper
                    # files must be missing, so make the rule true
                    check_condition(cond, [(1, 0)])
                else:
                    log.warning("%s/%s [%s] target not found for rule",
                                machine.uuid, cond.rule_id, cond)
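
For orientation, the "check all conditions" loop in check_machine() expects each item returned by handler.get_data() to carry the requested target plus (value, timestamp) datapoints. The snippet below is only an assumed illustration of that shape and of a single-rule invocation; the keys '_requested_target' and 'datapoints' come from the code above, everything else is made up.

# Assumed shape of one item returned by handler.get_data(), as consumed by
# the loop over data in check_machine() above.
sample_item = {
    '_requested_target': 'load.shortterm',
    'datapoints': [
        (0.42, 1500000000),   # (value, timestamp); None values are filtered out
        (None, 1500000010),
        (0.57, 1500000020),
    ],
}

# Checking a single rule for a machine object loaded elsewhere
# (identifiers are hypothetical):
# check_machine(machine, rule_id='5a1b2c3d')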