def dispatch(self, host, names):
    """Decorate newly seen metric targets for `host` and notify core.

    Looks up the machine by uuid, decorates each raw stat name into a
    metric dict, drops metrics whose plugin is in `self.ignore_plugins`,
    POSTs the remainder to core's /new_metrics endpoint and, best-effort,
    appends the raw names to the local metrics file.

    Args:
        host: machine uuid the metrics belong to.
        names: iterable of raw stat names reported for the host.
    """
    machine = get_machine_from_uuid(host)
    if not machine:
        log.error("machine not found, wtf!")
        return
    multihandler = MultiHandler(host)
    head_prefix = "%(head)s."
    metrics = []
    for name in names:
        target = statname(host, name)
        metric = multihandler.decorate_target(target)
        # Strip the head placeholder so the alias is relative to the host.
        # (Original used `rfind(...) == 0` with a magic `[9:]` slice; a
        # startswith/len pair states the intent and can't drift out of
        # sync with the prefix string.)
        if metric['alias'].startswith(head_prefix):
            metric['alias'] = metric['alias'][len(head_prefix):]
        plugin = metric['alias'].split('.')[0]
        if plugin not in self.ignore_plugins:
            metrics.append(metric)
    if not metrics:
        return
    log.info("New metrics for host %s, notifying core: %s", host, metrics)
    payload = {
        'uuid': host,
        'collectd_password': machine.collectd_password,
        'metrics': metrics,
    }
    try:
        resp = requests.post(
            "%s/new_metrics" % mon_config.CORE_URI,
            data=json.dumps(payload),
            verify=mon_config.SSL_VERIFY
        )
    except Exception as exc:
        log.error("Error notifying core: %r", exc)
        return
    if not resp.ok:
        log.error("Bad response from core: %s", resp.text)
    # also save to file in disk (best-effort: failures are only logged)
    if self.fh is not None:
        try:
            for name in names:
                self.fh.write("%s %s\n" % (host, name))
            self.fh.flush()
        except IOError as exc:
            log.error("Error writing to metrics file: %s", exc)
def check_machine(machine, rule_id=''):
    """Check all conditions for given machine with a single graphite query.

    If `rule_id` is specified, only that rule will be checked.
    """
    # Map legacy metric shortcuts to their full graphite target paths.
    old_targets = {
        'cpu': 'cpu.total.nonidle',
        'load': 'load.shortterm',
        'ram': 'memory.nonfree_percent',
        'disk-read': 'disk.total.disk_octets.read',
        'disk-write': 'disk.total.disk_octets.write',
        'network-rx': 'interface.total.if_octets.rx',
        'network-tx': 'interface.total.if_octets.tx',
    }
    handler = MultiHandler(machine.uuid)
    # check if machine activated
    if not machine.activated:
        if handler.check_head():
            log.info("%s just got activated after %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            with machine.lock_n_load():
                machine.activated = True
                machine.save()
            # Give conditions a grace period before they may trigger.
            # NOTE: use a distinct loop variable (`rid`) so the `rule_id`
            # argument isn't clobbered — it is read again below when
            # selecting which rules to check.
            for rid in machine.rules:
                condition = machine.get_condition(rid)
                condition.active_after = time() + 30
                condition.save()
        else:
            log.info("%s not activated since %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            return
    # gather all conditions, grouped by graphite target
    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rid in rules:
        lbl = "%s/%s" % (machine.uuid, rid)
        try:
            condition = machine.get_condition(rid)
        except ConditionNotFoundError:
            log.warning("%s condition not found, probably rule just got "
                        "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        if condition.operator not in ('gt', 'lt'):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = 'all'
            condition.save()
        if condition.aggregate not in ('all', 'any', 'avg'):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        conditions.setdefault(target, []).append(condition)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return
    try:
        data = handler.get_data(conditions.keys(), start='-90sec')
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return
    # check all conditions against the returned datapoints
    for item in data:
        target = item['_requested_target']
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s",
                        machine.uuid, target)
            continue
        datapoints = [(val, ts) for val, ts in item['datapoints']
                      if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule",
                            machine.uuid, condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)
    # any targets left in `conditions` got no data back at all
    if conditions:
        for target in conditions:
            for cond in conditions[target]:
                if target == "nodata":
                    # if nodata rule didn't return any datapoints, the whisper
                    # files must be missing, so make the rule true
                    check_condition(cond, [(1, 0)])
                else:
                    log.warning("%s/%s [%s] target not found for rule",
                                machine.uuid, cond.rule_id, cond)
def check_machine(machine, rule_id=""):
    """Check all conditions for given machine with a single graphite query.

    If `rule_id` is specified, only that rule will be checked.
    """
    # Map legacy metric shortcuts to their full graphite target paths.
    old_targets = {
        "cpu": "cpu.total.nonidle",
        "load": "load.shortterm",
        "ram": "memory.nonfree_percent",
        "disk-read": "disk.total.disk_octets.read",
        "disk-write": "disk.total.disk_octets.write",
        "network-rx": "interface.total.if_octets.rx",
        "network-tx": "interface.total.if_octets.tx",
    }
    handler = MultiHandler(machine.uuid)
    # check if machine activated
    if not machine.activated:
        if handler.check_head():
            log.info("%s just got activated after %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            with machine.lock_n_load():
                machine.activated = True
                machine.save()
            # Give conditions a grace period before they may trigger.
            # NOTE: use a distinct loop variable (`rid`) so the `rule_id`
            # argument isn't clobbered — it is read again below when
            # selecting which rules to check.
            for rid in machine.rules:
                condition = machine.get_condition(rid)
                condition.active_after = time() + 30
                condition.save()
        else:
            log.info("%s not activated since %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            return
    # gather all conditions, grouped by graphite target
    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rid in rules:
        lbl = "%s/%s" % (machine.uuid, rid)
        try:
            condition = machine.get_condition(rid)
        except ConditionNotFoundError:
            log.warning("%s condition not found, probably rule just got "
                        "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        if condition.operator not in ("gt", "lt"):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = "all"
            condition.save()
        if condition.aggregate not in ("all", "any", "avg"):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        conditions.setdefault(target, []).append(condition)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return
    try:
        data = handler.get_data(conditions.keys(), start="-90sec")
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return
    # check all conditions against the returned datapoints
    for item in data:
        target = item["_requested_target"]
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s",
                        machine.uuid, target)
            continue
        datapoints = [(val, ts) for val, ts in item["datapoints"]
                      if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule",
                            machine.uuid, condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)
    # any targets left in `conditions` got no data back at all
    if conditions:
        for target in conditions:
            for cond in conditions[target]:
                if target == "nodata":
                    # if nodata rule didn't return any datapoints, the whisper
                    # files must be missing, so make the rule true
                    check_condition(cond, [(1, 0)])
                else:
                    log.warning("%s/%s [%s] target not found for rule",
                                machine.uuid, cond.rule_id, cond)