def check_schedulers(r, schedulers):
    """Report how long ago each scheduler last wrote its timestamp key.

    For every entry in ``schedulers`` the value stored under
    ``zmon:metrics:<scheduler>:ts`` is fetched through ``r.get`` and compared
    with the current time. Progress and results are reported through the
    ``action`` / ``highlight`` / ``ok`` / ``error`` helpers; nothing is
    returned.

    Args:
        r: Client object providing ``get(key)``.
        schedulers: Iterable of scheduler names. The first two characters of
            each name are dropped for display only (presumably a fixed
            prefix -- confirm against the caller).
    """
    for scheduler in schedulers:
        action('Check scheduler {} .....'.format(scheduler[2:]))
        try:
            raw_ts = r.get("zmon:metrics:{}:ts".format(scheduler))
            if raw_ts is None:
                error("No scheduling loop registered ( running/stuck? )")
                continue

            age = int(time.time() - float(raw_ts))
            action("... last loop")
            highlight("{}".format(age))
            action("s ago ...")

            # Check the larger threshold first so each age yields exactly
            # one verdict.
            if age > 300:
                error("Last loop more than 300s ago (stuck? restart?)")
            elif age > 180:
                error("Last loop more than 180s ago (stuck? check logs/watch)")
            else:
                action("...")
                ok()
        except Exception as e:
            # Any failure (lookup, conversion) is reported for this scheduler
            # and the loop moves on to the next one.
            error(e)
def check_queues(redis):
    """Report the length of each fixed ZMON task queue.

    Each queue's length is read with ``redis.llen`` and printed; a length
    below 2000 is reported as ok, anything else as an error. Nothing is
    returned.

    Args:
        redis: Client object providing ``llen(key)``.
    """
    queue_names = (
        'zmon:queue:default',
        'zmon:queue:snmp',
        'zmon:queue:internal',
        'zmon:queue:secure',
    )

    for queue_name in queue_names:
        action('Checking queue length ... {} ...'.format(queue_name))
        length = redis.llen(queue_name)
        action("...")
        highlight("{}".format(length))
        action(" ...")
        if length < 2000:
            ok()
        else:
            error("to many tasks")
def check_workers(r, workers):
    """Report how long ago each worker last wrote its timestamp key.

    For every entry in ``workers`` the value stored under
    ``zmon:metrics:<worker>:ts`` is fetched through ``r.get`` and compared
    with the current time. An age below 30 seconds is reported as ok,
    anything else as an error. Results are reported through the
    ``action`` / ``highlight`` / ``ok`` / ``error`` helpers; nothing is
    returned.

    Args:
        r: Client object providing ``get(key)``.
        workers: Iterable of worker names used to build the metrics key.
    """
    for w in workers:
        action('Check worker {} ...'.format(w))
        try:
            ts = r.get("zmon:metrics:{}:ts".format(w))
            if ts is None:
                # Without this guard float(None) raises TypeError and the
                # user only sees a cryptic conversion message instead of
                # the actual problem: no timestamp exists for this worker.
                error("No worker timestamp registered ( running/stuck? )")
                continue

            # Clamp to zero so a timestamp slightly in the future is not
            # shown as a negative age.
            delta = max(int(time.time() - float(ts)), 0)

            action("... last exec")
            highlight("{}".format(delta))
            action("s ago ...")
            if delta < 30:
                ok()
                continue

            error("no task execute recently")
        except Exception as e:
            # Any failure (lookup, conversion) is reported for this worker
            # and the loop moves on to the next one.
            error(e)