def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'postgres_repl_%s' % datetime.utcnow().strftime('%Y%m')
    field = 'data'
    res = db[coll_name].find_one(
        {
            '$or': [
                {'host': self.server},
                {'data.slaves': {'$elemMatch': {'host': self.ec2_public_hostname}}},
            ],
        },
        sort=[('ts', pymongo.DESCENDING)],
        fields=[field, 'host', 'ts'],
    )
    assert (datetime.utcnow() - res['ts']).seconds < 60, "stale data! is arke running?"

    def calc_offset(data):
        pieces = data.split('/')
        return (int('ffffffff', 16) * int(pieces[0], 16) + int(pieces[1], 16))

    if self.server == res['host']:
        self.primary = True
        self.receive_delay = 0
        self.replay_delay = 0
    else:
        self.primary = False
        master_num = calc_offset(res['data']['master'])
        slaves = res['data']['slaves']
        me = None
        for slave in slaves:
            if slave['host'] == self.ec2_public_hostname:
                me = slave
                break
        self.receive_delay = master_num - calc_offset(me['r'])
        self.replay_delay = master_num - calc_offset(me['p'])

    self.measures = [
        nagiosplugin.Measure('postgres_receive_delay', self.receive_delay,
                             warning=self.warning, critical=self.critical),
        nagiosplugin.Measure('postgres_replay_delay', self.replay_delay,
                             warning=self.warning, critical=self.critical),
    ]
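# Note (assumption): the stored 'master', 'r' (receive), and 'p' (replay) values
# appear to be textual Postgres WAL locations of the form 'hi/lo' in hexadecimal,
# e.g. '16/B374D848', which is why calc_offset() splits on '/' and parses both
# halves as hex:
#
#   calc_offset('16/B374D848') == int('16', 16) * 0xffffffff + int('B374D848', 16)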
def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'ssh_hello_%s' % datetime.utcnow().strftime('%Y%m')
    found = db[coll_name].find_one(
        sort=[('_id', pymongo.DESCENDING)],
        skip=1,  # the latest result set is probably still receiving results.
    )
    assert (datetime.utcnow() - found['_id']).seconds < 60, "stale data! is arke running?"

    results = [x for x in found['data'] if x['to'] == self.server]
    assert len(results) > 0, "no results!"
    assert len(results) > self.minimum, "not enough results! only found %i results." % len(results)

    if all((x['lag'] == -1 for x in results)):
        avg_lag = -1
    else:
        avg_lag = sum((x['lag'] for x in results if x['lag'] >= 0)) / len(results)

    self.lag = avg_lag
    self.measures = [
        nagiosplugin.Measure('ssh_lag', self.lag,
                             warning=self.warning, critical=self.critical)
    ]
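# Assumed shape of an ssh_hello_* document, inferred from the lookups above;
# only the fields this check reads are shown, and the values are illustrative:
#
#   {
#       '_id': datetime(2012, 6, 1, 12, 0, 30),      # timestamp of the result set
#       'data': [
#           {'to': 'web01.example.com', 'lag': 0.12},
#           {'to': 'web01.example.com', 'lag': -1},  # -1 seems to mark a failed probe
#       ],
#   }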
class Symfony2Check(nagiosplugin.Check):

    name = "Symfony2 health check"
    version = "1.0"

    def __init__(self, optparser, logger):
        self.log = logger
        optparser.description = 'Health check of Symfony2 application'
        optparser.version = '1.0'
        optparser.add_option('-w', '--warning', default='1', metavar='RANGE',
                             help='warning threshold (default: %default%)')
        optparser.add_option('-c', '--critical', default='1', metavar='RANGE',
                             help='critical threshold (default: %default%)')
        optparser.add_option('-u', '--url', help='Url to check')
        optparser.add_option('-a', '--auth', help='Authentication', default=None)

    def process_args(self, options, args):
        self.warning = options.warning.rstrip('%')
        self.critical = options.critical.rstrip('%')
        if not options.url:
            raise Exception("Missing url option")
        self.url = options.url.strip() + "/monitor/health/run"
        self.hostUrl = options.url.strip()
        if options.auth is not None:
            self.username, self.password = options.auth.split(":")
        else:
            self.username = None
            self.password = None

    def obtain_data(self):
        self.badChecks = []
        try:
            content = self.fetch(self.url)
            json = simplejson.loads(content)
            if json['globalStatus'] != 'OK':
                self.badChecks = []
                for check in json['checks']:
                    if check['status']:
                        self.badChecks.append(check["checkName"])
        except Exception, e:
            self.log.warn("Could not connect to url: " + self.url + " res:" + str(e))
            self.badChecks.append("config_error_in_nagios")

        self.measures = [
            nagiosplugin.Measure("Num_failed_checks", len(self.badChecks),
                                 warning=self.warning, critical=self.critical,
                                 minimum=0)
        ]
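# The parser in obtain_data() above assumes the /monitor/health/run endpoint
# returns JSON of roughly this shape (field names taken from the code; the
# values, including the polarity of 'status', are hypothetical):
#
#   {
#       "globalStatus": "KO",
#       "checks": [
#           {"checkName": "database", "status": true},
#           {"checkName": "cache", "status": false}
#       ]
#   }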
def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'ssh_hello_%s' % datetime.utcnow().strftime('%Y%m')
    found = db[coll_name].find_one(
        sort=[('_id', pymongo.DESCENDING)],
        skip=1,  # the latest result set is probably still receiving results.
    )
    #assert (datetime.utcnow() - found['_id']).seconds < 60, "stale data! is arke running?"

    coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
    res = db[coll_name].find_one({'host': self.server},
                                 sort=[('ts', pymongo.DESCENDING)],
                                 fields=['ts'])
    #assert (datetime.utcnow() - res['ts']).seconds < 60, "stale data! is arke running?"
    #assert ((datetime.utcnow() - res['ts']).seconds < 60) or ((datetime.utcnow() - found['_id']).seconds < 60), "stale data! is arke running?"

    self.alive = sent_data_recently = (datetime.utcnow() - res['ts']).seconds < 60
    ssh_data_fresh = (datetime.utcnow() - found['_id']).seconds < 60

    results = [x for x in found['data'] if x['to'] == self.server]

    if ssh_data_fresh and results:
        if all((x['lag'] == -1 for x in results)):
            avg_lag = -1
        else:
            avg_lag = sum((x['lag'] for x in results if x['lag'] >= 0)) / len(results)

        self.lag = avg_lag
        self.measures = [
            nagiosplugin.Measure('alive-ssh_lag', self.lag,
                                 warning=self.warning, critical=self.critical)
        ]
    elif sent_data_recently:
        self.measures = [
            nagiosplugin.Measure('alive', int(self.alive), critical=0)
        ]
def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
    field = 'data.processes'
    result = db[coll_name].find_one({'host': self.server},
                                    sort=[('ts', pymongo.DESCENDING)],
                                    fields=[field, 'ts'])
    assert (datetime.utcnow() - result['ts']).seconds < 60, "stale data! is arke running?"

    def is_listening(connections, port):
        return any((x for x in connections
                    if x['status'] == u'LISTEN' and x['local_address'][1] == port))

    parent_pids = []
    processes = []
    for properties in result['data']['processes']:
        pid = properties['pid']
        if self.search_obj.search(properties['cmdline']) and \
           properties['status'] not in ('zombie', 'dead', 'stopped', 'tracing stop'):
            if self.listening_ports:
                if getattr(__builtins__, self.port_match)((
                        is_listening(properties['connections'], p)
                        for p in self.listening_ports)):
                    parent_pids.append(pid)
                    processes.append((
                        pid,
                        properties['name'],
                        properties['cmdline'],
                    ))
            else:
                processes.append((
                    pid,
                    properties['name'],
                    properties['cmdline'],
                ))

    if parent_pids:
        parent_pids = map(int, parent_pids)
        for properties in result['data']['processes']:
            if properties['ppid'] in parent_pids:
                processes.append((
                    properties['pid'],
                    properties['name'],
                    properties['cmdline'],
                ))

    self.processes = processes
    self.found_count = len(processes)
    self.measures = [nagiosplugin.Measure(
        'processes_found', self.found_count,
        warning=self.warning, critical=self.critical)]
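# Note on the getattr(__builtins__, self.port_match) call above: self.port_match
# is looked up by name among the builtins and applied to an iterable of booleans,
# so in practice it is presumably 'any' or 'all' (an assumption; the option
# parsing is not shown in this snippet). A more explicit equivalent sketch:
#
#   port_match_fn = {'any': any, 'all': all}[self.port_match]
#   if port_match_fn(is_listening(properties['connections'], p)
#                    for p in self.listening_ports):
#       parent_pids.append(pid)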
def obtain_data(self):
    sys.argv[:] = sys.argv[:1]
    self.runner = zope.testrunner.runner.Runner(self.testrunner_argv)
    old_stdout = sys.stdout
    sys.stdout = redirected_stdout = StringIO.StringIO()
    try:
        self.runner.run()
    finally:
        sys.stdout = old_stdout
    if self.runner.failed:
        log = tempfile.mktemp('.log', 'check_testrunner.')
        redirected_stdout.seek(0)
        open(log, 'w').write(redirected_stdout.getvalue())
        self.logger.error("Test runner output logged to %s" % log)
    self.measures = [
        nagiosplugin.Measure(u'run', float(self.runner.ran)),
        nagiosplugin.Measure(u'errors', float(len(self.runner.errors)),
                             critical='0:0'),
        nagiosplugin.Measure(u'failures', float(len(self.runner.failures)),
                             critical='0:0')
    ]
def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
    field = 'data.fs.%s.percent' % self.filesystem
    res = db[coll_name].find_one({'host': self.server},
                                 sort=[('ts', pymongo.DESCENDING)],
                                 fields=[field, 'ts'])
    assert (datetime.utcnow() - res['ts']).seconds < 60, "stale data! is arke running?"

    fs_perc = res['data']['fs'][self.filesystem]['percent']

    self.usage = fs_perc
    self.measures = [
        nagiosplugin.Measure('/', self.usage, '%', self.warning, self.critical, 0, 100)
    ]
def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
    field = 'data.fs'
    res = db[coll_name].find_one({'host': self.server},
                                 sort=[('ts', pymongo.DESCENDING)],
                                 fields=[field, 'ts'])
    assert (datetime.utcnow() - res['ts']).seconds < 60, "stale data! is arke running?"

    self.usages = {}
    self.measures = []
    for fs in res['data']['fs']:
        percent_used = res['data']['fs'][fs]['percent']
        self.usages[fs] = percent_used
        self.measures.append(nagiosplugin.Measure(
            fs, percent_used, '%', self.warning, self.critical, 0, 100))
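# Assumed shape of the data.fs sub-document used by the two filesystem checks
# above (mount points and percentages are illustrative):
#
#   {
#       'data': {
#           'fs': {
#               '/':     {'percent': 81.0},
#               '/data': {'percent': 42.5},
#           },
#       },
#   }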
def obtain_data(self):
    mc = self.master_conn.cursor()
    mc.execute('SELECT pg_current_xlog_location()')
    master_loc = xlog_to_bytes(mc.fetchone()[0])
    self.master_conn.commit()
    self.master_conn.close()

    sc = self.slave_conn.cursor()
    sc.execute('SELECT pg_last_xlog_replay_location()')
    slave_loc = xlog_to_bytes(sc.fetchone()[0])
    self.slave_conn.commit()
    self.slave_conn.close()

    self.lag = (master_loc - slave_loc) / 1024
    self.measures = [
        nagiosplugin.Measure('lag', self.lag, 'kB', self.warning, self.critical)
    ]
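# xlog_to_bytes() is used above but not defined in this snippet. A minimal sketch,
# assuming it flattens a textual Postgres WAL location ('hi/lo' in hex, as returned
# by pg_current_xlog_location()) into a single comparable offset, mirroring the
# calc_offset() helper in the postgres_repl check above:

def xlog_to_bytes(xlog_location):
    # e.g. '16/B374D848' -> 0x16 * 0xffffffff + 0xB374D848
    hi, lo = xlog_location.split('/')
    return int(hi, 16) * int('ffffffff', 16) + int(lo, 16)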
def obtain_data(self):
    db = pymongo.Connection(self.db_server).clio
    coll_name = 'mongodb_%s' % datetime.utcnow().strftime('%Y%m')
    field = 'data.repl_status'
    res = db[coll_name].find_one(
        {'host': self.server},
        sort=[('ts', pymongo.DESCENDING)],
        fields=[field, 'ts'],
    )
    assert (datetime.utcnow() - res['ts']).seconds < 60, "stale data! is arke running?"

    if res['data']['repl_status'] is None:
        self.primary = None
        self.repl_lag = 0
    else:
        members = res['data']['repl_status']['members']
        primary = None
        me = None
        for member in members:
            if member.get('self', False):
                me = member
            if member.get('state', None) == 1:
                primary = member

        if primary is me:
            self.primary = True
            self.repl_lag = 0
        else:
            self.primary = False
            self.repl_lag = max(0, primary['optime']['t'] - me['optime']['t'])
            #assert primary['optime']['t'] >= me['optime']['t'], "optime of master is less than the slave. the hell?\n%s" % pformat(res)

    self.measures = [
        nagiosplugin.Measure('mongodb_repl_lag', self.repl_lag,
                             warning=self.warning, critical=self.critical)
    ]
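# Assumed shape of data.repl_status, mirroring what mongod's replSetGetStatus
# reports; only the fields read above are shown, and the values are illustrative:
#
#   {
#       'members': [
#           {'self': True,  'state': 2, 'optime': {'t': 1325376000}},
#           {'self': False, 'state': 1, 'optime': {'t': 1325376004}},  # state 1 == primary
#       ],
#   }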