def obtain_data(self):
        """Collect PostgreSQL replication delays for ``self.server``.

        Reads the newest matching document from this month's
        ``postgres_repl_YYYYMM`` collection in the ``clio`` database and
        sets ``self.primary``, ``self.receive_delay``, ``self.replay_delay``
        and ``self.measures``.

        Raises:
            AssertionError: if no document is found, the newest document is
                60 seconds old or more, or this host is missing from the
                slave list.
        """
        db = pymongo.Connection(self.db_server).clio
        coll_name = 'postgres_repl_%s' % datetime.utcnow().strftime('%Y%m')
        # Match either the document written for this host itself, or a
        # document from another host that lists us among its slaves.
        res = db[coll_name].find_one(
            {
                '$or': [
                    {'host': self.server},
                    {
                        'data.slaves': {
                            '$elemMatch': {
                                'host': self.ec2_public_hostname,
                            }
                        }
                    },
                ],
            },
            sort=[('ts', pymongo.DESCENDING)],
            fields=['data', 'host', 'ts'],
        )

        assert res is not None, "no data found! is arke running?"

        # timedelta.seconds wraps around every 24 hours, so the day count
        # must be checked too; otherwise day-old data would look fresh.
        age = datetime.utcnow() - res['ts']
        assert age.days == 0 and age.seconds < 60, \
            "stale data! is arke running?"

        def calc_offset(data):
            # 'HI/LO' hex pair -> single integer: HI * 0xffffffff + LO.
            pieces = data.split('/')
            return (int('ffffffff', 16) * int(pieces[0], 16) +
                    int(pieces[1], 16))

        if self.server == res['host']:
            # The document was written for this host: treated as primary.
            self.primary = True
            self.receive_delay = 0
            self.replay_delay = 0
        else:
            self.primary = False
            master_num = calc_offset(res['data']['master'])
            me = None
            for slave in res['data']['slaves']:
                if slave['host'] == self.ec2_public_hostname:
                    me = slave
                    break

            assert me is not None, \
                "host %s not found in slave list" % self.ec2_public_hostname

            self.receive_delay = master_num - calc_offset(me['r'])
            self.replay_delay = master_num - calc_offset(me['p'])

        self.measures = [
            nagiosplugin.Measure('postgres_receive_delay',
                                 self.receive_delay,
                                 warning=self.warning,
                                 critical=self.critical),
            nagiosplugin.Measure('postgres_replay_delay',
                                 self.replay_delay,
                                 warning=self.warning,
                                 critical=self.critical),
        ]
예제 #2
0
    def obtain_data(self):
        """Compute the average ssh lag towards ``self.server``.

        Reads the second-newest document of this month's
        ``ssh_hello_YYYYMM`` collection in the ``clio`` database and sets
        ``self.lag`` and ``self.measures``.  The lag is -1 when every
        probe reported -1.

        Raises:
            AssertionError: if no document is found, the document is 60
                seconds old or more, or too few results target this server.
        """
        db = pymongo.Connection(self.db_server).clio
        # Use UTC for the collection name, matching the UTC staleness check
        # below; local time can pick the wrong month around a month boundary.
        coll_name = 'ssh_hello_%s' % datetime.utcnow().strftime('%Y%m')
        found = db[coll_name].find_one(
            sort=[('_id', pymongo.DESCENDING)],
            skip=1,  # the latest result set is probably still receiving results.
        )

        assert found is not None, "no data found! is arke running?"

        # timedelta.seconds wraps around every 24 hours, so the day count
        # must be checked too; otherwise day-old data would look fresh.
        age = datetime.utcnow() - found['_id']
        assert age.days == 0 and age.seconds < 60, \
            "stale data! is arke running?"

        results = [x for x in found['data'] if x['to'] == self.server]

        assert len(results) > 0, "no results!"
        assert len(
            results
        ) > self.minimum, "not enough results! only found %i results." % len(
            results)

        if all(x['lag'] == -1 for x in results):
            avg_lag = -1
        else:
            # Average over the probes that produced a lag only; dividing by
            # len(results) would let -1 probes drag the average towards 0.
            valid_lags = [x['lag'] for x in results if x['lag'] >= 0]
            avg_lag = sum(valid_lags) / len(valid_lags) if valid_lags else 0

        self.lag = avg_lag
        self.measures = [
            nagiosplugin.Measure('ssh_lag',
                                 self.lag,
                                 warning=self.warning,
                                 critical=self.critical)
        ]
예제 #3
0
class Symfony2Check(nagiosplugin.Check):
    """Nagios check counting the failed health checks of a Symfony2 app.

    Fetches ``<url>/monitor/health/run``, parses the JSON response and
    reports the number of failing checks as the ``Num_failed_checks``
    measure.
    """

    name = "Symfony2 health check"
    version = "1.0"

    def __init__(self, optparser, logger):
        """Register the command line options and keep the logger."""
        self.log = logger
        optparser.description = 'Health check of Symfony2 application'
        optparser.version = '1.0'
        optparser.add_option('-w',
                             '--warning',
                             default='1',
                             metavar='RANGE',
                             help='warning threshold (default: %default%)')
        optparser.add_option('-c',
                             '--critical',
                             default='1',
                             metavar='RANGE',
                             help='critical threshold (default: %default%)')
        optparser.add_option('-u', '--url', help='Url to check')
        optparser.add_option('-a',
                             '--auth',
                             help='Authentication',
                             default=None)

    def process_args(self, options, args):
        """Store thresholds, target URLs and optional credentials.

        Raises:
            Exception: if no ``--url`` was given.
            ValueError: if ``--auth`` contains no ``:`` separator.
        """
        # Thresholds may be written with a trailing percent sign.
        self.warning = options.warning.rstrip('%')
        self.critical = options.critical.rstrip('%')
        if not options.url:
            raise Exception("Missing url option")
        self.url = options.url.strip() + "/monitor/health/run"
        self.hostUrl = options.url.strip()
        if options.auth is not None:
            # Split on the first colon only so passwords may contain ':'.
            self.username, self.password = options.auth.split(":", 1)
        else:
            self.username = None
            # Always define password so later reads cannot hit AttributeError.
            self.password = None

    def obtain_data(self):
        """Fetch the health report and build ``self.measures``.

        Any failure to fetch or parse the report is logged and counted as
        one failed check named ``config_error_in_nagios``.
        """
        self.badChecks = []
        try:
            # NOTE(review): fetch() is not defined in the visible part of
            # this class; it is expected to be provided elsewhere.
            content = self.fetch(self.url)
            report = simplejson.loads(content)

            # Compare by value: 'is not' tests object identity, which is
            # practically always true for a freshly decoded string.
            if report['globalStatus'] != 'OK':
                self.badChecks = []
                for check in report['checks']:
                    # Presumably a truthy status marks a failing check --
                    # TODO confirm against the health endpoint's schema.
                    if check['status']:
                        self.badChecks.append(check["checkName"])

        except Exception as e:
            self.log.warn("Could not connect to url: " + self.url + " res:" +
                          str(e))
            self.badChecks.append("config_error_in_nagios")

        self.measures = [
            nagiosplugin.Measure("Num_failed_checks",
                                 len(self.badChecks),
                                 warning=self.warning,
                                 critical=self.critical,
                                 minimum=0)
        ]
예제 #4
0
    def obtain_data(self):
        """Report ssh lag towards ``self.server``, or plain liveness.

        Looks at the second-newest ``ssh_hello_YYYYMM`` document and the
        newest ``system_YYYYMM`` document for this host in the ``clio``
        database.  Sets ``self.alive`` always.  When the ssh data is fresh
        and has results for this server, sets ``self.lag`` and an
        ``alive-ssh_lag`` measure; otherwise, when the host sent system
        data recently, sets an ``alive`` measure.

        Raises:
            AssertionError: if either document is missing.
        """
        db = pymongo.Connection(self.db_server).clio
        month = datetime.utcnow().strftime('%Y%m')
        found = db['ssh_hello_%s' % month].find_one(
            sort=[('_id', pymongo.DESCENDING)],
            skip=1,  # the latest result set is probably still receiving results.
        )
        assert found is not None, "no data found! is arke running?"

        res = db['system_%s' % month].find_one(
            {'host': self.server},
            sort=[('ts', pymongo.DESCENDING)],
            fields=['ts'])
        assert res is not None, "no data found! is arke running?"

        # Stale data is deliberately not fatal here: freshness only selects
        # which measure is reported.  timedelta.seconds wraps around every
        # 24 hours, so the day count must be checked as well.
        system_age = datetime.utcnow() - res['ts']
        sent_data_recently = system_age.days == 0 and system_age.seconds < 60
        self.alive = sent_data_recently

        ssh_age = datetime.utcnow() - found['_id']
        ssh_data_fresh = ssh_age.days == 0 and ssh_age.seconds < 60

        results = [x for x in found['data'] if x['to'] == self.server]

        if ssh_data_fresh and results:
            if all(x['lag'] == -1 for x in results):
                avg_lag = -1
            else:
                # Average over the probes that produced a lag only; dividing
                # by len(results) would let -1 probes drag the average down.
                valid_lags = [x['lag'] for x in results if x['lag'] >= 0]
                avg_lag = (sum(valid_lags) / len(valid_lags)
                           if valid_lags else 0)

            self.lag = avg_lag
            self.measures = [
                nagiosplugin.Measure('alive-ssh_lag',
                                     self.lag,
                                     warning=self.warning,
                                     critical=self.critical)
            ]
        elif sent_data_recently:
            self.measures = [
                nagiosplugin.Measure('alive', int(self.alive), critical=0)
            ]
        # NOTE(review): when neither branch applies, self.measures is left
        # untouched -- confirm the caller copes with that case.
    def obtain_data(self):
        """Count the processes on ``self.server`` matching the search.

        Reads the newest ``system_YYYYMM`` document for this host from the
        ``clio`` database.  A process matches when ``self.search_obj``
        finds its command line and it is not zombie/dead/stopped/traced.
        If ``self.listening_ports`` is set, the process must also listen
        on any/all of those ports (per ``self.port_match``), and the
        children of such processes are counted too.

        Sets ``self.processes`` (list of ``(pid, name, cmdline)`` tuples),
        ``self.found_count`` and ``self.measures``.

        Raises:
            AssertionError: if no document is found or it is 60 seconds
                old or more.
            ValueError: if ``self.port_match`` is neither 'any' nor 'all'.
        """
        db = pymongo.Connection(self.db_server).clio
        coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
        result = db[coll_name].find_one({'host': self.server},
                                        sort=[('ts', pymongo.DESCENDING)],
                                        fields=['data.processes', 'ts'])

        assert result is not None, "no data found! is arke running?"

        # timedelta.seconds wraps around every 24 hours, so the day count
        # must be checked too; otherwise day-old data would look fresh.
        age = datetime.utcnow() - result['ts']
        assert age.days == 0 and age.seconds < 60, \
            "stale data! is arke running?"

        def is_listening(connections, port):
            return any(conn['status'] == u'LISTEN' and
                       conn['local_address'][1] == port
                       for conn in connections)

        match_ports = None
        if self.listening_ports:
            # Look the combinator up explicitly: __builtins__ is a dict in
            # imported modules, so getattr() on it only works in __main__.
            match_ports = {'any': any, 'all': all}.get(self.port_match)
            if match_ports is None:
                raise ValueError(
                    "port_match must be 'any' or 'all', got %r" %
                    (self.port_match,))

        ignored_states = ('zombie', 'dead', 'stopped', 'tracing stop')
        parent_pids = []
        processes = []
        for properties in result['data']['processes']:
            if not self.search_obj.search(properties['cmdline']):
                continue
            if properties['status'] in ignored_states:
                continue

            pid = properties['pid']
            if match_ports is not None:
                if not match_ports(
                        is_listening(properties['connections'], port)
                        for port in self.listening_ports):
                    continue
                parent_pids.append(pid)

            processes.append((
                pid,
                properties['name'],
                properties['cmdline'],
            ))

        if parent_pids:
            # A set gives O(1) membership tests and, unlike a map() result
            # on Python 3, can be searched more than once.
            parent_pid_set = set(int(pid) for pid in parent_pids)
            # NOTE(review): a child that also matched above is appended a
            # second time here -- confirm whether that is intended.
            for properties in result['data']['processes']:
                if properties['ppid'] in parent_pid_set:
                    processes.append((
                        properties['pid'],
                        properties['name'],
                        properties['cmdline'],
                    ))

        self.processes = processes
        self.found_count = len(processes)
        self.measures = [nagiosplugin.Measure(
            'processes_found', self.found_count,
            warning=self.warning, critical=self.critical)]
예제 #6
0
 def obtain_data(self):
     """Run the zope test runner and report run/error/failure counts.

     The runner's stdout is captured; when the run failed, the captured
     output is written to a temporary ``check_testrunner.*.log`` file
     and the path is logged.  Sets ``self.runner`` and ``self.measures``.
     """
     # Drop all command line arguments except the program name.
     # Presumably so the test runner does not see this plugin's own
     # options -- TODO confirm.
     sys.argv[:] = sys.argv[:1]
     self.runner = zope.testrunner.runner.Runner(self.testrunner_argv)
     old_stdout = sys.stdout
     sys.stdout = redirected_stdout = StringIO.StringIO()
     try:
         self.runner.run()
     finally:
         sys.stdout = old_stdout
     if self.runner.failed:
         # Create the file atomically instead of using the race-prone
         # tempfile.mktemp(); delete=False keeps the log for inspection.
         log_file = tempfile.NamedTemporaryFile(mode='w',
                                                suffix='.log',
                                                prefix='check_testrunner.',
                                                delete=False)
         try:
             log_file.write(redirected_stdout.getvalue())
         finally:
             log_file.close()
         self.logger.error("Test runner output logged to %s" %
                           log_file.name)
     self.measures = [
         nagiosplugin.Measure(u'run', float(self.runner.ran)),
         nagiosplugin.Measure(u'errors',
                              float(len(self.runner.errors)),
                              critical='0:0'),
         nagiosplugin.Measure(u'failures',
                              float(len(self.runner.failures)),
                              critical='0:0')
     ]
예제 #7
0
 def obtain_data(self):
     """Read the usage percentage of ``self.filesystem`` on this host.

     Uses the newest ``system_YYYYMM`` document for ``self.server`` in
     the ``clio`` database.  Sets ``self.usage`` and ``self.measures``.

     Raises:
         AssertionError: if no document is found or it is 60 seconds
             old or more.
     """
     db = pymongo.Connection(self.db_server).clio
     coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
     field = 'data.fs.%s.percent' % self.filesystem
     res = db[coll_name].find_one({'host': self.server},
                                  sort=[('ts', pymongo.DESCENDING)],
                                  fields=[field, 'ts'])
     assert res is not None, "no data found! is arke running?"

     # timedelta.seconds wraps around every 24 hours, so the day count
     # must be checked too; otherwise day-old data would look fresh.
     age = datetime.utcnow() - res['ts']
     assert age.days == 0 and age.seconds < 60, \
         "stale data! is arke running?"

     self.usage = res['data']['fs'][self.filesystem]['percent']
     # NOTE(review): the measure is always named '/' regardless of
     # self.filesystem -- confirm this label is intended.
     self.measures = [
         nagiosplugin.Measure('/', self.usage, '%', self.warning,
                              self.critical, 0, 100)
     ]
예제 #8
0
    def obtain_data(self):
        """Read the usage percentage of every filesystem on this host.

        Uses the newest ``system_YYYYMM`` document for ``self.server`` in
        the ``clio`` database.  Sets ``self.usages`` (filesystem -> percent
        used) and one measure per filesystem in ``self.measures``.

        Raises:
            AssertionError: if no document is found or it is 60 seconds
                old or more.
        """
        db = pymongo.Connection(self.db_server).clio
        coll_name = 'system_%s' % datetime.utcnow().strftime('%Y%m')
        res = db[coll_name].find_one({'host': self.server},
                                     sort=[('ts', pymongo.DESCENDING)],
                                     fields=['data.fs', 'ts'])
        assert res is not None, "no data found! is arke running?"

        # timedelta.seconds wraps around every 24 hours, so the day count
        # must be checked too; otherwise day-old data would look fresh.
        age = datetime.utcnow() - res['ts']
        assert age.days == 0 and age.seconds < 60, \
            "stale data! is arke running?"

        self.usages = {}
        self.measures = []

        for fs, stats in res['data']['fs'].items():
            percent_used = stats['percent']
            self.usages[fs] = percent_used
            self.measures.append(nagiosplugin.Measure(
                fs, percent_used, '%', self.warning, self.critical, 0, 100))
예제 #9
0
    def obtain_data(self):
        """Measure the xlog distance between master and slave.

        Queries the current xlog location on ``self.master_conn`` and the
        last replayed location on ``self.slave_conn``, converts both with
        ``xlog_to_bytes`` and stores the difference divided by 1024 in
        ``self.lag``.  Both connections are closed afterwards, even when a
        query fails.  Sets ``self.measures``.
        """
        try:
            try:
                mc = self.master_conn.cursor()
                mc.execute('SELECT pg_current_xlog_location()')
                master_loc = xlog_to_bytes(mc.fetchone()[0])
                self.master_conn.commit()
            finally:
                # Close the master first, as before, so it is not held
                # open while the slave is being queried.
                self.master_conn.close()

            sc = self.slave_conn.cursor()
            sc.execute('SELECT pg_last_xlog_replay_location()')
            slave_loc = xlog_to_bytes(sc.fetchone()[0])
            self.slave_conn.commit()
        finally:
            # Runs even when the master query failed, so the slave
            # connection is never leaked.
            self.slave_conn.close()

        self.lag = (master_loc - slave_loc) / 1024
        self.measures = [
            nagiosplugin.Measure('lag', self.lag, 'kB', self.warning,
                                 self.critical)
        ]
    def obtain_data(self):
        """Compute the MongoDB replication lag of ``self.server``.

        Reads the newest ``mongodb_YYYYMM`` document for this host from
        the ``clio`` database.  Sets ``self.primary`` (None when there is
        no replication status, True/False otherwise), ``self.repl_lag``
        and ``self.measures``.

        Raises:
            AssertionError: if no document is found, it is 60 seconds old
                or more, or the member list lacks this host or a primary.
        """
        db = pymongo.Connection(self.db_server).clio
        coll_name = 'mongodb_%s' % datetime.utcnow().strftime('%Y%m')
        res = db[coll_name].find_one(
            {'host': self.server},
            sort=[('ts', pymongo.DESCENDING)],
            fields=['data.repl_status', 'ts'],
        )

        assert res is not None, "no data found! is arke running?"

        # timedelta.seconds wraps around every 24 hours, so the day count
        # must be checked too; otherwise day-old data would look fresh.
        age = datetime.utcnow() - res['ts']
        assert age.days == 0 and age.seconds < 60, \
            "stale data! is arke running?"

        repl_status = res['data']['repl_status']
        if repl_status is None:
            self.primary = None
            self.repl_lag = 0
        else:
            primary = None
            me = None

            for member in repl_status['members']:
                if member.get('self', False):
                    me = member
                # The member with state 1 is taken as the primary.
                if member.get('state', None) == 1:
                    primary = member

            # Without these guards a list lacking both entries would make
            # "primary is me" true (None is None) and report a healthy
            # primary; a missing primary alone would raise a TypeError.
            assert me is not None, "this host is not in the member list"
            assert primary is not None, "no primary in the member list"

            if primary is me:
                self.primary = True
                self.repl_lag = 0
            else:
                self.primary = False
                # Clamp at zero: a member's optime may be ahead of the
                # primary's in the recorded snapshot.
                self.repl_lag = max(0,
                                    primary['optime']['t'] - me['optime']['t'])

        self.measures = [
            nagiosplugin.Measure('mongodb_repl_lag',
                                 self.repl_lag,
                                 warning=self.warning,
                                 critical=self.critical)
        ]