Example #1
def main(config_file, std_err=False, verbose=True, dont_send=False):
    """
    Main function of the script

    Args:
        config_file: file path of the config file to load
        std_err: whether to print logging output to stderr
        verbose: whether to provide verbose logging messages
        dont_send: whether to send data to the monitoring system or just do
                   a dry run
    """
    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.info("check_cert is starting, command line arguments:" +
                    "config_file={0}, ".format(config_file) +
                    "std_err={0}, ".format(std_err) +
                    "verbose={0}, ".format(verbose))

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog will
        # discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config()))

        # Provide some sane defaults:
        try:
            repo_port = ScriptConfiguration.get_val("repo_port")
        except KeyError:
            repo_port = 22

        try:
            ignored_certs = ScriptConfiguration.get_val("ignored_certs")
        except KeyError:
            ignored_certs = {}

        logger.debug("Remote repo is: {0}@{1}{3}->{4}, tcp port {2}".format(
            ScriptConfiguration.get_val("repo_user"),
            ScriptConfiguration.get_val("repo_host"), repo_port,
            ScriptConfiguration.get_val("repo_url"),
            ScriptConfiguration.get_val("repo_masterbranch")) +
                     ", local repository dir is {0}".format(
                         ScriptConfiguration.get_val('repo_localdir')) +
                     ", repository key is {0}".format(
                         ScriptConfiguration.get_val('repo_pubkey')) +
                     ", warn_thresh is {0}".format(
                         ScriptConfiguration.get_val('warn_treshold')) +
                     ", crit_thresh is {0}".format(
                         ScriptConfiguration.get_val('critical_treshold')))

        # Initialize Riemann/NRPE reporting:
        if ScriptConfiguration.get_val("riemann_enabled") is True:
            ScriptStatus.initialize(
                riemann_enabled=True,
                riemann_hosts_config=ScriptConfiguration.get_val(
                    "riemann_hosts"),
                riemann_tags=ScriptConfiguration.get_val("riemann_tags"),
                riemann_ttl=ScriptConfiguration.get_val("riemann_ttl"),
                riemann_service_name=SERVICE_NAME,
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,
            )
        else:
            ScriptStatus.initialize(
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,
            )

        # Now, let's verify the configuration:
        # FIXME - ScriptStatus might have been already initialized with
        # incorrect config and in effect ScriptStatus.notify_immediate will
        # not reach monitoring system
        conf_issues = _verify_conf(ScriptConfiguration.get_config())
        if conf_issues:
            logging.debug("Configuration problems:\n\t" +
                          '\n\t'.join(conf_issues))
            ScriptStatus.notify_immediate(
                'unknown',
                "Configuration file contains errors: " + ' '.join(conf_issues))

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Initialize our repo mirror:
        CertStore.initialize(
            host=ScriptConfiguration.get_val("repo_host"),
            port=repo_port,
            pubkey=ScriptConfiguration.get_val('repo_pubkey'),
            username=ScriptConfiguration.get_val("repo_user"),
            repo_localdir=ScriptConfiguration.get_val('repo_localdir'),
            repo_url=ScriptConfiguration.get_val("repo_url"),
            repo_masterbranch=ScriptConfiguration.get_val("repo_masterbranch"),
        )

        unparsable_certs = {"number": 0, "paths": []}

        for cert in CertStore.lookup_certs(CERTIFICATE_EXTENSIONS):
            # Check whether the cert needs to be included in checks at all:
            cert_hash = hashlib.sha1(cert.content).hexdigest()
            if cert_hash in ignored_certs:
                # This cert should be ignored
                logging.info("certificate {0} (sha1sum: {1})".format(
                    cert.path, cert_hash) + " has been ignored.")
                continue

            # Check if the certificate type is supported:
            if cert.path[-3:] not in ['pem', 'crt', 'cer']:
                ScriptStatus.update(
                    'unknown', "Certificate {0} ".format(cert.path) +
                    "is not supported by the check script, " +
                    "please add it to the ignore list or upgrade the script.")
                continue

            # Check the expiry date:
            try:
                cert_expiration = get_cert_expiration(cert)
            except RecoverableException:
                unparsable_certs["number"] += 1
                unparsable_certs["paths"].append(cert.path)
                continue

            # A raw "-3 days" is in fact "-4 days, 23:59:58.817181",
            # so we compensate to round up.
            # Additionally, openssl uses UTC dates.
            now = datetime.utcnow() - timedelta(days=1)
            time_left = cert_expiration - now  # timedelta object
            if time_left.days < 0:
                ScriptStatus.update(
                    'critical', "Certificate {0} expired {1} days ago.".format(
                        cert.path, abs(time_left.days)))
            elif time_left.days == 0:
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} expires today.".format(cert.path))
            elif time_left.days < ScriptConfiguration.get_val(
                    "critical_treshold"):
                ScriptStatus.update(
                    'critical', "Certificate {0} is about to expire in"
                    "{0} days.".format(cert.path, time_left.days))
            elif time_left.days < ScriptConfiguration.get_val("warn_treshold"):
                ScriptStatus.update(
                    'warn', "Certificate {0} is about to expire in"
                    "{0} days.".format(cert.path, time_left.days))
            else:
                logger.info("{0} expires in {1} days - OK!".format(
                    cert.path, time_left.days))

        # We do not want to pollute the output when there are too many broken
        # certs in the report.
        if unparsable_certs["number"] > 0:
            if unparsable_certs["number"] <= 2:
                ScriptStatus.update(
                    'unknown', 'Script cannot parse certificates: ' +
                    ','.join(unparsable_certs["paths"]))
            else:
                ScriptStatus.update(
                    'unknown', 'Script cannot parse {0} '.format(
                        unparsable_certs["number"]) +
                    "certificates, please check with verbose out on")

        ScriptStatus.notify_agregated()
        ScriptLock.release()
        sys.exit(0)

    except RecoverableException as e:
        msg = str(e)
        logging.error(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError:
        # Unit tests require it:
        raise
    except Exception as e:
        msg = "Exception occured: {0}, msg: {1}".format(
            e.__class__.__name__, str(e))
        logging.error(msg)
        logging.exception(e)
        sys.exit(1)
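
The entry point that feeds arguments into main() is not shown in these examples. Below is a minimal sketch of how it might be wired to the command line; the argparse wrapper and the flag names are assumptions for illustration, not part of the original script.

# Hypothetical CLI wrapper for the main() above; flag names are assumed,
# the real entry point is not included in the example.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Check certificate expiry dates.")
    parser.add_argument("-c", "--config-file", required=True,
                        help="path of the config file to load")
    parser.add_argument("-e", "--std-err", action="store_true",
                        help="log to stderr instead of syslog")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="disable verbose logging")
    parser.add_argument("-d", "--dont-send", action="store_true",
                        help="dry run: do not send data to the monitoring system")
    args = parser.parse_args()
    main(config_file=args.config_file,
         std_err=args.std_err,
         verbose=not args.quiet,
         dont_send=args.dont_send)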
Example #2
def main(config_file, std_err=False, verbose=True, dont_send=False):
    """
    Main function of the script

    Args:
        config_file: file path of the config file to load
        std_err: whether to print logging output to stderr
        verbose: whether to provide verbose logging messages
        dont_send: whether to send data to the monitoring system or just do
                   a dry run
    """
    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.info("check_cert is starting, command line arguments:" +
                    "config_file={0}, ".format(config_file) +
                    "std_err={0}, ".format(std_err) +
                    "verbose={0}, ".format(verbose)
                    )

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog will
        # discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config())
                     )

        # Provide some sane defaults:
        try:
            repo_port = ScriptConfiguration.get_val("repo_port")
        except KeyError:
            repo_port = 22

        try:
            ignored_certs = ScriptConfiguration.get_val("ignored_certs")
        except KeyError:
            ignored_certs = {}

        logger.debug("Remote repo is: {0}@{1}{3}->{4}, tcp port {2}".format(
                     ScriptConfiguration.get_val("repo_user"),
                     ScriptConfiguration.get_val("repo_host"),
                     repo_port,
                     ScriptConfiguration.get_val("repo_url"),
                     ScriptConfiguration.get_val("repo_masterbranch")) +
                     ", local repository dir is {0}".format(
                     ScriptConfiguration.get_val('repo_localdir')) +
                     ", repository key is {0}".format(
                     ScriptConfiguration.get_val('repo_pubkey')) +
                     ", warn_thresh is {0}".format(
                     ScriptConfiguration.get_val('warn_treshold')) +
                     ", crit_thresh is {0}".format(
                     ScriptConfiguration.get_val('critical_treshold'))
                     )

        # Initialize Riemann/NRPE reporting:
        if ScriptConfiguration.get_val("riemann_enabled") is True:
            ScriptStatus.initialize(
                riemann_enabled=True,
                riemann_hosts_config=ScriptConfiguration.get_val("riemann_hosts"),
                riemann_tags=ScriptConfiguration.get_val("riemann_tags"),
                riemann_ttl=ScriptConfiguration.get_val("riemann_ttl"),
                riemann_service_name=SERVICE_NAME,
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,)
        else:
            ScriptStatus.initialize(
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,)

        # Now, let's verify the configuration:
        # FIXME - ScriptStatus might have been already initialized with
        # incorrect config and in effect ScriptStatus.notify_immediate will
        # not reach monitoring system
        conf_issues = _verify_conf(ScriptConfiguration.get_config())
        if conf_issues:
            logging.debug("Configuration problems:\n\t" +
                          '\n\t'.join(conf_issues))
            ScriptStatus.notify_immediate('unknown',
                                          "Configuration file contains errors: " +
                                          ' '.join(conf_issues))

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Initialize our repo mirror:
        CertStore.initialize(host=ScriptConfiguration.get_val("repo_host"),
                             port=repo_port,
                             pubkey=ScriptConfiguration.get_val('repo_pubkey'),
                             username=ScriptConfiguration.get_val("repo_user"),
                             repo_localdir=ScriptConfiguration.get_val(
                                 'repo_localdir'),
                             repo_url=ScriptConfiguration.get_val("repo_url"),
                             repo_masterbranch=ScriptConfiguration.get_val(
                                 "repo_masterbranch"),
                             )

        unparsable_certs = {"number": 0, "paths": []}

        for cert in CertStore.lookup_certs(CERTIFICATE_EXTENSIONS):
            # Check whether the cert needs to be included in checks at all:
            cert_hash = hashlib.sha1(cert.content).hexdigest()
            if cert_hash in ignored_certs:
                # This cert should be ignored
                logging.info("certificate {0} (sha1sum: {1})".format(
                             cert.path, cert_hash) + " has been ignored.")
                continue

            # Check if the certificate type is supported:
            if cert.path[-3:] not in ['pem', 'crt', 'cer']:
                ScriptStatus.update('unknown',
                                    "Certificate {0} ".format(cert.path) +
                                    "is not supported by the check script, " +
                                    "please add it to the ignore list or " +
                                    "upgrade the script.")
                continue

            # Check the expiry date:
            try:
                cert_expiration = get_cert_expiration(cert)
            except RecoverableException:
                unparsable_certs["number"] += 1
                unparsable_certs["paths"].append(cert.path)
                continue

            # A raw "-3 days" is in fact "-4 days, 23:59:58.817181",
            # so we compensate to round up.
            # Additionally, openssl uses UTC dates.
            now = datetime.utcnow() - timedelta(days=1)
            time_left = cert_expiration - now  # timedelta object
            if time_left.days < 0:
                ScriptStatus.update('critical',
                                    "Certificate {0} expired {1} days ago.".format(
                                        cert.path, abs(time_left.days)))
            elif time_left.days == 0:
                ScriptStatus.update('critical',
                                    "Certificate {0} expires today.".format(
                                        cert.path))
            elif time_left.days < ScriptConfiguration.get_val("critical_treshold"):
                ScriptStatus.update('critical',
                                    "Certificate {0} is about to expire in"
                                    "{0} days.".format(cert.path, time_left.days))
            elif time_left.days < ScriptConfiguration.get_val("warn_treshold"):
                ScriptStatus.update('warn',
                                    "Certificate {0} is about to expire in"
                                    "{0} days.".format(cert.path, time_left.days))
            else:
                logger.info("{0} expires in {1} days - OK!".format(
                    cert.path, time_left.days))

        # We do not want to pollute the output when there are too many broken
        # certs in the report.
        if unparsable_certs["number"] > 0:
            if unparsable_certs["number"] <= 2:
                ScriptStatus.update('unknown',
                                    'Script cannot parse certificates: ' +
                                    ','.join(unparsable_certs["paths"]))
            else:
                ScriptStatus.update('unknown', 'Script cannot parse {0} '.format(
                                    unparsable_certs["number"]) +
                                    "certificates, please check with verbose out on")

        ScriptStatus.notify_agregated()
        ScriptLock.release()
        sys.exit(0)

    except RecoverableException as e:
        msg = str(e)
        logging.error(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError:
        # Unit tests require it:
        raise
    except Exception as e:
        msg = "Exception occured: {0}, msg: {1}".format(e.__class__.__name__, str(e))
        logging.error(msg)
        logging.exception(e)
        sys.exit(1)
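
The one-day shift applied to datetime.utcnow() in the expiry check above can be verified in isolation. The standalone sketch below (standard library only, not taken from the script itself) shows why the compensation makes timedelta.days report whole days the way a human would count them.

from datetime import datetime, timedelta

# A certificate expiring in exactly three days, measured a moment later,
# yields a timedelta of "2 days, 23:59:59.9...", i.e. .days == 2.
expiry = datetime.utcnow() + timedelta(days=3)
naive_left = expiry - datetime.utcnow()
# Shifting "now" back by one day, as the check script does, rounds up:
compensated_left = expiry - (datetime.utcnow() - timedelta(days=1))
print(naive_left.days, compensated_left.days)  # typically prints: 2 3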
Example #3
def main(config_file, std_err=False, verbose=True, clean_histdata=False):
    """
    Main function of the script

    Args:
        config_file: file path of the config file to load
        std_err: whether to print logging output to stderr
        verbose: whether to provide verbose logging messages
        clean_histdata: whether all historical data should be cleared
    """

    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.debug("{0} is starting, ".format(os.path.basename(__file__)) +
                     "command line arguments: " +
                     "config_file={0}, ".format(config_file) +
                     "std_err={0}, ".format(std_err) +
                     "verbose={0}, ".format(verbose) +
                     "clean_histdata={0}".format(clean_histdata)
                     )

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog will
        # discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config())
                     )

        # Initialize reporting to monitoring system:
        ScriptStatus.init(nrpe_enable=True)

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Some basic sanity checking:
        verify_conf()

        # We are all set, lets do some real work:
        HistoryFile.init(location=ScriptConfiguration.get_val('history_file'),
                         max_averaging_window=ScriptConfiguration.get_val(
                             'max_averaging_window'),
                         min_averaging_window=ScriptConfiguration.get_val(
                             'min_averaging_window'))

        if clean_histdata:
            HistoryFile.clear_history()
            HistoryFile.save()
            ScriptStatus.notify_immediate('unknown',
                                          'History data has been cleared.')

        timeframe = ScriptConfiguration.get_val('timeframe')

        # FIXME: not sure how to refactor this, copy-paste does not seem like
        # the best solution :(
        def do_status_processing(prefix, current_growth, planned_growth,
                                 mountpoint=None, data_type=None):
            warn_tresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_warn_reduction')/100)
            crit_tresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_crit_reduction')/100)

            if prefix == 'disk' and data_type == 'inode':
                units = 'inodes/day'
            else:
                units = 'MB/day'

            if prefix == 'disk':
                rname = data_type + \
                    ' usage growth for mount {0}'.format(mountpoint)
            else:
                rname = '{0} usage growth'.format(prefix)

            rname = rname.capitalize()

            if current_growth > planned_growth * warn_tresh:
                msg = '{0} exceeds planned growth '.format(rname) + \
                      '- current: {0} {1}'.format(current_growth, units) + \
                      ', planned: {0} {1}.'.format(planned_growth, units)
                if current_growth > planned_growth * crit_tresh:
                    ScriptStatus.update('crit', msg)
                else:
                    ScriptStatus.update('warn', msg)
            else:
                ScriptStatus.update('ok',
                                    '{0} is OK ({1} {2}).'.format(
                                        rname, current_growth, units))

        if ScriptConfiguration.get_val('memory_mon_enabled'):
            cur_usage, max_usage = fetch_memory_usage()
            HistoryFile.add_datapoint('memory', cur_usage)
            tmp = HistoryFile.verify_dataspan('memory')
            if tmp < 0:
                ScriptStatus.update('unknown', 'There is not enough data ' +
                                    'to calculate current memory ' +
                                    'usage growth: {0} more '.format(abs(tmp)) +
                                    'days are needed.')
            else:
                datapoints = HistoryFile.get_datapoints('memory')

                planned_growth = find_planned_grow_ratio(cur_usage, max_usage,
                                                         timeframe)
                current_growth = find_current_grow_ratio(datapoints)

                logging.debug('memory -> ' +
                              'current_growth: {0}, '.format(current_growth) +
                              'planned_growth: {0}'.format(planned_growth))

                do_status_processing('memory', current_growth, planned_growth)

        if ScriptConfiguration.get_val('disk_mon_enabled'):
            mountpoints = ScriptConfiguration.get_val('disk_mountpoints')
            for dtype in ['space', 'inode']:
                for mountpoint in mountpoints:
                    if dtype == 'inode':
                        cur_usage, max_usage = fetch_inode_usage(mountpoint)
                    else:
                        cur_usage, max_usage = fetch_disk_usage(mountpoint)
                    HistoryFile.add_datapoint('disk', cur_usage,
                                              data_type=dtype,
                                              path=mountpoint)
                    tmp = HistoryFile.verify_dataspan('disk',
                                                      data_type=dtype,
                                                      path=mountpoint)
                    if tmp < 0:
                        ScriptStatus.update('unknown',
                                            'There is not enough data to ' +
                                            'calculate current disk ' + dtype +
                                            ' usage growth for mountpoint ' +
                                            '{0}: {1} more '.format(
                                                mountpoint, abs(tmp)) +
                                            'days are needed.')
                    else:
                        datapoints = HistoryFile.get_datapoints('disk',
                                                                data_type=dtype,
                                                                path=mountpoint)
                        planned_growth = find_planned_grow_ratio(cur_usage,
                                                                 max_usage,
                                                                 timeframe)
                        current_growth = find_current_grow_ratio(datapoints)

                        logging.debug('disk, ' +
                                      'mountpoint {0}, '.format(mountpoint) +
                                      'data_type {0}: '.format(dtype) +
                                      'current_growth: {0}, '.format(current_growth) +
                                      'planned_growth: {0}'.format(planned_growth))
                        do_status_processing('disk', current_growth, planned_growth,
                                             mountpoint=mountpoint, data_type=dtype)

        HistoryFile.save()
        ScriptStatus.notify_agregated()
        ScriptLock.release()

    except RecoverableException as e:
        msg = str(e)
        logging.critical(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError:
        # Unit tests require it:
        raise
    except Exception as e:
        msg = "Exception occured: {0}".format(e.__class__.__name__)
        logging.exception(msg)
        print(msg)  # We cannot use notify_immediate here :(
        sys.exit(3)
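
The warn/crit logic in do_status_processing turns the configured *_mon_warn_reduction and *_mon_crit_reduction percentages into multipliers on the planned growth rate. The sketch below illustrates that arithmetic in isolation; the configuration values are made up for illustration and are not taken from the example above.

# Hypothetical values, not taken from the script's configuration:
planned_growth = 100.0   # MB/day allowed by the capacity plan
warn_reduction = 20      # e.g. disk_mon_warn_reduction, in percent
crit_reduction = 50      # e.g. disk_mon_crit_reduction, in percent

warn_thresh = 1 + warn_reduction / 100.0   # 1.2 -> warn above 120 MB/day
crit_thresh = 1 + crit_reduction / 100.0   # 1.5 -> crit above 150 MB/day

for current_growth in (110.0, 130.0, 160.0):
    if current_growth > planned_growth * warn_thresh:
        status = 'crit' if current_growth > planned_growth * crit_thresh else 'warn'
    else:
        status = 'ok'
    print(current_growth, status)   # 110.0 ok, 130.0 warn, 160.0 crit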