def main(config_file, std_err=False, verbose=True, dont_send=False):
    """
    Main entry point of the check_cert script.

    Sets up logging, loads and verifies the configuration, initializes
    monitoring reporting (Riemann and/or NRPE), takes the script lock,
    initializes the certificate repository mirror and checks the expiry
    date of every certificate found, sending aggregated results to the
    monitoring system.

    Args:
        config_file: file path of the config file to load
        std_err: whether print logging output to stderr
        verbose: whether to provide verbose logging messages
        dont_send: whether to sent data to monitoring system or just
            do a dry run

    Exits the process: 0 on success, 1 on a handled failure.
    """
    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.info("check_cert is starting, command line arguments:" +
                    "config_file={0}, ".format(config_file) +
                    "std_err={0}, ".format(std_err) +
                    "verbose={0}, ".format(verbose))

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog
        # will discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config()))

        # Provide some sane defaults:
        try:
            repo_port = ScriptConfiguration.get_val("repo_port")
        except KeyError:
            repo_port = 22
        try:
            ignored_certs = ScriptConfiguration.get_val("ignored_certs")
        except KeyError:
            ignored_certs = {}

        logger.debug("Remote repo is: {0}@{1}{3}->{4}, tcp port {2}".format(
            ScriptConfiguration.get_val("repo_user"),
            ScriptConfiguration.get_val("repo_host"),
            repo_port,
            ScriptConfiguration.get_val("repo_url"),
            ScriptConfiguration.get_val("repo_masterbranch")) +
            ", local repository dir is {0}".format(
                ScriptConfiguration.get_val('repo_localdir')) +
            ", repository key is {0}".format(
                ScriptConfiguration.get_val('repo_pubkey')) +
            ", warn_thresh is {0}".format(
                ScriptConfiguration.get_val('warn_treshold')) +
            ", crit_thresh is {0}".format(
                ScriptConfiguration.get_val('critical_treshold')))

        # Initialize Riemann/NRPE reporting:
        if ScriptConfiguration.get_val("riemann_enabled") is True:
            ScriptStatus.initialize(
                riemann_enabled=True,
                riemann_hosts_config=ScriptConfiguration.get_val(
                    "riemann_hosts"),
                riemann_tags=ScriptConfiguration.get_val("riemann_tags"),
                riemann_ttl=ScriptConfiguration.get_val("riemann_ttl"),
                riemann_service_name=SERVICE_NAME,
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,
            )
        else:
            ScriptStatus.initialize(
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,
            )

        # Now, let's verify the configuration:
        # FIXME - ScriptStatus might have been already initialized with
        # incorrect config and in effect ScriptStatus.notify_immediate will
        # not reach monitoring system
        conf_issues = _verify_conf(ScriptConfiguration.get_config())
        if conf_issues:
            logging.debug("Configuration problems:\n\t" +
                          '\n\t'.join(conf_issues))
            ScriptStatus.notify_immediate(
                'unknown',
                "Configuration file contains errors: " +
                ' '.join(conf_issues))

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Initialize our repo mirror:
        CertStore.initialize(
            host=ScriptConfiguration.get_val("repo_host"),
            port=repo_port,
            pubkey=ScriptConfiguration.get_val('repo_pubkey'),
            username=ScriptConfiguration.get_val("repo_user"),
            repo_localdir=ScriptConfiguration.get_val('repo_localdir'),
            repo_url=ScriptConfiguration.get_val("repo_url"),
            repo_masterbranch=ScriptConfiguration.get_val("repo_masterbranch"),
        )

        unparsable_certs = {"number": 0, "paths": []}

        for cert in CertStore.lookup_certs(CERTIFICATE_EXTENSIONS):
            # Check whether the cert needs to be included in checks at all:
            cert_hash = hashlib.sha1(cert.content).hexdigest()
            if cert_hash in ignored_certs:
                # This cert should be ignored
                logging.info("certificate {0} (sha1sum: {1})".format(
                    cert.path, cert_hash) + " has been ignored.")
                continue

            # Check if certifice type is supported:
            if cert.path[-3:] not in ['pem', 'crt', 'cer']:
                ScriptStatus.update(
                    'unknown',
                    "Certificate {0} ".format(cert.path) +
                    "is not supported by the check script, " +
                    "please add it to ignore list or upgrade " +
                    "the script.")
                continue

            # Check the expiry date:
            try:
                cert_expiration = get_cert_expiration(cert)
            except RecoverableException:
                unparsable_certs["number"] += 1
                unparsable_certs["paths"].append(cert.path)
                continue

            # -3 days is in fact -4 days, 23:59:58.817181
            # so we compensate and round up
            # additionally, openssl uses utc dates
            now = datetime.utcnow() - timedelta(days=1)
            time_left = cert_expiration - now  # timedelta object
            if time_left.days < 0:
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} expired {1} days ago.".format(
                        cert.path, abs(time_left.days)))
            elif time_left.days == 0:
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} expires today.".format(cert.path))
            elif time_left.days < ScriptConfiguration.get_val(
                    "critical_treshold"):
                # BUGFIX: the original message used index {0} twice, so the
                # certificate path was printed instead of the day count, and
                # the implicit literal concatenation lacked a space.
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} is about to expire in "
                    "{1} days.".format(cert.path, time_left.days))
            elif time_left.days < ScriptConfiguration.get_val("warn_treshold"):
                # BUGFIX: same {0}/{1} index and missing-space fix as above.
                ScriptStatus.update(
                    'warn',
                    "Certificate {0} is about to expire in "
                    "{1} days.".format(cert.path, time_left.days))
            else:
                logger.info("{0} expires in {1} days - OK!".format(
                    cert.path, time_left.days))

        # We do not want to pollute output in case when there are too many
        # broken certs in the report.
        if unparsable_certs["number"] > 0:
            if unparsable_certs["number"] <= 2:
                # BUGFIX: the two literals were adjacent (implicit
                # concatenation), which made the entire message act as the
                # join *separator*; '+' restores the intended
                # "prefix + comma-joined paths" message.
                ScriptStatus.update(
                    'unknown',
                    'Script cannot parse certificates: ' +
                    ','.join(unparsable_certs["paths"]))
            else:
                ScriptStatus.update(
                    'unknown',
                    'Script cannot parse {0} '.format(
                        unparsable_certs["number"]) +
                    "certificates, please check with verbose out on")

        ScriptStatus.notify_agregated()
        ScriptLock.release()
        sys.exit(0)

    except RecoverableException as e:
        msg = str(e)
        logging.error(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError:
        # Unittest require it (must propagate unchanged); the unused
        # 'as e' binding was removed.
        raise
    except Exception as e:
        msg = "Exception occured: {0}, msg: {1}".format(
            e.__class__.__name__, str(e))
        logging.error(msg)
        logging.exception(e)
        sys.exit(1)
def main(config_file, std_err=False, verbose=True, dont_send=False):
    """
    Main entry point of the check_cert script.

    Configures logging, loads and verifies the configuration, initializes
    monitoring reporting (Riemann and/or NRPE), acquires the script lock,
    initializes the certificate repository mirror and checks the expiry
    date of every certificate found, sending aggregated results to the
    monitoring system.

    Args:
        config_file: file path of the config file to load
        std_err: whether print logging output to stderr
        verbose: whether to provide verbose logging messages
        dont_send: whether to sent data to monitoring system or just
            do a dry run

    Exits the process: 0 on success, 1 on a handled failure.
    """
    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.info("check_cert is starting, command line arguments:" +
                    "config_file={0}, ".format(config_file) +
                    "std_err={0}, ".format(std_err) +
                    "verbose={0}, ".format(verbose))

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog
        # will discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config()))

        # Provide some sane defaults:
        try:
            repo_port = ScriptConfiguration.get_val("repo_port")
        except KeyError:
            repo_port = 22
        try:
            ignored_certs = ScriptConfiguration.get_val("ignored_certs")
        except KeyError:
            ignored_certs = {}

        logger.debug("Remote repo is: {0}@{1}{3}->{4}, tcp port {2}".format(
            ScriptConfiguration.get_val("repo_user"),
            ScriptConfiguration.get_val("repo_host"),
            repo_port,
            ScriptConfiguration.get_val("repo_url"),
            ScriptConfiguration.get_val("repo_masterbranch")) +
            ", local repository dir is {0}".format(
                ScriptConfiguration.get_val('repo_localdir')) +
            ", repository key is {0}".format(
                ScriptConfiguration.get_val('repo_pubkey')) +
            ", warn_thresh is {0}".format(
                ScriptConfiguration.get_val('warn_treshold')) +
            ", crit_thresh is {0}".format(
                ScriptConfiguration.get_val('critical_treshold')))

        # Initialize Riemann/NRPE reporting:
        if ScriptConfiguration.get_val("riemann_enabled") is True:
            ScriptStatus.initialize(
                riemann_enabled=True,
                riemann_hosts_config=ScriptConfiguration.get_val(
                    "riemann_hosts"),
                riemann_tags=ScriptConfiguration.get_val("riemann_tags"),
                riemann_ttl=ScriptConfiguration.get_val("riemann_ttl"),
                riemann_service_name=SERVICE_NAME,
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,
            )
        else:
            ScriptStatus.initialize(
                nrpe_enabled=ScriptConfiguration.get_val("nrpe_enabled"),
                debug=dont_send,
            )

        # Now, let's verify the configuration:
        # FIXME - ScriptStatus might have been already initialized with
        # incorrect config and in effect ScriptStatus.notify_immediate will
        # not reach monitoring system
        conf_issues = _verify_conf(ScriptConfiguration.get_config())
        if conf_issues:
            logging.debug("Configuration problems:\n\t" +
                          '\n\t'.join(conf_issues))
            ScriptStatus.notify_immediate(
                'unknown',
                "Configuration file contains errors: " +
                ' '.join(conf_issues))

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Initialize our repo mirror:
        CertStore.initialize(
            host=ScriptConfiguration.get_val("repo_host"),
            port=repo_port,
            pubkey=ScriptConfiguration.get_val('repo_pubkey'),
            username=ScriptConfiguration.get_val("repo_user"),
            repo_localdir=ScriptConfiguration.get_val('repo_localdir'),
            repo_url=ScriptConfiguration.get_val("repo_url"),
            repo_masterbranch=ScriptConfiguration.get_val("repo_masterbranch"),
        )

        unparsable_certs = {"number": 0, "paths": []}

        for cert in CertStore.lookup_certs(CERTIFICATE_EXTENSIONS):
            # Check whether the cert needs to be included in checks at all:
            cert_hash = hashlib.sha1(cert.content).hexdigest()
            if cert_hash in ignored_certs:
                # This cert should be ignored
                logging.info("certificate {0} (sha1sum: {1})".format(
                    cert.path, cert_hash) + " has been ignored.")
                continue

            # Check if certifice type is supported:
            if cert.path[-3:] not in ['pem', 'crt', 'cer']:
                ScriptStatus.update(
                    'unknown',
                    "Certificate {0} ".format(cert.path) +
                    "is not supported by the check script, " +
                    "please add it to ignore list or upgrade " +
                    "the script.")
                continue

            # Check the expiry date:
            try:
                cert_expiration = get_cert_expiration(cert)
            except RecoverableException:
                unparsable_certs["number"] += 1
                unparsable_certs["paths"].append(cert.path)
                continue

            # -3 days is in fact -4 days, 23:59:58.817181
            # so we compensate and round up
            # additionally, openssl uses utc dates
            now = datetime.utcnow() - timedelta(days=1)
            time_left = cert_expiration - now  # timedelta object
            if time_left.days < 0:
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} expired {1} days ago.".format(
                        cert.path, abs(time_left.days)))
            elif time_left.days == 0:
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} expires today.".format(cert.path))
            elif time_left.days < ScriptConfiguration.get_val(
                    "critical_treshold"):
                # BUGFIX: the original message used index {0} twice, so the
                # certificate path was printed instead of the day count, and
                # the implicit literal concatenation lacked a space.
                ScriptStatus.update(
                    'critical',
                    "Certificate {0} is about to expire in "
                    "{1} days.".format(cert.path, time_left.days))
            elif time_left.days < ScriptConfiguration.get_val("warn_treshold"):
                # BUGFIX: same {0}/{1} index and missing-space fix as above.
                ScriptStatus.update(
                    'warn',
                    "Certificate {0} is about to expire in "
                    "{1} days.".format(cert.path, time_left.days))
            else:
                logger.info("{0} expires in {1} days - OK!".format(
                    cert.path, time_left.days))

        # We do not want to pollute output in case when there are too many
        # broken certs in the report.
        if unparsable_certs["number"] > 0:
            if unparsable_certs["number"] <= 2:
                # BUGFIX: the two literals were adjacent (implicit
                # concatenation), which made the entire message act as the
                # join *separator*; '+' restores the intended
                # "prefix + comma-joined paths" message.
                ScriptStatus.update(
                    'unknown',
                    'Script cannot parse certificates: ' +
                    ','.join(unparsable_certs["paths"]))
            else:
                ScriptStatus.update(
                    'unknown',
                    'Script cannot parse {0} '.format(
                        unparsable_certs["number"]) +
                    "certificates, please check with verbose out on")

        ScriptStatus.notify_agregated()
        ScriptLock.release()
        sys.exit(0)

    except RecoverableException as e:
        msg = str(e)
        logging.error(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError:
        # Unittest require it (must propagate unchanged); the unused
        # 'as e' binding was removed.
        raise
    except Exception as e:
        msg = "Exception occured: {0}, msg: {1}".format(
            e.__class__.__name__, str(e))
        logging.error(msg)
        logging.exception(e)
        sys.exit(1)
def main(config_file, std_err=False, verbose=True, clean_histdata=False):
    """
    Main entry point of the resource-growth monitoring script.

    Configures logging, loads the configuration, initializes NRPE
    reporting, acquires the script lock, records memory/disk usage
    datapoints in the history file and compares the measured growth
    ratio against the planned one, reporting aggregated status.

    Args:
        config_file: file path of the config file to load
        std_err: whether print logging output to stderr
        verbose: whether to provide verbose logging messages
        clean_histdata: all historical data should be cleared

    Exits the process: 1 on a handled (recoverable) failure, 3 on an
    unexpected exception.
    """
    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.debug("{0} is starting, ".format(os.path.basename(__file__)) +
                     "command line arguments: " +
                     "config_file={0}, ".format(config_file) +
                     "std_err={0}, ".format(std_err) +
                     "verbose={0}, ".format(verbose) +
                     "clean_histdata={0}".format(clean_histdata))

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog
        # will discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config()))

        # Initialize reporting to monitoring system:
        ScriptStatus.init(nrpe_enable=True)

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Some basic sanity checking:
        verify_conf()

        # We are all set, lets do some real work:
        HistoryFile.init(location=ScriptConfiguration.get_val('history_file'),
                         max_averaging_window=ScriptConfiguration.get_val(
                             'max_averaging_window'),
                         min_averaging_window=ScriptConfiguration.get_val(
                             'min_averaging_window'))

        if clean_histdata:
            HistoryFile.clear_history()
            HistoryFile.save()
            ScriptStatus.notify_immediate('unknown',
                                          'History data has been cleared.')

        timeframe = ScriptConfiguration.get_val('timeframe')

        # FIXME: not sure how to refactor this, copypaste does not seem the
        # best solution :(
        def do_status_processing(prefix, current_growth, planned_growth,
                                 mountpoint=None, data_type=None):
            # Turn the configured percentage reductions into multipliers.
            # NOTE(review): with integer config values '/100' is integer
            # division on Python 2 (always 0) -- confirm the script targets
            # Python 3 or imports division from __future__.
            warn_tresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_warn_reduction')/100)
            crit_tresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_crit_reduction')/100)
            if prefix == 'disk' and data_type == 'inode':
                units = 'inodes/day'
            else:
                units = 'MB/day'
            if prefix == 'disk':
                rname = data_type + \
                    ' usage growth for mount {0}'.format(mountpoint)
            else:
                rname = '{0} usage growth'.format(prefix)
            rname = rname.capitalize()
            if current_growth > planned_growth * warn_tresh:
                msg = '{0} exceeds planned growth '.format(rname) + \
                      '- current: {0} {1}'.format(current_growth, units) + \
                      ', planned: {0} {1}.'.format(planned_growth, units)
                if current_growth > planned_growth * crit_tresh:
                    # NOTE(review): sibling scripts pass 'critical' here;
                    # confirm ScriptStatus accepts the short form 'crit'.
                    ScriptStatus.update('crit', msg)
                else:
                    ScriptStatus.update('warn', msg)
            else:
                ScriptStatus.update('ok', '{0} is OK ({1} {2}).'.format(
                    rname, current_growth, units))

        if ScriptConfiguration.get_val('memory_mon_enabled'):
            cur_usage, max_usage = fetch_memory_usage()
            HistoryFile.add_datapoint('memory', cur_usage)
            tmp = HistoryFile.verify_dataspan('memory')
            if tmp < 0:
                # Negative span == days of data still missing.
                ScriptStatus.update('unknown', 'There is not enough data ' +
                                    'to calculate current memory ' +
                                    'usage growth: {0} '.format(abs(tmp)) +
                                    'days more is needed.')
            else:
                datapoints = HistoryFile.get_datapoints('memory')
                planned_growth = find_planned_grow_ratio(cur_usage,
                                                         max_usage,
                                                         timeframe)
                current_growth = find_current_grow_ratio(datapoints)
                logging.debug('memory -> ' +
                              'current_growth: {0}, '.format(current_growth) +
                              'planned_growth: {0}'.format(planned_growth))
                do_status_processing('memory', current_growth, planned_growth)

        if ScriptConfiguration.get_val('disk_mon_enabled'):
            mountpoints = ScriptConfiguration.get_val('disk_mountpoints')
            for dtype in ['space', 'inode']:
                for mountpoint in mountpoints:
                    if dtype == 'inode':
                        cur_usage, max_usage = fetch_inode_usage(mountpoint)
                    else:
                        cur_usage, max_usage = fetch_disk_usage(mountpoint)
                    HistoryFile.add_datapoint('disk', cur_usage,
                                              data_type=dtype,
                                              path=mountpoint)
                    tmp = HistoryFile.verify_dataspan('disk',
                                                      data_type=dtype,
                                                      path=mountpoint)
                    if tmp < 0:
                        ScriptStatus.update('unknown',
                                            'There is not enough data to ' +
                                            'calculate current disk ' +
                                            dtype +
                                            ' usage growth for mountpoint ' +
                                            '{0}: {1} '.format(
                                                mountpoint, abs(tmp)) +
                                            'days more is needed.')
                    else:
                        datapoints = HistoryFile.get_datapoints(
                            'disk', data_type=dtype, path=mountpoint)
                        planned_growth = find_planned_grow_ratio(cur_usage,
                                                                 max_usage,
                                                                 timeframe)
                        current_growth = find_current_grow_ratio(datapoints)
                        # BUGFIX: the original debug message lacked the ', '
                        # separator before 'planned_growth' (the memory
                        # branch above has it), so values ran together.
                        logging.debug('disk, ' +
                                      'mountpoint {0}, '.format(mountpoint) +
                                      'data_type {0}: '.format(dtype) +
                                      'current_growth: {0}, '.format(
                                          current_growth) +
                                      'planned_growth: {0}'.format(
                                          planned_growth))
                        do_status_processing('disk', current_growth,
                                             planned_growth,
                                             mountpoint=mountpoint,
                                             data_type=dtype)

        HistoryFile.save()
        ScriptStatus.notify_agregated()
        ScriptLock.release()

    except RecoverableException as e:
        msg = str(e)
        logging.critical(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError:
        # Unittests require it (must propagate unchanged); the unused
        # 'as e' binding was removed.
        raise
    except Exception as e:
        msg = "Exception occured: {0}".format(e.__class__.__name__)
        logging.exception(msg)
        print(msg)
        # We can use notify immediate here :(
        sys.exit(3)