Example #1
def main():
    # Collect all info

    # Note: the debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, are obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: home, scratch', str, 'store', 'home'),
    }

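    # ExtendedSimpleOption presumably bundles what Example #2 below wires up by
    # hand: nagios reporting, an HA check, locking, and a dry-run flag, on top
    # of the script-specific options defined above.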
    opts = ExtendedSimpleOption(options)

    try:
        LdapQuery(VscConfiguration())

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {
                'master': master,
                'path': checkjob_path
            }

        checkjob = Checkjob(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

        (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()
        timeinfo = time.time()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            if not opts.options.dry_run:
                try:
                    (path, store) = get_pickle_path(opts.options.location, user)
                    user_queue_information = CheckjobInfo({user: job_information[user]})
                    store(user, path, (timeinfo, user_queue_information))
                    nagios_user_count += 1
                except (UserStorageError, FileStoreError, FileMoveError):
                    logger.exception("Could not store pickle file for user %s" % (user))
                    nagios_no_store += 1
            else:
                logger.info("Dry run, not actually storing data for user %s at path %s" %
                            (user, get_pickle_path(opts.options.location, user)[0]))
                logger.debug("Dry run, queue information for user %s is %s" % (user, job_information[user]))

        stats["store+users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
Example #2
def main():
    # Collect all info

    # Note: the debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, are obtained from the configuration file
    options = {
        "nagios": ("print out nagios information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "location": ("the location for storing the pickle file: home, scratch", str, "store", "home"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(
        NAGIOS_HEADER, opts.options.nagios_check_filename, opts.options.nagios_check_interval_threshold
    )
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DCHECKJOB_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("Starting dcheckjob")

    LdapQuery(VscConfiguration())

    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
        clusters[host] = {"master": master, "path": checkjob_path}

    checkjob = Checkjob(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()
    timeinfo = time.time()

    active_users = job_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Checkjob information: %s" % (job_information))

    nagios_user_count = 0
    nagios_no_store = 0

    for user in active_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store(user, path, (timeinfo, user_queue_information))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError):
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, job_information[user]))