def main():
    """Collect checkjob information for all active users and store a per-user pickle.

    Queries the configured clusters' masters via checkjob, then writes, for each
    active user, a (timestamp, CheckjobInfo) tuple to that user's pickle path.
    Reports store successes/failures through the ExtendedSimpleOption nagios
    machinery.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: home, scratch', str, 'store', 'home'),
    }

    opts = ExtendedSimpleOption(options)

    try:
        LdapQuery(VscConfiguration())

        # Per-host master/checkjob-path pairs come from the configuration file.
        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {
                'master': master,
                'path': checkjob_path
            }

        checkjob = Checkjob(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

        (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()
        timeinfo = time.time()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0
        stats = {}

        for user in active_users:
            if not opts.options.dry_run:
                try:
                    (path, store) = get_pickle_path(opts.options.location, user)
                    user_queue_information = CheckjobInfo({user: job_information[user]})
                    store(user, path, (timeinfo, user_queue_information))
                    nagios_user_count += 1
                except (UserStorageError, FileStoreError, FileMoveError):
                    # logger.exception records the traceback along with the message
                    logger.exception("Could not store pickle file for user %s" % (user))
                    nagios_no_store += 1
            else:
                logger.info("Dry run, not actually storing data for user %s at path %s" %
                            (user, get_pickle_path(opts.options.location, user)[0]))
                logger.debug("Dry run, queue information for user %s is %s" % (user, job_information[user]))

        # was stats["store+users"]: typo, inconsistent with the underscore keys below
        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        # NOTE(review): reconstructed handler — the mangled source ended the try block
        # without any except clause (a syntax error). This follows the conventional
        # ExtendedSimpleOption failure path; confirm against the original script.
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        return

    opts.epilogue("dcheckjob finished", stats)
def main():
    """Collect checkjob information for all active users and store a per-user pickle.

    Legacy variant using simple_option + NagiosReporter directly: handles the
    --nagios reporting short-circuit, the HA-master check, and a pid lockfile
    before querying the clusters and storing per-user (timestamp, CheckjobInfo)
    pickles.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        "nagios": ("print out nagios information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "location": ("the location for storing the pickle file: home, scratch", str, "store", "home"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(
        NAGIOS_HEADER, opts.options.nagios_check_filename, opts.options.nagios_check_interval_threshold
    )

    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        # NOTE(review): this message was split by a raw newline inside the string
        # literal in the mangled source; rejoined into one warning line.
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DCHECKJOB_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("Starting dcheckjob")

    LdapQuery(VscConfiguration())

    # Per-host master/checkjob-path pairs come from the configuration file.
    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
        clusters[host] = {"master": master, "path": checkjob_path}

    # Honour the --dry-run option instead of the hard-coded dry_run=True, which made
    # the checkjob command a dry run on every invocation regardless of the flag
    # (the sibling ExtendedSimpleOption variant already passes the option through).
    checkjob = Checkjob(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()
    timeinfo = time.time()

    active_users = job_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Checkjob information: %s" % (job_information))

    nagios_user_count = 0
    nagios_no_store = 0

    for user in active_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store(user, path, (timeinfo, user_queue_information))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError):
                # logger.exception records the traceback; plain logger.error dropped it
                logger.exception("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, job_information[user]))

    # NOTE(review): the source appears truncated here (no final nagios cache/exit is
    # visible); at minimum the lock taken by lock_or_bork() must be released so the
    # next scheduled run is not blocked by a stale pid lockfile.
    lockfile.release()