def test_parser(self):
    """Test the showq parsers"""
    sq = Showq('clusters')
    master = 'master19.golett.gent.vsc'
    showq = sq.parser(master, SHOWQ_JOBS)

    sq.jobctl = True
    jobctl = sq.parser(master, JOBCTL_JOBS)

    self.assertEqual(showq, jobctl, msg='showq and jobctl commands give same parsed result')
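# A sketch of the nested structure the queue information ends up in, inferred
# from how process_hold() below walks it; the names and values here are
# hypothetical, only the 'DRMJID' key is taken from the code itself:
#
#   {'someuser':                           # per-user dict
#       {'golett':                         # per-cluster dict
#           {'UserHold':                   # per-jobtype list of job dicts
#               [{'DRMJID': '1234567'}],   # each job carries a unique DRM job id
#           },
#       },
#   }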
def process_hold(clusters, dry_run=False):
    """Process a filtered queueinfo dict"""
    releasejob_cache = FileCache(RELEASEJOB_CACHE_FILE)

    # get the showq data
    for data in clusters.values():
        data['path'] = data['spath']  # showq path
    showq = Showq(clusters, cache_pickle=True)
    (queue_information, _, _) = showq.get_moab_command_information()

    # prepare the command to release the jobs
    m = MoabCommand(cache_pickle=False, dry_run=dry_run)
    for data in clusters.values():
        data['path'] = data['mpath']  # mjobctl path
    m.clusters = clusters

    # read the previous data
    ts_data = releasejob_cache.load('queue_information')
    if ts_data is None:
        old_queue_information = {}
    else:
        (_, old_queue_information) = ts_data

    stats = {
        'peruser': 0,
        'total': 0,
        'release': 0,
    }

    release_jobids = []
    for user, clusterdata in list(queue_information.items()):
        oldclusterdata = old_queue_information.setdefault(user, {})
        totaluser = 0
        for cluster, data in list(clusterdata.items()):
            olddata = oldclusterdata.setdefault(cluster, {})
            # DRMJID is supposed to be unique; gather all old job ids in one dict
            oldjobs = {j['DRMJID']: j['_release'] for jt in olddata.values() for j in jt}
            for jobtype, jobs in list(data.items()):
                removeids = []
                for idx, job in enumerate(jobs):
                    jid = job['DRMJID']
                    if jobtype in RELEASEJOB_SUPPORTED_HOLDTYPES:
                        totaluser += 1
                        release = max(oldjobs.get(jid, 0), 0) + 1
                        job['_release'] = release
                        stats['release'] = max(stats['release'], release)
                        release_jobids.append(jid)
                        # release the job
                        cmd = [m.clusters[cluster]['path'], '-u', jid]
                        logger.info("Releasing job %s on cluster %s for the %s-th time." % (jid, cluster, release))
                        if dry_run:
                            logger.info("Dry run %s" % cmd)
                        else:
                            m._run_moab_command(cmd, cluster, [])
                    else:
                        # keep historical data: a previously released job could be idle now,
                        # but keep the counter in case it gets held again
                        try:
                            job['_release'] = oldjobs[jid]
                        except KeyError:
                            # not previously in hold, remove it
                            removeids.append(idx)

                # remove the jobs (in reverse order, so earlier indices stay valid)
                for remove_idx in removeids[::-1]:
                    jobs.pop(remove_idx)

                # cleanup empty jobtype lists
                if len(jobs) == 0:
                    data.pop(jobtype)
            # cleanup empty cluster dicts
            if len(data) == 0:
                clusterdata.pop(cluster)
        # cleanup users without remaining jobs
        if len(clusterdata) == 0:
            queue_information.pop(user)

        # update stats
        stats['peruser'] = max(stats['peruser'], totaluser)
        stats['total'] += totaluser

    logger.info("Release statistics: total jobs in hold %(total)s; "
                "max in hold per user %(peruser)s; max releases per job %(release)s" % stats)

    # update the cache and close it
    releasejob_cache.update('queue_information', queue_information, 0)
    releasejob_cache.close()

    return release_jobids, stats
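# A minimal usage sketch for process_hold(); the cluster name and binary paths
# are hypothetical, but the 'spath' (showq) and 'mpath' (mjobctl) keys are the
# ones the function reads, and the (release_jobids, stats) return value is as
# defined above:
example_clusters = {
    'golett': {
        'master': 'master19.golett.gent.vsc',  # hypothetical master host
        'spath': '/opt/moab/bin/showq',        # hypothetical showq binary
        'mpath': '/opt/moab/bin/mjobctl',      # hypothetical mjobctl binary
    },
}
release_jobids, stats = process_hold(example_clusters, dry_run=True)
logger.info("released %s job(s), stats: %s" % (len(release_jobids), stats))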
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        "nagios": ("print out nagios information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "information": ("the sort of information to store: user, vo, project", None, "store", "user"),
        "location": ("the location for storing the pickle file: gengar, muk", str, "store", "gengar"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(NAGIOS_HEADER, NAGIOS_CHECK_FILENAME, NAGIOS_CHECK_INTERVAL_THRESHOLD)
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DSHOWQ_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting dshowq run")

    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        showq_path = opts.configfile_parser.get(host, "showq_path")
        clusters[host] = {"master": master, "path": showq_path}

    showq = Showq(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
    timeinfo = time.time()

    active_users = list(queue_information.keys())

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Queue information: %s" % (queue_information))

    # We need to determine which users should get an updated pickle. This depends on
    # - the active user set
    # - the information we want to provide on the cluster(set) where this script runs
    # At the same time, we need to determine the job information each user gets to see.
    (target_users, target_queue_information, user_map) = determine_target_information(
        opts.options.information, active_users, queue_information
    )

    nagios_user_count = 0
    nagios_no_store = 0

    LdapQuery(VscConfiguration())

    for user in target_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = target_queue_information[user]
                user_queue_information["timeinfo"] = timeinfo
                store(user, path, (user_queue_information, user_map[user]))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError) as err:
                logger.error("Could not store pickle file for user %s: %s" % (user, err))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, target_queue_information[user]))
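# The per-host settings read above could come from a configuration file along
# these lines; the section content is hypothetical, only the option names
# 'master' and 'showq_path' are taken from the code:
#
#   [golett]
#   master = master19.golett.gent.vsc
#   showq_path = /opt/moab/bin/showq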
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster, will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: gengar, muk', str, 'store', 'gengar'),
    }

    opts = ExtendedSimpleOption(options)

    try:
        LdapQuery(VscConfiguration())

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {
                'master': master,
                'path': showq_path,
            }

        logger.debug("clusters = %s" % (clusters,))
        showq = Showq(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = list(queue_information.keys())

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see.
        (target_users, target_queue_information, user_map) = determine_target_information(
            opts.options.information, active_users, queue_information
        )

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            if not opts.options.dry_run:
                try:
                    (path, store) = get_pickle_path(opts.options.location, user)
                    user_queue_information = target_queue_information[user]
                    user_queue_information['timeinfo'] = timeinfo
                    store(user, path, (user_queue_information, user_map[user]))
                    nagios_user_count += 1
                except (UserStorageError, FileStoreError, FileMoveError) as err:
                    logger.error("Could not store pickle file for user %s: %s" % (user, err))
                    nagios_no_store += 1
            else:
                logger.info("Dry run, not actually storing data for user %s at path %s" %
                            (user, get_pickle_path(opts.options.location, user)[0]))
                logger.debug("Dry run, queue information for user %s is %s" % (user, target_queue_information[user]))

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        # handler assumed from the usual ExtendedSimpleOption pattern:
        # report a critical state through nagios and bail out
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("dshowq finished", stats)
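# Standard entry point, assuming this module is run as the dshowq script itself:
if __name__ == '__main__':
    main()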