def cache_users_from_file(): cache_file = get_cache_filename() try: cache_users_from_fd(open(cache_file, "r")) except Exception, e: htcondor.log( htcondor.LogLevel.Always, "Failed to cache users from file %s: %s" % (cache_file, str(e)))
def cache_users(): url = htcondor.param.get("CMSLPC_USER_URL") if not url: cache_users_from_file() return try: urlfd = urllib.urlopen(url) cache_users_from_fd(urlfd) except Exception, e: htcondor.log(htcondor.LogLevel.Always, "Failed to cache users from URL %s: %s" % (url, str(e))) cache_users_from_file() return
def write_cache_file(): final_fname = get_cache_filename() dirname, prefix = os.path.split(final_fname) fd, name = tempfile.mkstemp(dir=dirname, prefix=prefix) try: for dn in g_cache: os.write(fd, dn + "\n") os.close(fd) os.rename(name, final_fname) except Exception, e: htcondor.log(htcondor.LogLevel.Always, "Failed to write out cache file: %s" % str(e)) try: os.unlink(name) except: pass
def stopjob(info): global runningmasters if 'Name' not in info or 'SlotID' not in info: return name = info['Name'] matchre = "" if 'GLIDEIN_MASTER_NAME' in info: idxname = info['GLIDEIN_MASTER_NAME'] if idxname == name: # stop all jobs under this master matchre = '.*' else: # names of form "slotN@" stop that name and all "slotN_M@" names slotn = re.sub('^(slot[0-9]*)@.*', r'\1', name) if slotn != name: # match any name starting with slotN@ or slotN_ matchre = '^' + slotn + '[@_]' # else take the default of matching only one name else: idxname = name idx = (idxname, info['SlotID']) if idx not in runningmasters: return runningjobs = runningmasters[idx][1] if matchre == "": # no match expression, just stop one if name not in runningjobs: return stopjobnames = [name] else: # select all jobs in this master stopjobnames = runningjobs.keys() if matchre != '.*': # restrict to the matching regular expression regex = re.compile(matchre) stopjobnames = filter(regex.search, stopjobnames) for stopjobname in stopjobnames: loginfo = {} loginfo['Name'] = stopjobname loginfo['SlotID'] = info['SlotID'] loginfo['GlobalJobId'] = runningjobs[stopjobname] htcondor.log(htcondor.LogLevel.Audit, "Job stop: %s" % loginfo) del runningjobs[stopjobname] if len(runningjobs) == 0: del runningmasters[idx]
def cache_users_from_file(): cache_file = get_cache_filename() try: cache_users_from_fd(open(cache_file, "r")) except Exception, e: htcondor.log(htcondor.LogLevel.Always, "Failed to cache users from file %s: %s" % (cache_file, str(e)))
def startjob(info): global maxjobsecs global runningmasters if 'Name' not in info or 'SlotID' not in info or 'GlobalJobId' not in info: return name = info['Name'] if 'GLIDEIN_MASTER_NAME' in info: # Glidein may be partitioned and sometimes tear down all contained # slots at once, so need to track those slots together idxname = info['GLIDEIN_MASTER_NAME'] else: idxname = name idx = (idxname, info['SlotID']) globaljobid = info['GlobalJobId'] now = 0 if idx in runningmasters: thismaster = runningmasters[idx] runningjobs = thismaster[1] if name in runningjobs: if globaljobid == runningjobs[name]: # just an update to a running job, ignore return # first stop the existing job, the slot is being reused stopjob(info) # this may have removed the last job in thismaster, check again if idx not in runningmasters: # new master now = time.time() thismaster = (now, {}) runningmasters[idx] = thismaster # add job to this master thismaster[1][name] = globaljobid printinfo = {} keys = [ 'Name', 'SlotID', 'GlobalJobId', 'RemoteOwner', 'ClientMachine', 'ProjectName', 'Group', 'x509UserProxyVOName', 'x509userproxysubject', 'x509UserProxyEmail' ] for key in keys: if key in info: printinfo[key] = info[key] htcondor.log(htcondor.LogLevel.Audit, "Job start: %s" % printinfo) if now == 0: return # also look for expired jobs at the beginning of the list and stop them for idx in runningmasters: thismaster = runningmasters[idx] deltasecs = int(now - thismaster[0]) if deltasecs <= maxjobsecs: break loginfo = {} loginfo['SlotID'] = idx[1] runningjobs = thismaster[1] for jobname in runningjobs: loginfo['Name'] = jobname loginfo['GlobalJobId'] = runningjobs[jobname] htcondor.log( htcondor.LogLevel.Audit, "Cleaning up %d-second expired job: %s" % (deltasecs, loginfo)) del runningmasters[idx]
from collections import OrderedDict # Dictionary containing all tracked running jobs. # Each entry is for a 'master', which is either a pilot job/glidein or # individual job. # The index of the dictionary is a tuple of (mastername, slotid). # The contents of each entry is a tuple of (starttime, jobs), where # jobs is a dictionary of individual job names running in that master # and each entry has a value of the GlobalJobId of that job. runningmasters = OrderedDict() if 'AUDIT_PAYLOAD_MAX_HOURS' in htcondor.param: maxjobhours = int(htcondor.param['AUDIT_PAYLOAD_MAX_HOURS']) else: maxjobhours = 3 * 24 htcondor.log(htcondor.LogLevel.Audit, "Audit payload maximum job hours: %d" % maxjobhours) maxjobsecs = maxjobhours * 60 * 60 # a job may be being stopped def stopjob(info): global runningmasters if 'Name' not in info or 'SlotID' not in info: return name = info['Name'] matchre = "" if 'GLIDEIN_MASTER_NAME' in info: idxname = info['GLIDEIN_MASTER_NAME'] if idxname == name: # stop all jobs under this master matchre = '.*'