jobSpec) if not fileCheckInJEDI: jobSpec.jobStatus = 'closed' jobSpec.jobSubStatus = 'cojumbo_wrong' jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn taskBuffer.archiveJobs([jobSpec], False, True) tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format( len(coJumboTokill))) if len(coJumboTokill) > 0: jediJobs = list(coJumboTokill) nJob = 100 iJob = 0 while iJob < len(jediJobs): tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob])) Client.killJobs(jediJobs[iJob:iJob + nJob], 51, keepUnmerged=True) iJob += nJob except Exception: errStr = traceback.format_exc() tmpLog.error(errStr) tmpLog.debug("Fork session") # thread for fork class ForkThr(threading.Thread): def __init__(self, fileName): threading.Thread.__init__(self) self.fileName = fileName
if id not in jobsMap[prio]: jobsMap[prio].append(id) # order by PandaID and currentPriority jobs = [] prioList = list(jobsMap) prioList.sort() for prio in prioList: # reverse order by PandaID to kill newer jobs ids = jobsMap[prio] ids.sort() ids.reverse() jobs += ids if options.maxJobs is not None: jobs = jobs[:int(options.maxJobs)] print('The number of jobs with priorities below %s : %s' % (args[0], len(jobs))) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print('kill %s' % str(jobs[iJob:iJob + nJob])) if options.forceKill: Client.killJobs(jobs[iJob:iJob + nJob], 9) else: Client.killJobs(jobs[iJob:iJob + nJob]) iJob += nJob time.sleep(1)
# Close the prodSourceLabel IN (...) clause built above.
srcSQL += ')'
# Collect PandaIDs of the user's jobs from all live job tables,
# preserving first-seen order.
jobs = []
seenIDs = set()  # O(1) duplicate check instead of scanning the list each time
tables = [
    'ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4',
    'ATLAS_PANDA.jobsDefined4'
]
for table in tables:
    sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (
        table, srcSQL)
    # optional narrowing by jobDefinitionID and/or jobsetID
    if options.jobID is not None:
        sql += "AND jobDefinitionID=:jobDefinitionID "
    if options.jobsetID not in (None, 'all'):
        sql += "AND jobsetID=:jobsetID "
    sql += "ORDER BY PandaID "
    status, res = proxyS.querySQLS(sql, varMap)
    if res is not None:
        for pandaID, in res:
            if pandaID not in seenIDs:
                seenIDs.add(pandaID)
                jobs.append(pandaID)
# kill in chunks of 1000
if jobs:
    iJob = 0
    nJob = 1000
    while iJob < len(jobs):
        subJobs = jobs[iJob:iJob + nJob]
        print("kill %s %s/%s" % (str(subJobs), iJob, len(jobs)))
        Client.killJobs(subJobs, code=9)
        iJob += nJob
else:
    print("no job was killed")
# Split the query result into plain (pilot-owned) jobs and JEDI-owned jobs.
status, res = proxyS.querySQLS(sql, varMap)
if res is not None:
    for (pandaID, lockedby) in res:
        if lockedby == 'jedi':
            jediJobs.append(pandaID)
        else:
            jobs.append(pandaID)
# reassign
jobs.sort()
if jobs:
    nJob = 100
    iJob = 0
    while iJob < len(jobs):
        print('reassign %s' % str(jobs[iJob:iJob + nJob]))
        Client.reassignJobs(jobs[iJob:iJob + nJob])
        iJob += nJob
        # pause between chunks to avoid hammering the server
        time.sleep(10)
# JEDI-owned jobs are not reassigned here; they are killed with codeV instead
if len(jediJobs) != 0:
    nJob = 100
    iJob = 0
    while iJob < len(jediJobs):
        print('kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob]))
        Client.killJobs(jediJobs[iJob:iJob + nJob], codeV,
                        keepUnmerged=options.keepUnmerged)
        iJob += nJob
# len(a)+len(b) avoids building a throwaway concatenated list just to count it
print('\nreassigned {0} jobs'.format(len(jobs) + len(jediJobs)))
# Kill every 'managed' job of a task whose PandaID falls in the requested
# inclusive range, scanning the active/waiting/defined job tables.
proxyS = DBProxy()
proxyS.connect(panda_config.dbhost, panda_config.dbpasswd,
               panda_config.dbuser, panda_config.dbname)
varMap = {
    ':prodSourceLabel': 'managed',
    ':taskID': args[0],
    ':pandaIDl': args[1],
    ':pandaIDu': args[2],
}
sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID"
jobs = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4', 'ATLAS_PANDA.jobsDefined4']:
    status, res = proxyS.querySQLS(sql % table, varMap)
    if res is None:
        continue
    for pandaID, in res:
        # keep first-seen order, skipping IDs already collected from
        # an earlier table
        if pandaID not in jobs:
            jobs.append(pandaID)
print('The number of jobs to be killed : %s' % len(jobs))
# kill in chunks of 100, pausing briefly between chunks
if jobs:
    nJob = 100
    for iJob in range(0, len(jobs), nJob):
        chunk = jobs[iJob:iJob + nJob]
        print('kill %s' % str(chunk))
        if options.forceKill:
            Client.killJobs(chunk, 9, useMailAsID=useMailAsIDV)
        else:
            Client.killJobs(chunk, useMailAsID=useMailAsIDV)
        time.sleep(1)
# NOTE(review): this chunk is cut mid if/else — the condition selecting
# jobsDefined4 vs jobsActive4 is outside the visible region, so the code is
# kept verbatim. What it does: pick a job table, select jobs matching the
# bound status/site/time/label, split them into pilot-owned vs JEDI-owned,
# then reassign the former in chunks of 100 and kill the latter with code 51.
# BUG(review): len(res) is evaluated in the "got ... jobs" print BEFORE the
# "res is not None" guard below — if querySQLS can return None (as that guard
# implies), the print raises TypeError; move it under the guard.
sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsDefined4 " else: sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsActive4 " sql += "WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" status, res = proxyS.querySQLS(sql, varMap) print("got {0} jobs".format(len(res))) jobs = [] jediJobs = [] if res is not None: for (id, lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print('reassign %s' % str(jobs[iJob:iJob + nJob])) Client.reassignJobs(jobs[iJob:iJob + nJob]) iJob += nJob if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print('kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob])) Client.killJobs(jediJobs[iJob:iJob + nJob], 51) iJob += nJob
# Resolve the kill code from the command-line options.
if options.forceKill:
    codeV = 9
elif options.killUserJobs:
    codeV = 91
else:
    try:
        codeV = int(options.codeV)
    except Exception:
        # non-numeric or absent code: leave codeV as previously set
        pass
if options.killOwnProdJobs:
    useMailAsIDV = True
# Build the target PandaIDs: a single ID, or an inclusive ID range.
if len(args) == 1:
    targets = [args[0]]
else:
    startID = int(args[0])
    endID = int(args[1])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    targets = range(startID, endID + 1)
Client.killJobs(targets, code=codeV, useMailAsID=useMailAsIDV,
                keepUnmerged=options.keepUnmerged,
                jobSubStatus=options.jobSubStatus)
import sys

import pandaserver.userinterface.Client as Client

# Kill jobs by jobDefinitionID: either a single ID (one argument) or an
# inclusive ID range (two arguments).
if len(sys.argv) < 2:
    # no arguments at all previously crashed with IndexError
    print('Usage: %s <jobDefID> | <startID> <endID>' % sys.argv[0])
    sys.exit(1)
if len(sys.argv) == 2:
    jobDefIDs = [sys.argv[1]]
else:
    startID = int(sys.argv[1])
    endID = int(sys.argv[2])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    jobDefIDs = range(startID, endID + 1)
# query PandaID
status, ids = Client.queryPandaIDs(jobDefIDs)
if status != 0:
    # query failed; nothing to do (exit code 0 kept for compatibility)
    sys.exit(0)
# remove None entries (jobDefIDs that did not resolve to a PandaID);
# one linear pass instead of the quadratic while/remove loop
ids = [tmpID for tmpID in ids if tmpID is not None]
# kill
if ids:
    Client.killJobs(ids)
def main(argv=tuple(), tbuf=None, **kwargs):
    """Periodic server housekeeping.

    Deletes finished/cancelled rows from jobsDefined4, aggregates pilot
    getJob/updateJob counts from the dispatcher log into site data,
    records nRunning, finalizes or kills co-jumbo jobs, and forks
    forkSetupper processes for queued set.* files.

    argv and kwargs are accepted for interface compatibility but unused.
    tbuf: an already-initialized TaskBuffer; when None, the module
    singleton is initialized with a single DB connection.
    """
    # py2 compatibility: define 'long' locally when the builtin is missing (py3)
    try:
        long
    except NameError:
        long = int
    tmpLog = LogWrapper(_logger, None)
    tmpLog.debug("===================== start =====================")
    # current minute
    currentMinute = datetime.datetime.utcnow().minute
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)
    # delete finished/stale rows from jobsDefined4 below the current max PandaID
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        # best-effort cleanup: failures here are intentionally ignored
        except Exception:
            pass
    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3,
                                        minute=2,
                                        second=0,
                                        microsecond=0)
        # skip within a +/- 5 minute window around the daily logrotate time
        if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
                (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(
                hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                # NOTE(review): pattern is not a raw string; '\d' escapes
                # trigger DeprecationWarning on modern Python
                match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                  loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files: current log, plus today's rotated .gz when needed
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow(
            ).strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files, tallying requests per site/method/node
            pilotCounts = {}   # last 3 hours
            pilotCountsS = {}  # last 1 hour ("short" window)
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string: timestamp followed by method/site/node/type fields
                sStr = '^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += 'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)
                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update both aggregation windows in the site data table
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))
    # nRunning
    tmpLog.debug("nRunning session")
    try:
        # NOTE(review): '/' is true division on py3, so the float modulo only
        # equals nrun_snum when currentMinute is exactly divisible by
        # nrun_interval; original py2 code likely intended '//' — confirm.
        if (currentMinute / panda_config.nrun_interval
            ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))
    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            # close co-jumbo jobs whose input-file status check fails,
            # per source table (Active / Defined / Waiting)
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                # kill in chunks of 100 with code 51
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob +
                                                              nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob],
                                    51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)
    tmpLog.debug("Fork session")

    # thread for fork: runs forkSetupper.py on one queued set.* file
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(
                prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            # set.NULL.* files get the -t flag
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)
    # the max number of threads
    maxThr = 10
    nThr = 0
    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            # NOTE(review): struct_time[:7] includes tm_wday, which lands in
            # the datetime microsecond argument; probably meant [:6] —
            # harmless (wday is 0-6) but verify.
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    # note: permits up to maxThr+1 started threads
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))
    # join fork threads
    for thr in forkThrList:
        thr.join()
    # terminate TaskBuffer IF
    # taskBufferIF.terminate()
    tmpLog.debug("===================== end =====================")