class CloserAtlasPlugin:
    # constructor
    def __init__(self, job, datasets, log):
        self.jobSpec = job
        self.datasets = datasets
        self.tmpLog = LogWrapper(log, "{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))

    # execute
    def execute(self):
        try:
            # only for production
            if not self.jobSpec.prodSourceLabel in ['managed', 'test']:
                return True
            # only for urgent or high prio
            if not self.jobSpec.processingType in ['urgent'] and self.jobSpec.currentPriority <= 1000:
                return True
            # close datasets
            for datasetSpec in self.datasets:
                if re.search('_sub\d+$', datasetSpec.name) == None:
                    continue
                if datasetSpec.status != 'tobeclosed':
                    continue
                try:
                    self.tmpLog.debug('immediate close {0}'.format(datasetSpec.name))
                    rucioAPI.closeDataset(datasetSpec.name)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.tmpLog.warning('failed to close : {0} {1}'.format(errtype, errvalue))
        except:
            errtype, errvalue = sys.exc_info()[:2]
            self.tmpLog.warning('failed to execute : {0} {1}'.format(errtype, errvalue))
        return True
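# --- illustrative usage sketch (assumption, not part of the original module) ---
# A caller holding a JobSpec and the _sub dataset specs it produced could drive
# the plugin as below; jobSpec, datasetSpecs and _logger are hypothetical names
# for objects obtained elsewhere, while the constructor signature and execute()
# come from the class above.
#
#   plugin = CloserAtlasPlugin(jobSpec, datasetSpecs, _logger)
#   plugin.execute()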
def run(self):
    # get logger
    tmpLog = LogWrapper(_logger, '<vuid={0} site={1} name={2}>'.format(self.vuid, self.site, self.dataset))
    # query dataset
    tmpLog.debug("start")
    if self.vuid != None:
        dataset = self.taskBuffer.queryDatasetWithMap({'vuid': self.vuid})
    else:
        dataset = self.taskBuffer.queryDatasetWithMap({'name': self.dataset})
    if dataset == None:
        tmpLog.error("Not found")
        tmpLog.debug("end")
        return
    tmpLog.debug("type:%s name:%s" % (dataset.type, dataset.name))
    if dataset.type == 'dispatch':
        # activate jobs in jobsDefined
        Activator(self.taskBuffer, dataset).start()
    if dataset.type == 'output':
        if dataset.name != None and re.search('^panda\..*_zip$', dataset.name) != None:
            # start unmerge jobs
            Activator(self.taskBuffer, dataset, enforce=True).start()
        else:
            # finish transferring jobs
            Finisher(self.taskBuffer, dataset, site=self.site).start()
    tmpLog.debug("end")
def _getPFNFromLFC(lfns, dq2url, guids, storageName, scopeList=[], tmpLog=None):
    if tmpLog == None:
        tmpLog = LogWrapper(_log, logPrefix)
    tmpLog.debug('_getPFNFromLFC %s %s / %s LFNs:%s %s' % (dq2url, str(storageName),
                                                           len(lfns), str(lfns[:3]), str(scopeList[:3])))
    outStr = ''
    # check parameters
    if guids == [] or storageName == [] or (len(lfns) != len(guids)):
        tmpLog.debug('_getPFNFromLFC done with empty list')
        return outStr
    # check scopeList
    if not scopeList in [None, []] and len(lfns) != len(scopeList):
        tmpLog.warning('_getPFNFromLFC wrong scopeList %s %s %s %s' % (dq2url, str(storageName),
                                                                       str(lfns), str(scopeList)))
        tmpLog.error('_getPFNFromLFC failed')
        return outStr
    # loop over all LFNs
    iLFN = 0
    nLFN = 1000
    strFiles = ''
    outStr = ''
    for iLFN in range(len(lfns)):
        if scopeList != []:
            strFiles += '%s %s %s\n' % (lfns[iLFN], guids[iLFN], scopeList[iLFN])
        else:
            strFiles += '%s %s\n' % (lfns[iLFN], guids[iLFN])
        # bulk operation
        if (iLFN + 1) % nLFN == 0 or (iLFN + 1) >= len(lfns):
            # write to file
            inFileName = '%s/lfcin.%s' % (panda_config.logdir, commands.getoutput('uuidgen'))
            ifile = open(inFileName, 'w')
            ifile.write(strFiles)
            ifile.close()
            # construct commands
            strStorage = ''
            for storage in storageName:
                strStorage += '%s,' % storage
            strStorage = strStorage[:-1]
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd, panda_config.home_dir_cwd)
            com += 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
            com += 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
                   (panda_config.glite_source, panda_config.native_python32, panda_config.lfcClient_dir,
                    inFileName, dq2url, strStorage)
            tmpLog.debug(com)
            # execute
            status, output = commands.getstatusoutput(com)
            tmpLog.debug(status)
            if status == 0:
                outStr += output
            else:
                tmpLog.error("_getPFNFromLFC : %s %s %s" % (dq2url, status, output))
                # send message to logger
                try:
                    # make message
                    message = 'LFC access : %s %s %s' % (dq2url, status, output)
                    # get logger
                    _pandaLogger = PandaLogger()
                    _pandaLogger.lock()
                    _pandaLogger.setParams({'Type': 'broker_util'})
                    logger = _pandaLogger.getHttpLogger(panda_config.loggername)
                    # add message
                    logger.error(message)
                    # release HTTP handler
                    _pandaLogger.release()
                except:
                    pass
                tmpLog.error('_getPFNFromLFC failed')
                return status
            # reset
            strFiles = ''
    tmpLog.debug('_getPFNFromLFC done')
    # return
    return outStr
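# --- illustrative call sketch (assumption, not part of the original module) ---
# getFilesFromLRC() invokes this helper for lfc:// and rucio:// catalog URLs; a
# direct call would look like the following, where the LFN, GUID and scope lists
# are positionally aligned and storageName lists candidate storage tokens. All
# literal values below are hypothetical placeholders.
#
#   out = _getPFNFromLFC(['file1.root', 'file2.root'], 'rucio://rucio-host/',
#                        ['guid-1', 'guid-2'], ['SITE_DATADISK'],
#                        scopeList=['scope1', 'scope1'])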
class EventPicker:
    # constructor
    def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.ignoreError = ignoreError
        self.evpFileName = evpFileName
        self.token = datetime.datetime.utcnow().isoformat(' ')
        # logger
        self.logger = LogWrapper(_logger, self.token)
        self.pd2p = DynDataDistributer.DynDataDistributer([], self.taskBuffer, self.siteMapper,
                                                          token=' ', logger=self.logger)
        self.userDatasetName = ''
        self.creationTime = ''
        self.params = ''
        self.lockedBy = ''
        self.evpFile = None
        self.userTaskName = ''
        # message buffer
        self.msgBuffer = []
        self.lineLimit = 100
        # JEDI
        self.jediTaskID = None

    # main
    def run(self):
        try:
            self.putLog('start %s' % self.evpFileName)
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList = []
            eventPickDataType = ''
            eventPickStreamName = ''
            eventPickDS = []
            eventPickAmiTag = ''
            eventPickNumSites = 1
            inputFileList = []
            tagDsList = []
            tagQuery = ''
            tagStreamRef = ''
            skipDaTRI = False
            runEvtGuidMap = {}
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
                # check format
                if tmpMatch == None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    try:
                        exec "runEvtGuidMap=" + tmpItems[1]
                    except:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                    if tmpMatch != None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN, self.userTaskName)
            # convert
            if tagDsList == [] or tagQuery == '':
                # convert run/event list to dataset/file list
                tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList,
                                                                                  eventPickDataType,
                                                                                  eventPickStreamName,
                                                                                  eventPickDS,
                                                                                  eventPickAmiTag,
                                                                                  self.userDN,
                                                                                  runEvtGuidMap)
                if not tmpRet:
                    if 'isFatal' in locationMap and locationMap['isFatal'] == True:
                        self.ignoreError = False
                    self.endWithError('Failed to convert the run/event list to a dataset/file list')
                    return False
            else:
                # get parent dataset/files with TAG
                tmpRet, locationMap, allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList, tagQuery, tagStreamRef)
                if not tmpRet:
                    self.endWithError('Failed to get parent dataset/file list with TAG')
                    return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName, allFiles,
                                                                    locationMap,
                                                                    nSites=eventPickNumSites,
                                                                    owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' % self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet, candidateMaps = self.pd2p.getCandidates(self.userDatasetName,
                                                                checkUsedFile=False,
                                                                useHidden=True)
                if not tmpRet:
                    self.endWithError('Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = []
                for tmpDS, tmpDsVal in candidateMaps.iteritems():
                    for tmpCloud, tmpCloudVal in tmpDsVal.iteritems():
                        for tmpSiteName in tmpCloudVal[0]:
                            if not tmpSiteName in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' % self.userDatasetName)
                        return False
                    userDatasetNameList = tmpOut.keys()
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet, totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get the size of %s' % tmpUserDatasetName)
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    self.putLog("run brokerage for %s" % tmpDS)
                    brokerage.broker.schedule([tmpJob], self.taskBuffer, self.siteMapper, True,
                                              allCandidates, True, datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' % tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus, userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError, 'user info not found for {0} with {1}'.format(tmpDN, userInfo)
                        tmpDN = userInfo['nickname']
                        tmpDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm
                        tmpMsg = "%s ds=%s site=%s id=%s" % ('registerDatasetLocation for DaTRI ',
                                                             tmpUserDatasetName, tmpDQ2ID, tmpDN)
                        self.putLog(tmpMsg)
                        rucioAPI.registerDatasetLocation(tmpDS, [tmpDQ2ID], lifetime=14, owner=tmpDN,
                                                         activity="User Subscriptions")
                        self.putLog('OK')
                    except:
                        errType, errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (errType, errValue)
                        tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True, tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except:
            errType, errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' % (errType, errValue, traceback.format_exc()))
            return False

    # end with error
    def endWithError(self, message):
        self.putLog(message, 'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False, message)
        except:
            pass
        # upload log
        if self.jediTaskID != None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(self.jediTaskID, 'event picking failed. ' + outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID, 'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)

    # put log
    def putLog(self, msg, type='debug'):
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)

    # send email notification
    def sendEmail(self, isSucceeded, message):
        # mail address
        toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN, 'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status : Passed to Rucio\n"
        else:
            mailBody += "Status : Failed\n"
        mailBody += "Created : %s\n" % self.creationTime
        mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        mailBody += "Dataset : %s\n" % self.userDatasetName
        mailBody += "\n"
        mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params)
        mailBody += "\n"
        mailBody += "%s\n" % message
        # send
        retVal = MailUtils().send(toAdder, mailSubject, mailBody)
        # return
        return

    # upload log
    def uploadLog(self):
        if self.jediTaskID == None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s, o = Client.uploadLog(strMsg, self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
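# --- illustrative usage sketch (assumption, not part of the original module) ---
# The picker is driven per event-picking request file; taskBuffer and siteMapper
# are the initialized instances shown in the surrounding scripts, and the .evp
# path below is a hypothetical example.
#
#   picker = EventPicker(taskBuffer, siteMapper, '/path/to/request.evp', ignoreError=True)
#   picker.run()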
def updateJob(req, jobId, state, token=None, transExitCode=None, pilotErrorCode=None,
              pilotErrorDiag=None, timestamp=None, timeout=60, xml='', node=None,
              workdir=None, cpuConsumptionTime=None, cpuConsumptionUnit=None,
              remainingSpace=None, schedulerID=None, pilotID=None, siteName=None,
              messageLevel=None, pilotLog='', metaData='', cpuConversionFactor=None,
              exeErrorCode=None, exeErrorDiag=None, pilotTiming=None, computingElement=None,
              startTime=None, endTime=None, nEvents=None, nInputFiles=None, batchID=None,
              attemptNr=None, jobMetrics=None, stdout='', jobSubStatus=None, coreCount=None,
              maxRSS=None, maxVMEM=None, maxSWAP=None, maxPSS=None,
              avgRSS=None, avgVMEM=None, avgSWAP=None, avgPSS=None):
    tmpLog = LogWrapper(_logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid()))
    tmpLog.debug('start')
    # get DN
    realDN = _getDN(req)
    # get FQANs
    fqans = _getFQAN(req)
    # check production role
    prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName,
                             hostname=req.get_remote_host())
    # check token
    validToken = _checkToken(token, jobDispatcher)
    # accept json
    acceptJson = req.acceptJson()
    _logger.debug(
        "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
        "attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,"
        "maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s"
        "\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)"
        % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node, workdir,
           cpuConsumptionTime, cpuConsumptionUnit, remainingSpace, schedulerID, pilotID,
           siteName, messageLevel, nEvents, nInputFiles, cpuConversionFactor, exeErrorCode,
           exeErrorDiag, pilotTiming, computingElement, startTime, endTime, batchID,
           attemptNr, jobSubStatus, coreCount, realDN, prodManager, token, validToken,
           str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM, avgSWAP, avgPSS,
           xml, pilotLog, metaData, jobMetrics, stdout))
    _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' % (siteName, node))
    # invalid role
    if not prodManager:
        _logger.warning("updateJob(%s) : invalid role" % jobId)
        return Protocol.Response(Protocol.SC_Role).encode(acceptJson)
    # invalid token
    if not validToken:
        _logger.warning("updateJob(%s) : invalid token" % jobId)
        return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson)
    # aborting message
    if jobId == 'NULL':
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # check status
    if not state in ['running', 'failed', 'finished', 'holding', 'starting', 'transferring']:
        _logger.warning("invalid state=%s for updateJob" % state)
        return Protocol.Response(Protocol.SC_Success).encode(acceptJson)
    # pilot log
    tmpLog.debug('sending log')
    if pilotLog != '':
        try:
            # make message
            message = pilotLog
            # get logger
            _pandaLogger = PandaLogger()
            _pandaLogger.lock()
            _pandaLogger.setParams({'Type': 'pilotLog', 'PandaID': int(jobId)})
            logger = _pandaLogger.getHttpLogger(panda_config.loggername)
            # add message
            logger.info(message)
        except:
            tmpLog.debug('failed to send log')
        finally:
            tmpLog.debug('release lock')
            try:
                # release HTTP handler
                _pandaLogger.release()
            except:
                pass
    tmpLog.debug('done log')
    # create parameter map
    param = {}
    if cpuConsumptionTime != None: param['cpuConsumptionTime'] = cpuConsumptionTime
    if cpuConsumptionUnit != None: param['cpuConsumptionUnit'] = cpuConsumptionUnit
    if node != None: param['modificationHost'] = node[:128]
    if transExitCode != None: param['transExitCode'] = transExitCode
    if pilotErrorCode != None: param['pilotErrorCode'] = pilotErrorCode
    if pilotErrorDiag != None: param['pilotErrorDiag'] = pilotErrorDiag[:500]
    if jobMetrics != None: param['jobMetrics'] = jobMetrics[:500]
    if schedulerID != None: param['schedulerID'] = schedulerID
    if pilotID != None: param['pilotID'] = pilotID[:200]
    if batchID != None: param['batchID'] = batchID[:80]
    if exeErrorCode != None: param['exeErrorCode'] = exeErrorCode
    if exeErrorDiag != None: param['exeErrorDiag'] = exeErrorDiag[:500]
    if cpuConversionFactor != None: param['cpuConversion'] = cpuConversionFactor
    if pilotTiming != None: param['pilotTiming'] = pilotTiming
    if computingElement != None: param['computingElement'] = computingElement
    if nEvents != None: param['nEvents'] = nEvents
    if nInputFiles != None: param['nInputFiles'] = nInputFiles
    if not jobSubStatus in [None, '']: param['jobSubStatus'] = jobSubStatus
    if not coreCount in [None, '']: param['actualCoreCount'] = coreCount
    if maxRSS != None: param['maxRSS'] = maxRSS
    if maxVMEM != None: param['maxVMEM'] = maxVMEM
    if maxSWAP != None: param['maxSWAP'] = maxSWAP
    if maxPSS != None: param['maxPSS'] = maxPSS
    if avgRSS != None: param['avgRSS'] = avgRSS
    if avgVMEM != None: param['avgVMEM'] = avgVMEM
    if avgSWAP != None: param['avgSWAP'] = avgSWAP
    if avgPSS != None: param['avgPSS'] = avgPSS
    if startTime != None:
        try:
            param['startTime'] = datetime.datetime(*time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if endTime != None:
        try:
            param['endTime'] = datetime.datetime(*time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6])
        except:
            pass
    if attemptNr != None:
        try:
            attemptNr = int(attemptNr)
        except:
            attemptNr = None
    if stdout != '':
        stdout = stdout[:2048]
    # invoke JD
    tmpLog.debug('executing')
    return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml, siteName,
                                   param, metaData, attemptNr, stdout, acceptJson)
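# --- illustrative note (assumption, not part of the original handler) ---
# startTime/endTime are accepted as 'YYYY-MM-DD HH:MM:SS' strings and converted
# with time.strptime as above; for example (hypothetical value):
#
#   datetime.datetime(*time.strptime('2016-01-31 12:00:00', '%Y-%m-%d %H:%M:%S')[:6])
#   # -> datetime.datetime(2016, 1, 31, 12, 0)
#
# Malformed timestamps are silently dropped by the surrounding try/except.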
import datetime
import traceback

from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
    varMap[':prodSourceLabel'] = 'user'
    if table == 'ATLAS_PANDA.jobsActive4':
def run(self):
    try:
        # make a message instance
        tmpLog = LogWrapper(_logger, None)
        # run main procedure in the same process
        if not self.forkRun:
            tmpLog.debug('main start')
            tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
            # group jobs per VO
            voJobsMap = {}
            ddmFreeJobs = []
            tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
            for tmpJob in self.jobs:
                # set VO=local for DDM free
                if tmpJob.destinationSE == 'local':
                    tmpVO = 'local'
                else:
                    tmpVO = tmpJob.VO
                # make map
                if not voJobsMap.has_key(tmpVO):
                    voJobsMap[tmpVO] = []
                voJobsMap[tmpVO].append(tmpJob)
            # loop over all VOs
            for tmpVO, tmpJobList in voJobsMap.iteritems():
                tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO, len(tmpJobList)))
                # get plugin
                setupperPluginClass = panda_config.getPlugin('setupper_plugins', tmpVO)
                if setupperPluginClass == None:
                    # use ATLAS plug-in by default
                    from SetupperAtlasPlugin import SetupperAtlasPlugin
                    setupperPluginClass = SetupperAtlasPlugin
                tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                try:
                    # make plugin
                    setupperPlugin = setupperPluginClass(self.taskBuffer, self.jobs, tmpLog,
                                                         resubmit=self.resubmit,
                                                         pandaDDM=self.pandaDDM,
                                                         ddmAttempt=self.ddmAttempt,
                                                         onlyTA=self.onlyTA,
                                                         firstSubmission=self.firstSubmission)
                    # run plugin
                    tmpLog.debug('run plugin')
                    setupperPlugin.run()
                    # go forward if not TA
                    if not self.onlyTA:
                        # update jobs
                        tmpLog.debug('update jobs')
                        self.updateJobs(setupperPlugin.jobs + setupperPlugin.jumboJobs, tmpLog)
                        # execute post process
                        tmpLog.debug('post execute plugin')
                        setupperPlugin.postRun()
                    tmpLog.debug('done plugin')
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
            tmpLog.debug('main end')
        else:
            tmpLog.debug('fork start')
            # write jobs to file
            import os
            import cPickle as pickle
            outFileName = '%s/set.%s_%s' % (panda_config.logdir, self.jobs[0].PandaID,
                                            commands.getoutput('uuidgen'))
            outFile = open(outFileName, 'w')
            pickle.dump(self.jobs, outFile)
            outFile.close()
            # run main procedure in another process because python doesn't release memory
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,
                                                                panda_config.home_dir_cwd)
            com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                   (panda_config.pandaCommon_dir, panda_config.pandaPython_dir,
                    panda_config.native_python, panda_config.pandaPython_dir, outFileName)
            if self.onlyTA:
                com += " -t"
            if not self.firstSubmission:
                com += " -f"
            tmpLog.debug(com)
            # execute
            status, output = self.taskBuffer.processLimiter.getstatusoutput(com)
            tmpLog.debug("return from main process: %s %s" % (status, output))
            tmpLog.debug('fork end')
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('master failed with {0}:{1}'.format(errtype, errvalue))
from taskbuffer.TaskBuffer import taskBuffer
from taskbuffer.WorkerSpec import WorkerSpec
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('harvesterCtl')
tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# kill old process
try:
    # time limit
    timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout)
    # get process list
def getFilesFromLRC(files, url, guids=[], storageName=[], terminateWhenFailed=False,
                    getPFN=False, scopeList=[]):
    tmpLog = LogWrapper(_log, None)
    tmpLog.debug('getFilesFromLRC "%s" %s' % (url, str(storageName)))
    # get PFC
    outSTR = ''
    if url.startswith('mysql://'):
        # from MySQL
        outSTR = _getPFNFromMySQL(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            # FIXME
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('http://'):
        # from HTTP I/F
        outSTR = _getPoolFileCatalog(files, url)
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    root = xml.dom.minidom.parseString(outSTR)
                    fileNodes = root.getElementsByTagName('File')
                    for file in fileNodes:
                        # get PFN and LFN nodes
                        physical = file.getElementsByTagName('physical')[0]
                        pfnNode = physical.getElementsByTagName('pfn')[0]
                        logical = file.getElementsByTagName('logical')[0]
                        lfnNode = logical.getElementsByTagName('lfn')[0]
                        # convert UTF8 to Raw
                        pfn = str(pfnNode.getAttribute('name'))
                        lfn = str(lfnNode.getAttribute('name'))
                        # assign
                        if not outPFN.has_key(lfn):
                            outPFN[lfn] = []
                        outPFN[lfn].append(pfn)
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse XML - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s ' % str(outPFN))
            return outPFN
    elif url.startswith('lfc://') or url.startswith('rucio://'):
        # from LFC
        timeStart = datetime.datetime.utcnow()
        outSTR = _getPFNFromLFC(files, url, guids, storageName, scopeList=scopeList, tmpLog=tmpLog)
        regTime = datetime.datetime.utcnow() - timeStart
        tmpLog.debug('file lookup for %s LFNs from %s took %s.%03d sec' % (len(files), url, regTime.seconds,
                                                                           regTime.microseconds / 1000))
        # get PFN
        if getPFN:
            outPFN = {}
            try:
                if not outSTR in ['', None]:
                    tmpItems = outSTR.split('LFCRet :')
                    tmpItems.remove('')
                    # loop over all returns
                    for tmpItem in tmpItems:
                        exec "tmpLFNmap = %s" % tmpItem
                        for tmpLFN, tmpPFN in tmpLFNmap.iteritems():
                            outPFN[tmpLFN] = tmpPFN
            except:
                type, value, traceBack = sys.exc_info()
                tmpLog.error(outSTR)
                tmpLog.error("could not parse LFC ret - %s %s" % (type, value))
            tmpLog.debug('RetPFN:%s files' % len(outPFN))
            return outPFN
    # check return
    if not isinstance(outSTR, types.StringType):
        if terminateWhenFailed:
            return None
        # set empty string
        outSTR = ''
    # collect OK Files
    okFiles = []
    for file in files:
        if re.search(file, outSTR) != None:
            okFiles.append(file)
    tmpLog.debug('Ret:%s / %s files' % (str(okFiles[:3]), len(okFiles)))
    return okFiles
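# --- illustrative usage sketch (assumption, not part of the original module) ---
# The catalog URL scheme selects the backend (mysql://, http://, lfc:// or
# rucio://); the values below are hypothetical placeholders.
#
#   okFiles = getFilesFromLRC(['file1.root', 'file2.root'], 'http://pfc-host/pfc',
#                             terminateWhenFailed=True)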
from taskbuffer import EventServiceUtils
from pandalogger.PandaLogger import PandaLogger
from brokerage.SiteMapper import SiteMapper
from pandautils import PandaUtils
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('add')
tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3

# current minute
currentMinute = datetime.datetime.utcnow().minute

# kill old process
try:
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

if __name__ == '__main__':
    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']
    # get users
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {}
    varMap[':patt'] = '%p%'
class AdderGen:
    # constructor
    def __init__(self, taskBuffer, jobID, jobStatus, xmlFile, ignoreTmpError=True, siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {'surl': {}, 'nevents': {}, 'lbnr': {}}
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$', tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger, self.jobID)

    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                                fromArchived=False, fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'cancelled', 'merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
            else:
                # check file status in JEDI
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                if fileCheckInJEDI == None:
                    raise RuntimeError, 'failed to check file status in JEDI'
                if fileCheckInJEDI == False:
                    # set job status to failed since some file status is wrong in JEDI
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "wrong file status in JEDI"
                    self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus))
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # set VO=local for DDM free
                        if self.job.destinationSE == 'local':
                            tmpVO = 'local'
                        else:
                            tmpVO = self.job.VO
                        # instantiate concrete plugin
                        adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
                        if adderPluginClass == None:
                            # use ATLAS plugin by default
                            from AdderAtlasPlugin import AdderAtlasPlugin
                            adderPluginClass = AdderAtlasPlugin
                        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job], False, oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        type, value, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (type, value))
                        self.logger.debug("cannot unlock XML")
                    return
                # setup for closer
                if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                    destDBList = []
                    guidList = []
                    for file in self.job.Files:
                        # ignore inputs
                        if file.type == 'input':
                            continue
                        # skip pseudo datasets
                        if file.destinationDBlock in ['', None, 'NULL']:
                            continue
                        # start closer for output/log datasets
                        if not file.destinationDBlock in destDBList:
                            destDBList.append(file.destinationDBlock)
                        # collect GUIDs
                        if (self.job.prodSourceLabel == 'panda' or
                            (self.job.prodSourceLabel in ['ptest', 'rc_test', 'rucio_test'] and
                             self.job.processingType in ['pathena', 'prun', 'gangarobot-rctest', 'hammercloud'])) \
                           and file.type == 'output':
                            # extract base LFN since LFN was changed to full LFN for CMS
                            baseLFN = file.lfn.split('/')[-1]
                            guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                             'checksum': file.checksum, 'md5sum': file.md5sum,
                                             'fsize': file.fsize, 'scope': file.scope})
                    if guidList != []:
                        retG = self.taskBuffer.setGUIDs(guidList)
                    if destDBList != []:
                        # start Closer
                        if adderPlugin != None and hasattr(adderPlugin, 'datasetMap') and adderPlugin.datasetMap != {}:
                            cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                                 datasetMap=adderPlugin.datasetMap)
                        else:
                            cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                        self.logger.debug("start Closer")
                        cThr.start()
                        cThr.join()
                        self.logger.debug("end Closer")
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except:
            type, value, traceBack = sys.exc_info()
            self.logger.debug(": %s %s" % (type, value))
            self.logger.debug("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.debug(": %s %s" % (type, value))
                self.logger.debug("cannot unlock XML")

    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns = []
        guids = []
        fsizes = []
        md5sums = []
        chksums = []
        surls = []
        fullLfnMap = {}
        nEventsMap = {}
        try:
            root = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # check if file exists
            if os.path.exists(self.xmlFile):
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                # set failed anyway
                self.job.jobStatus = 'failed'
                # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                if (self.job.pilotErrorCode in [0, '0', 'NULL']) and \
                   (self.job.transExitCode in [0, '0', 'NULL']):
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                return 2
            else:
                # XML was deleted
                return 1
        # parse metadata to get nEvents
        try:
            root = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user', 'panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in ['managed', 'test', 'rc_test', 'ptest']:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
                        # unset file status for ES jobs
                        pass
                    else:
                        file.status = 'failed'
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID = guids[i]
                    file.fsize = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot XML is inconsistent with filesTable"
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0
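# --- illustrative usage sketch (assumption, not part of the original module) ---
# An Adder-style caller instantiates one AdderGen per pilot file report, taking
# the PandaID, job status and attempt number from the report file name. The
# values below are hypothetical.
#
#   adder = AdderGen(taskBuffer, 4967290294, 'finished',
#                    '%s/4967290294_finished_0' % panda_config.logdir,
#                    siteMapper=siteMapper)
#   adder.run()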
class AdderGen:
    # constructor
    def __init__(self, taskBuffer, jobID, jobStatus, xmlFile, ignoreTmpError=True, siteMapper=None):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lockXML = None
        self.siteMapper = siteMapper
        self.attemptNr = None
        self.xmlFile = xmlFile
        self.datasetMap = {}
        self.extraInfo = {'surl': {}, 'nevents': {}, 'lbnr': {}, 'endpoint': {}}
        # extract attemptNr
        try:
            tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
            if re.search('^\d+$', tmpAttemptNr) != None:
                self.attemptNr = int(tmpAttemptNr)
        except:
            pass
        # logger
        self.logger = LogWrapper(_logger, str(self.jobID))

    # dump file report
    def dumpFileReport(self, fileCatalog, attemptNr):
        self.logger.debug("dump file report")
        # dump Catalog into file
        if attemptNr == None:
            xmlFile = '%s/%s_%s_%s' % (panda_config.logdir, self.jobID, self.jobStatus,
                                       str(uuid.uuid4()))
        else:
            xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir, self.jobID, self.jobStatus,
                                          str(uuid.uuid4()), attemptNr)
        file = open(xmlFile, 'w')
        file.write(fileCatalog)
        file.close()

    # get plugin class
    def getPluginClass(self, tmpVO):
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
        if adderPluginClass == None:
            # use ATLAS plugin by default
            from AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass

    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                                fromWaiting=False, forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                    if fileCheckInJEDI == None:
                        raise RuntimeError, 'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug("going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError, 'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))
                    if source and error_code:
                        try:
                            self.logger.debug("AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID,
                                                            source, error_code, error_diag,
                                                            self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                              % (e, traceback.format_exc()))
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs([self.job], False,
                                                      oldJobStatusList=[oldJobStatus],
                                                      extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False,
                                                           fromArchived=True, fromWaiting=False)[0]
                        self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                                          .format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode,
                                                  job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID,
                                                            source, error_code, error_diag,
                                                            job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                          % (e, traceback.format_exc()))
                # setup for closer
                if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                    destDBList = []
                    guidList = []
                    for file in self.job.Files:
                        # ignore inputs
                        if file.type == 'input':
                            continue
                        # skip pseudo datasets
                        if file.destinationDBlock in ['', None, 'NULL']:
                            continue
                        # start closer for output/log datasets
                        if not file.destinationDBlock in destDBList:
                            destDBList.append(file.destinationDBlock)
                        # collect GUIDs
                        if (self.job.prodSourceLabel == 'panda' or
                            (self.job.prodSourceLabel in ['ptest', 'rc_test', 'rucio_test'] and
                             self.job.processingType in ['pathena', 'prun', 'gangarobot-rctest', 'hammercloud'])) \
                           and file.type == 'output':
                            # extract base LFN since LFN was changed to full LFN for CMS
                            baseLFN = file.lfn.split('/')[-1]
                            guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                             'checksum': file.checksum, 'md5sum': file.md5sum,
                                             'fsize': file.fsize, 'scope': file.scope})
                    if guidList != []:
                        retG = self.taskBuffer.setGUIDs(guidList)
                    if destDBList != []:
                        # start Closer
                        if adderPlugin != None and hasattr(adderPlugin, 'datasetMap') and adderPlugin.datasetMap != {}:
                            cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                                 datasetMap=adderPlugin.datasetMap)
                        else:
                            cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                        self.logger.debug("start Closer")
                        cThr.start()
                        cThr.join()
                        self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,
                                                                                            self.job.PandaID,
                                                                                            destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs([assJobID], fromDefined=False,
                                                                  fromArchived=False,
                                                                  fromWaiting=False,
                                                                  forAnal=True)[0]
                                if self.job == None:
                                    self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID))
                                else:
                                    cThr = Closer.Closer(self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug("end Closer for PandaID={0}".format(assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type, value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                self.logger.error("cannot unlock XML")

    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        if self.job.Files == []:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns = []
        guids = []
        fsizes = []
        md5sums = []
        chksums = []
        surls = []
        fullLfnMap = {}
        nEventsMap = {}
        try:
            root = xml.dom.minidom.parse(self.xmlFile)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = long(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
                # error check
                if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 != None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN != None:
                    fullLfnMap[lfn] = fullLFN
        except:
            # parse json
            try:
                import json
                with open(self.xmlFile) as tmpF:
                    jsonDict = json.load(tmpF)
                for lfn, fileData in jsonDict.iteritems():
                    lfn = str(lfn)
                    fsize = None
                    md5sum = None
                    adler32 = None
                    surl = None
                    fullLFN = None
                    guid = str(fileData['guid'])
                    if 'fsize' in fileData:
                        fsize = long(fileData['fsize'])
                    if 'md5sum' in fileData:
                        md5sum = str(fileData['md5sum'])
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                            md5sum = None
                    if 'adler32' in fileData:
                        adler32 = str(fileData['adler32'])
                    if 'surl' in fileData:
                        surl = str(fileData['surl'])
                    if 'full_lfn' in fileData:
                        fullLFN = str(fileData['full_lfn'])
                    # endpoints
                    self.extraInfo['endpoint'][lfn] = []
                    if 'endpoint' in fileData:
                        self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                    # error check
                    if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                        if EventServiceUtils.isEventServiceMerge(self.job):
                            continue
                        else:
                            raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                    # append
                    lfns.append(lfn)
                    guids.append(guid)
                    fsizes.append(fsize)
                    md5sums.append(md5sum)
                    surls.append(surl)
                    if adler32 != None:
                        # use adler32 if available
                        chksums.append("ad:%s" % adler32)
                    else:
                        chksums.append("md5:%s" % md5sum)
                    if fullLFN != None:
                        fullLfnMap[lfn] = fullLFN
            except:
                # check if file exists
                if os.path.exists(self.xmlFile):
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0, '0', 'NULL']) and \
                       (self.job.transExitCode in [0, '0', 'NULL']):
self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML" return 2 else: # XML was deleted return 1 # parse metadata to get nEvents try: root = xml.dom.minidom.parseString(self.job.metadata) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata nevents = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'events': nevents = long(meta.getAttribute('att_value')) nEventsMap[lfn] = nevents break except: pass self.logger.debug('nEventsMap=%s' % str(nEventsMap)) # parse json try: import json jsonDict = json.loads(self.job.metadata) for jsonFileItem in jsonDict['files']['output']: for jsonSubFileItem in jsonFileItem['subFiles']: lfn = str(jsonSubFileItem['name']) try: nevents = long(jsonSubFileItem['nentries']) nEventsMap[lfn] = nevents except: pass except: pass self.logger.debug('nEventsMapJson=%s' % str(nEventsMap)) # get lumi block number lumiBlockNr = self.job.getLumiBlockNr() # copy files for variable number of outputs tmpStat = self.copyFilesForVariableNumOutputs(lfns) if not tmpStat: self.logger.error( "failed to copy files for variable number of outputs") return 2 # check files fileList = [] for file in self.job.Files: fileList.append(file.lfn) if file.type == 'input': if file.lfn in lfns: if self.job.prodSourceLabel in ['user', 'panda']: # skipped file file.status = 'skipped' elif self.job.prodSourceLabel in [ 'managed', 'test', 'rc_test', 'ptest' ]: # failed by pilot file.status = 'failed' elif file.type == 'output' or file.type == 'log': # add only log file for failed jobs if self.jobStatus == 'failed' and file.type != 'log': file.status = 'failed' continue # set failed if it is missing in XML if not file.lfn in lfns: if self.job.jobStatus == 'finished' and \ (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)): # unset file status for ES jobs pass elif file.isAllowedNoOutput(): # allowed not to be produced file.status = 'nooutput' self.logger.debug('set {0} to status={1}'.format( file.lfn, file.status)) else: file.status = 'failed' self.job.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format( file.lfn) self.logger.error(self.job.ddmErrorDiag) continue # look for GUID with LFN try: i = lfns.index(file.lfn) file.GUID = guids[i] file.fsize = fsizes[i] file.md5sum = md5sums[i] file.checksum = chksums[i] surl = surls[i] # status file.status = 'ready' # change to full LFN if fullLfnMap.has_key(file.lfn): file.lfn = fullLfnMap[file.lfn] # add SURL to extraInfo self.extraInfo['surl'][file.lfn] = surl # add nevents if nEventsMap.has_key(file.lfn): self.extraInfo['nevents'][file.lfn] = nEventsMap[ file.lfn] except: # status file.status = 'failed' type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type, value)) # set lumi block number if lumiBlockNr != None and file.status != 'failed': self.extraInfo['lbnr'][file.lfn] = lumiBlockNr # check consistency between XML and filesTable for lfn in lfns: if not lfn in fileList: self.logger.error("%s is not found in filesTable" % lfn) self.job.jobStatus = 'failed' for tmpFile in self.job.Files: tmpFile.status = 'failed' 
self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format( lfn) return 2 # return self.logger.debug("parseXML end") return 0 # copy files for variable number of outputs def copyFilesForVariableNumOutputs(self, lfns): # get original output files origOutputs = {} updateOrig = {} for tmpFile in self.job.Files: if tmpFile.type in ['output', 'log']: origOutputs[tmpFile.lfn] = tmpFile if tmpFile.lfn in lfns: # keep original updateOrig[tmpFile.lfn] = False else: # overwrite original updateOrig[tmpFile.lfn] = True # look for unkown files addedNewFiles = False for newLFN in lfns: if not newLFN in origOutputs: # look for corresponding original output for origLFN in origOutputs.keys(): tmpPatt = '^{0}\.*_\d+$'.format(origLFN) if re.search(tmpPatt, newLFN) != None: # copy file record tmpStat = self.taskBuffer.copyFileRecord( newLFN, origOutputs[origLFN], updateOrig[origLFN]) if not tmpStat: return False addedNewFiles = True # disable further overwriting updateOrig[origLFN] = False break # refresh job info if addedNewFiles: self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False, fromWaiting=False, forAnal=True)[0] # return return True
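As a side note, a minimal standalone sketch of the LFN matching used by copyFilesForVariableNumOutputs above; the helper name and the sample LFNs are illustrative, and, as in the original pattern, any dots inside the original LFN act as regex wildcards.

import re

def matches_numbered_output(orig_lfn, new_lfn):
    # same test as copyFilesForVariableNumOutputs: the new LFN must start with
    # the original LFN and end with _<digits>
    tmp_patt = r'^{0}\.*_\d+$'.format(orig_lfn)
    return re.search(tmp_patt, new_lfn) is not None

if __name__ == '__main__':
    # hypothetical LFNs, for illustration only
    assert matches_numbered_output('myOut.pool.root', 'myOut.pool.root_003')
    assert not matches_numbered_output('myOut.pool.root', 'otherOut.pool.root_003')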
import datetime
import traceback

from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper
from taskbuffer import ErrorCode

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)
# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)
# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
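A minimal sketch of the bind-variable map that would accompany the sqlEsJobs query above, in the same style used elsewhere in this code (for example getGUIDsFromEventIndex); the concrete values for the labels, eventService flag, priority cut and job status are assumptions, not taken from the original script, and cur stands for whatever DB-API cursor the task buffer provides.

def fetch_low_prio_es_jobs(cur, sql):
    # bind variables matching the placeholders in sqlEsJobs; the values below
    # are assumptions for illustration, not taken from the original script
    varMap = {
        ':label1': 'managed',   # assumed production source labels
        ':label2': 'test',
        ':es': 1,               # assumed eventService flag value
        ':prio': 800,           # assumed priority threshold
        ':jobStat': 'running',  # assumed job status
    }
    cur.execute(sql, varMap)
    return cur.fetchall()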
def application(environ, start_response): # get method name methodName = '' if environ.has_key('SCRIPT_NAME'): methodName = environ['SCRIPT_NAME'].split('/')[-1] tmpLog = LogWrapper(_logger, "PID={0} {1}".format(os.getpid(), methodName)) tmpLog.debug("start") regStart = datetime.datetime.utcnow() retType = None # check method name if not methodName in allowedMethods: tmpLog.error("is forbidden") exeRes = "False : %s is forbidden" % methodName else: # get method object tmpMethod = None try: exec "tmpMethod = %s" % methodName except: pass # object not found if tmpMethod == None: tmpLog.error("is undefined") exeRes = "False" else: try: # get params tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ, keep_blank_values=1) # convert to map params = {} for tmpKey in tmpPars.keys(): if tmpPars[tmpKey].file != None and tmpPars[ tmpKey].filename != None: # file params[tmpKey] = tmpPars[tmpKey] else: # string params[tmpKey] = tmpPars.getfirst(tmpKey) if panda_config.entryVerbose: tmpLog.debug("with %s" % str(params.keys())) # dummy request object dummyReq = DummyReq(environ, tmpLog) # exec exeRes = apply(tmpMethod, [dummyReq], params) # extract return type if type(exeRes) == types.DictType: retType = exeRes['type'] exeRes = exeRes['content'] # convert bool to string if exeRes in [True, False]: exeRes = str(exeRes) except Exception as e: tmpLog.error("execution failure : {0}".format(str(e))) errStr = "" for tmpKey, tmpVal in environ.iteritems(): errStr += "%s : %s\n" % (tmpKey, str(tmpVal)) tmpLog.error(errStr) # return internal server error start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')]) return [str(e)] if panda_config.entryVerbose: tmpLog.debug("done") regTime = datetime.datetime.utcnow() - regStart tmpLog.info( "exec_time=%s.%03d sec, return len=%s B" % (regTime.seconds, regTime.microseconds / 1000, len(str(exeRes)))) # return if exeRes == taskbuffer.ErrorCode.EC_NotFound: start_response('404 Not Found', [('Content-Type', 'text/plain')]) return ['not found'] elif isinstance(exeRes, taskbuffer.ErrorCode.EC_Redirect): start_response('302 Redirect', [('Location', exeRes.url)]) return ['redirect'] else: if retType == 'json': start_response('200 OK', [('Content-Type', 'application/json')]) else: start_response('200 OK', [('Content-Type', 'text/plain')]) return [exeRes]
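A stripped-down sketch of the same dispatch pattern: the last component of SCRIPT_NAME is checked against a whitelist and mapped to a handler. The isAlive handler and the whitelist are made up for illustration, and the lookup uses globals() instead of the exec call used in the real entry point.

def isAlive():
    # trivial handler used only for this illustration
    return 'alive=yes'

allowedMethods = ['isAlive']

def application(environ, start_response):
    # dispatch on the last component of SCRIPT_NAME, as in the entry point above
    methodName = environ.get('SCRIPT_NAME', '').split('/')[-1]
    if methodName not in allowedMethods:
        start_response('403 Forbidden', [('Content-Type', 'text/plain')])
        return [b'forbidden']
    exeRes = globals()[methodName]()
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [exeRes.encode('utf-8')]

if __name__ == '__main__':
    from wsgiref.simple_server import make_server
    make_server('localhost', 8080, application).serve_forever()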
def getGUIDsFromEventIndex(self, runEventList, streamName, amiTags, dataType): comment = ' /* DBProxy.getGUIDsFromEventIndex */' methodName = comment.split(' ')[-2].split('.')[-1] tmpLog = LogWrapper( _logger, methodName + " <streamName={0} amiTags={1} dataType={2}>".format( streamName, amiTags, dataType)) try: # change to list if not amiTags in [None, '']: amiTags = amiTags.replace('*', '.*').split(',') tmpLog.debug("start for {0} events".format(len(runEventList))) # check data type if not dataType in ['RAW', 'ESD', 'AOD']: return False, 'dataType={0} is unsupported'.format(dataType) # sql to insert runs and events sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format( panda_config.schemaEI) sqlRE += "VALUES (:runNumber,:eventNumber) " varMaps = [] for runNumber, eventNumber in runEventList: varMap = {} varMap[':runNumber'] = runNumber varMap[':eventNumber'] = eventNumber varMaps.append(varMap) # begin transaction self.conn.begin() self.cur.arraysize = 100000 # insert runs and events self.cur.executemany(sqlRE + comment, varMaps) # read GUIDs varMap = {} if amiTags in [None, '']: sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format( dataType) sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format( panda_config.schemaEI) else: sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format( dataType) sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format( panda_config.schemaEI) if not streamName in [None, '']: sqlRG += "WHERE streamName=:streamName " varMap[':streamName'] = streamName self.cur.execute(sqlRG + comment, varMap) resRG = self.cur.fetchall() # commit if not self._commit(): raise RuntimeError, 'Commit error' retValue = {} keyAmiIdxMap = {} for tmpItem in resRG: if amiTags in [None, '']: runNumber, eventNumber, guid = tmpItem # dummy idxTag = 0 else: runNumber, eventNumber, guid, amiTag = tmpItem # get index number for the AMI tag in the list idxTag = self.getIndexAmiTag(amiTags, amiTag) # didn't match if idxTag == None: continue tmpKey = (runNumber, eventNumber) # use AMI tag in a preference orde if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag: continue keyAmiIdxMap[tmpKey] = idxTag retValue[tmpKey] = [guid] tmpLog.debug("found {0} events".format(len(retValue))) return True, retValue except: # roll back self._rollback() # error self.dumpErrorMessage(_logger, methodName) return False, None
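A self-contained sketch of the AMI-tag preference ordering applied above. The real getIndexAmiTag is not shown in this snippet, so the helper below is a guess at its behaviour (index of the first matching pattern), and the sample rows are invented.

import re

def get_index_ami_tag(ami_tags, ami_tag):
    # stand-in for DBProxy.getIndexAmiTag: position of the first pattern in
    # ami_tags that matches ami_tag, or None if nothing matches
    for idx, patt in enumerate(ami_tags):
        if re.search('^' + patt + '$', ami_tag) is not None:
            return idx
    return None

# amiTags comes from a comma-separated string with '*' -> '.*', as above
amiTags = 'f123_m456,r*_p*'.replace('*', '.*').split(',')
# invented rows in the shape (runNumber, eventNumber, guid, amiTag)
rows = [(1234, 1, 'GUID-A', 'r987_p100'),
        (1234, 1, 'GUID-B', 'f123_m456')]
keyAmiIdxMap = {}
retValue = {}
for runNumber, eventNumber, guid, amiTag in rows:
    idxTag = get_index_ami_tag(amiTags, amiTag)
    if idxTag is None:
        continue
    tmpKey = (runNumber, eventNumber)
    # keep the GUID whose AMI tag comes earliest in the preference order
    if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
        continue
    keyAmiIdxMap[tmpKey] = idxTag
    retValue[tmpKey] = [guid]
assert retValue[(1234, 1)] == ['GUID-B']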
class EventPicker: # constructor def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError): self.taskBuffer = taskBuffer self.siteMapper = siteMapper self.ignoreError = ignoreError self.evpFileName = evpFileName self.token = datetime.datetime.utcnow().isoformat(' ') # logger self.logger = LogWrapper(_logger,self.token) self.pd2p = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper, token=' ',logger=self.logger) self.userDatasetName = '' self.creationTime = '' self.params = '' self.lockedBy = '' self.evpFile = None self.userTaskName = '' # message buffer self.msgBuffer = [] self.lineLimit = 100 # JEDI self.jediTaskID = None # main def run(self): try: self.putLog('start %s' % self.evpFileName) # lock evp file self.evpFile = open(self.evpFileName) try: fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB) except: # relase self.putLog("cannot lock %s" % self.evpFileName) self.evpFile.close() return True # options runEvtList = [] eventPickDataType = '' eventPickStreamName = '' eventPickDS = [] eventPickAmiTag = '' eventPickNumSites = 1 inputFileList = [] tagDsList = [] tagQuery = '' tagStreamRef = '' skipDaTRI = False runEvtGuidMap = {} ei_api = '' # read evp file for tmpLine in self.evpFile: tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine) # check format if tmpMatch == None: continue tmpItems = tmpMatch.groups() if tmpItems[0] == 'runEvent': # get run and event number tmpRunEvt = tmpItems[1].split(',') if len(tmpRunEvt) == 2: runEvtList.append(tmpRunEvt) elif tmpItems[0] == 'eventPickDataType': # data type eventPickDataType = tmpItems[1] elif tmpItems[0] == 'eventPickStreamName': # stream name eventPickStreamName = tmpItems[1] elif tmpItems[0] == 'eventPickDS': # dataset pattern eventPickDS = tmpItems[1].split(',') elif tmpItems[0] == 'eventPickAmiTag': # AMI tag eventPickAmiTag = tmpItems[1] elif tmpItems[0] == 'eventPickNumSites': # the number of sites where datasets are distributed try: eventPickNumSites = int(tmpItems[1]) except: pass elif tmpItems[0] == 'userName': # user name self.userDN = tmpItems[1] self.putLog("user=%s" % self.userDN) elif tmpItems[0] == 'userTaskName': # user task name self.userTaskName = tmpItems[1] elif tmpItems[0] == 'userDatasetName': # user dataset name self.userDatasetName = tmpItems[1] elif tmpItems[0] == 'lockedBy': # client name self.lockedBy = tmpItems[1] elif tmpItems[0] == 'creationTime': # creation time self.creationTime = tmpItems[1] elif tmpItems[0] == 'params': # parameters self.params = tmpItems[1] elif tmpItems[0] == 'ei_api': # ei api parameter for MC ei_api = tmpItems[1] elif tmpItems[0] == 'inputFileList': # input file list inputFileList = tmpItems[1].split(',') try: inputFileList.remove('') except: pass elif tmpItems[0] == 'tagDS': # TAG dataset tagDsList = tmpItems[1].split(',') elif tmpItems[0] == 'tagQuery': # query for TAG tagQuery = tmpItems[1] elif tmpItems[0] == 'tagStreamRef': # StreamRef for TAG tagStreamRef = tmpItems[1] if not tagStreamRef.endswith('_ref'): tagStreamRef += '_ref' elif tmpItems[0] == 'runEvtGuidMap': # GUIDs try: exec "runEvtGuidMap="+tmpItems[1] except: pass # extract task name if self.userTaskName == '' and self.params != '': try: tmpMatch = re.search('--outDS(=| ) *([^ ]+)',self.params) if tmpMatch != None: self.userTaskName = tmpMatch.group(2) if not self.userTaskName.endswith('/'): self.userTaskName += '/' except: pass # suppress DaTRI if self.params != '': if '--eventPickSkipDaTRI' in self.params: skipDaTRI = True # get compact user name compactDN = 
self.taskBuffer.cleanUserID(self.userDN) # get jediTaskID self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(compactDN,self.userTaskName) # convert if tagDsList == [] or tagQuery == '': # convert run/event list to dataset/file list tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList, eventPickDataType, eventPickStreamName, eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap, ei_api ) if not tmpRet: if 'isFatal' in locationMap and locationMap['isFatal'] == True: self.ignoreError = False self.endWithError('Failed to convert the run/event list to a dataset/file list') return False else: # get parent dataset/files with TAG tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef) if not tmpRet: self.endWithError('Failed to get parent dataset/file list with TAG') return False # use only files in the list if inputFileList != []: tmpAllFiles = [] for tmpFile in allFiles: if tmpFile['lfn'] in inputFileList: tmpAllFiles.append(tmpFile) allFiles = tmpAllFiles # remove redundant CN from DN tmpDN = self.userDN tmpDN = re.sub('/CN=limited proxy','',tmpDN) tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) # make dataset container tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles, locationMap, nSites=eventPickNumSites, owner=tmpDN) if not tmpRet: self.endWithError('Failed to make a dataset container %s' % self.userDatasetName) return False # skip DaTRI if skipDaTRI: # successfully terminated self.putLog("skip DaTRI") # update task self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID) else: # get candidates tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False, useHidden=True) if not tmpRet: self.endWithError('Failed to find candidate for destination') return False # collect all candidates allCandidates = [] for tmpDS,tmpDsVal in candidateMaps.iteritems(): for tmpCloud,tmpCloudVal in tmpDsVal.iteritems(): for tmpSiteName in tmpCloudVal[0]: if not tmpSiteName in allCandidates: allCandidates.append(tmpSiteName) if allCandidates == []: self.endWithError('No candidate for destination') return False # get list of dataset (container) names if eventPickNumSites > 1: # decompose container to transfer datasets separately tmpRet,tmpOut = self.pd2p.getListDatasetReplicasInContainer(self.userDatasetName) if not tmpRet: self.endWithError('Failed to get the size of %s' % self.userDatasetName) return False userDatasetNameList = tmpOut.keys() else: # transfer container at once userDatasetNameList = [self.userDatasetName] # loop over all datasets sitesUsed = [] for tmpUserDatasetName in userDatasetNameList: # get size of dataset container tmpRet,totalInputSize = rucioAPI.getDatasetSize(tmpUserDatasetName) if not tmpRet: self.endWithError('Failed to get the size of %s' % tmpUserDatasetName) return False # run brokerage tmpJob = JobSpec() tmpJob.AtlasRelease = '' self.putLog("run brokerage for %s" % tmpDS) brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates, True,datasetSize=totalInputSize) if tmpJob.computingSite.startswith('ERROR'): self.endWithError('brokerage failed with %s' % tmpJob.computingSite) return False self.putLog("site -> %s" % tmpJob.computingSite) # send transfer request try: tmpDN = rucioAPI.parse_dn(tmpDN) tmpStatus,userInfo = rucioAPI.finger(tmpDN) if not tmpStatus: raise RuntimeError,'user info not found for {0} with {1}'.format(tmpDN,userInfo) tmpDN = userInfo['nickname'] tmpDQ2ID = 
self.siteMapper.getSite(tmpJob.computingSite).ddm_input tmpMsg = "%s ds=%s site=%s id=%s" % ('registerDatasetLocation for DaTRI ', tmpUserDatasetName, tmpDQ2ID, tmpDN) self.putLog(tmpMsg) rucioAPI.registerDatasetLocation(tmpDS,[tmpDQ2ID],lifetime=14,owner=tmpDN, activity="User Subscriptions") self.putLog('OK') except: errType,errValue = sys.exc_info()[:2] tmpStr = 'Failed to send transfer request : %s %s' % (errType,errValue) tmpStr.strip() tmpStr += traceback.format_exc() self.endWithError(tmpStr) return False # list of sites already used sitesUsed.append(tmpJob.computingSite) self.putLog("used %s sites" % len(sitesUsed)) # set candidates if len(sitesUsed) >= eventPickNumSites: # reset candidates to limit the number of sites allCandidates = sitesUsed sitesUsed = [] else: # remove site allCandidates.remove(tmpJob.computingSite) # send email notification for success tmpMsg = 'A transfer request was successfully sent to Rucio.\n' tmpMsg += 'Your task will get started once transfer is completed.' self.sendEmail(True,tmpMsg) try: # unlock and delete evp file fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) self.evpFile.close() os.remove(self.evpFileName) except: pass # successfully terminated self.putLog("end %s" % self.evpFileName) return True except: errType,errValue = sys.exc_info()[:2] self.endWithError('Got exception %s:%s %s' % (errType,errValue,traceback.format_exc())) return False # end with error def endWithError(self,message): self.putLog(message,'error') # unlock evp file try: fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) self.evpFile.close() if not self.ignoreError: # remove evp file os.remove(self.evpFileName) # send email notification self.sendEmail(False,message) except: pass # upload log if self.jediTaskID != None: outLog = self.uploadLog() self.taskBuffer.updateTaskErrorDialogJEDI(self.jediTaskID,'event picking failed. '+outLog) # update task if not self.ignoreError: self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,'tobroken') self.putLog(outLog) self.putLog('end %s' % self.evpFileName) # put log def putLog(self,msg,type='debug'): tmpMsg = msg if type == 'error': self.logger.error(tmpMsg) else: self.logger.debug(tmpMsg) # send email notification def sendEmail(self,isSucceeded,message): # mail address toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN) if toAdder == '': self.putLog('cannot find email address for %s' % self.userDN,'error') return # subject mailSubject = "PANDA notification for Event-Picking Request" # message mailBody = "Hello,\n\nHere is your request status for event picking\n\n" if isSucceeded: mailBody += "Status : Passed to Rucio\n" else: mailBody += "Status : Failed\n" mailBody += "Created : %s\n" % self.creationTime mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') mailBody += "Dataset : %s\n" % self.userDatasetName mailBody += "\n" mailBody += "Parameters : %s %s\n" % (self.lockedBy,self.params) mailBody += "\n" mailBody += "%s\n" % message # send retVal = MailUtils().send(toAdder,mailSubject,mailBody) # return return # upload log def uploadLog(self): if self.jediTaskID == None: return 'cannot find jediTaskID' strMsg = self.logger.dumpToString() s,o = Client.uploadLog(strMsg,self.jediTaskID) if s != 0: return "failed to upload log with {0}.".format(s) if o.startswith('http'): return '<a href="{0}">log</a>'.format(o) return o
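A minimal sketch of the key=value option parsing done at the top of EventPicker.run; only two of the keys are handled and the sample lines are hypothetical.

import re

def parse_evp_lines(lines):
    # option parsing as at the top of EventPicker.run; only two keys are
    # handled here to keep the sketch short
    options = {'runEvtList': [], 'eventPickDS': []}
    for tmpLine in lines:
        tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
        if tmpMatch is None:
            continue
        key, val = tmpMatch.groups()
        if key == 'runEvent':
            tmpRunEvt = val.split(',')
            if len(tmpRunEvt) == 2:
                options['runEvtList'].append(tmpRunEvt)
        elif key == 'eventPickDS':
            options['eventPickDS'] = val.split(',')
    return options

# hypothetical evp file content
sample = ['runEvent=281411,1234567', 'eventPickDS=data15_13TeV.*']
assert parse_evp_lines(sample)['runEvtList'] == [['281411', '1234567']]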
def run(self): try: # make a message instance tmpLog = LogWrapper(_logger, None) # run main procedure in the same process if not self.forkRun: tmpLog.debug('main start') tmpLog.debug('firstSubmission={0}'.format( self.firstSubmission)) # group jobs per VO voJobsMap = {} ddmFreeJobs = [] tmpLog.debug('{0} jobs in total'.format(len(self.jobs))) for tmpJob in self.jobs: # set VO=local for DDM free if tmpJob.destinationSE == 'local': tmpVO = 'local' else: tmpVO = tmpJob.VO # make map if not voJobsMap.has_key(tmpVO): voJobsMap[tmpVO] = [] voJobsMap[tmpVO].append(tmpJob) # loop over all VOs for tmpVO, tmpJobList in voJobsMap.iteritems(): tmpLog.debug('vo={0} has {1} jobs'.format( tmpVO, len(tmpJobList))) # get plugin setupperPluginClass = panda_config.getPlugin( 'setupper_plugins', tmpVO) if setupperPluginClass == None: # use ATLAS plug-in by default from SetupperAtlasPlugin import SetupperAtlasPlugin setupperPluginClass = SetupperAtlasPlugin tmpLog.debug('plugin name -> {0}'.format( setupperPluginClass.__name__)) try: # make plugin setupperPlugin = setupperPluginClass( self.taskBuffer, self.jobs, tmpLog, resubmit=self.resubmit, pandaDDM=self.pandaDDM, ddmAttempt=self.ddmAttempt, onlyTA=self.onlyTA, firstSubmission=self.firstSubmission) # run plugin tmpLog.debug('run plugin') setupperPlugin.run() # go forward if not TA if not self.onlyTA: # update jobs tmpLog.debug('update jobs') self.updateJobs( setupperPlugin.jobs + setupperPlugin.jumboJobs, tmpLog) # execute post process tmpLog.debug('post execute plugin') setupperPlugin.postRun() tmpLog.debug('done plugin') except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('plugin failed with {0}:{1}'.format( errtype, errvalue)) tmpLog.debug('main end') else: tmpLog.debug('fork start') # write jobs to file import os import cPickle as pickle outFileName = '%s/set.%s_%s' % (panda_config.logdir, self.jobs[0].PandaID, commands.getoutput('uuidgen')) outFile = open(outFileName, 'w') pickle.dump(self.jobs, outFile) outFile.close() # run main procedure in another process because python doesn't release memory com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % ( panda_config.home_dir_cwd, panda_config.home_dir_cwd) com += 'source %s; ' % panda_config.glite_source com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \ (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, panda_config.pandaPython_dir,outFileName) if self.onlyTA: com += " -t" if not self.firstSubmission: com += " -f" tmpLog.debug(com) # exeute status, output = self.taskBuffer.processLimiter.getstatusoutput( com) tmpLog.debug("return from main process: %s %s" % (status, output)) tmpLog.debug('fork end') except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('master failed with {0}:{1}'.format( errtype, errvalue))
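A small sketch of the per-VO plugin lookup with an ATLAS default used above; the registry dict stands in for panda_config.getPlugin and the plugin classes are empty stubs.

class SetupperAtlasPluginStub(object):
    # empty stand-in for SetupperAtlasPlugin
    pass

class SetupperOtherPluginStub(object):
    # empty stand-in for some VO-specific plugin
    pass

# hypothetical registry standing in for panda_config.getPlugin('setupper_plugins', vo)
setupper_plugins = {'othervo': SetupperOtherPluginStub}

def get_setupper_plugin_class(vo):
    plugin_class = setupper_plugins.get(vo)
    if plugin_class is None:
        # use the ATLAS plug-in by default, as Setupper.run does
        plugin_class = SetupperAtlasPluginStub
    return plugin_class

assert get_setupper_plugin_class('local') is SetupperAtlasPluginStub
assert get_setupper_plugin_class('othervo') is SetupperOtherPluginStub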
def uploadLog(req,file): if not Protocol.isSecure(req): return False if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: return False tmpLog = LogWrapper(_logger,'uploadLog <{0}>'.format(file.filename)) tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN'])) # size check sizeLimit = 100*1024*1024 # get file size contentLength = 0 try: contentLength = long(req.headers_in["content-length"]) except: if req.headers_in.has_key("content-length"): tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"]) else: tmpLog.error("no CL") tmpLog.debug("size %s" % contentLength) if contentLength > sizeLimit: errStr = "failed to upload log due to size limit" tmpLog.error(errStr) tmpLog.debug("end") return errStr jediLogDir = '/jedilog' retStr = '' try: fileBaseName = file.filename.split('/')[-1] fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir,jediLogDir,fileBaseName) # delete old file if os.path.exists(fileFullPath): os.remove(fileFullPath) # write fo = open(fileFullPath,'wb') fileContent = file.file.read() fo.write(fileContent) fo.close() tmpLog.debug("written to {0}".format(fileFullPath)) retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),jediLogDir,fileBaseName) except: errtype,errvalue = sys.exc_info()[:2] errStr = "failed to write log with {0}:{1}".format(errtype.__name__,errvalue) tmpLog.error(errStr) tmpLog.debug("end") return errStr tmpLog.debug("end") return retStr
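A minimal sketch of the size guard at the top of uploadLog; the helper name is mine and int replaces the Python 2 long used in the original.

def check_upload_size(headers_in, size_limit=100 * 1024 * 1024):
    # size guard as in uploadLog; headers_in is any dict-like object that may
    # carry a "content-length" entry
    try:
        content_length = int(headers_in["content-length"])
    except (KeyError, ValueError):
        content_length = 0
    if content_length > size_limit:
        return False, "failed to upload log due to size limit"
    return True, content_length

assert check_upload_size({"content-length": "1024"}) == (True, 1024)
assert check_upload_size({"content-length": str(200 * 1024 * 1024)})[0] is False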
def updateJob(req, jobId, state, token=None, transExitCode=None, pilotErrorCode=None, pilotErrorDiag=None, timestamp=None, timeout=60, xml='', node=None, workdir=None, cpuConsumptionTime=None, cpuConsumptionUnit=None, remainingSpace=None, schedulerID=None, pilotID=None, siteName=None, messageLevel=None, pilotLog='', metaData='', cpuConversionFactor=None, exeErrorCode=None, exeErrorDiag=None, pilotTiming=None, computingElement=None, startTime=None, endTime=None, nEvents=None, nInputFiles=None, batchID=None, attemptNr=None, jobMetrics=None, stdout='', jobSubStatus=None, coreCount=None, maxRSS=None, maxVMEM=None, maxSWAP=None, maxPSS=None, avgRSS=None, avgVMEM=None, avgSWAP=None, avgPSS=None, totRCHAR=None, totWCHAR=None, totRBYTES=None, totWBYTES=None, rateRCHAR=None, rateWCHAR=None, rateRBYTES=None, rateWBYTES=None): tmpLog = LogWrapper( _logger, 'updateJob PandaID={0} PID={1}'.format(jobId, os.getpid())) tmpLog.debug('start') # get DN realDN = _getDN(req) # get FQANs fqans = _getFQAN(req) # check production role prodManager = _checkRole(fqans, realDN, jobDispatcher, site=siteName, hostname=req.get_remote_host()) # check token validToken = _checkToken(token, jobDispatcher) # accept json acceptJson = req.acceptJson() _logger.debug( "updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,jobSubStatus:%s,core:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s,maxRSS=%s,maxVMEM=%s,maxSWAP=%s,maxPSS=%s,avgRSS=%s,avgVMEM=%s,avgSWAP=%s,avgPSS=%s,totRCHAR=%s,totWCHAR=%s,totRBYTES=%s,totWBYTES=%s,rateRCHAR=%s,rateWCHAR=%s,rateRBYTES=%s,rateWBYTES=%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)" % (jobId, state, transExitCode, pilotErrorCode, pilotErrorDiag, node, workdir, cpuConsumptionTime, cpuConsumptionUnit, remainingSpace, schedulerID, pilotID, siteName, messageLevel, nEvents, nInputFiles, cpuConversionFactor, exeErrorCode, exeErrorDiag, pilotTiming, computingElement, startTime, endTime, batchID, attemptNr, jobSubStatus, coreCount, realDN, prodManager, token, validToken, str(fqans), maxRSS, maxVMEM, maxSWAP, maxPSS, avgRSS, avgVMEM, avgSWAP, avgPSS, totRCHAR, totWCHAR, totRBYTES, totWBYTES, rateRCHAR, rateWCHAR, rateRBYTES, rateWBYTES, xml, pilotLog[:1024], metaData[:1024], jobMetrics, stdout)) _pilotReqLogger.debug('method=updateJob,site=%s,node=%s,type=None' % (siteName, node)) # invalid role if not prodManager: _logger.warning("updateJob(%s) : invalid role" % jobId) if acceptJson: tmpMsg = 'no production/pilot role in VOMS FQANs or non pilot owner' else: tmpMsg = None return Protocol.Response(Protocol.SC_Role, tmpMsg).encode(acceptJson) # invalid token if not validToken: _logger.warning("updateJob(%s) : invalid token" % jobId) return Protocol.Response(Protocol.SC_Invalid).encode(acceptJson) # aborting message if jobId == 'NULL': return Protocol.Response(Protocol.SC_Success).encode(acceptJson) # check status if not state in [ 'running', 'failed', 'finished', 'holding', 'starting', 'transferring' ]: _logger.warning("invalid state=%s for updateJob" % state) return Protocol.Response(Protocol.SC_Success).encode(acceptJson) # create parameter map param = {} if cpuConsumptionTime != None: param['cpuConsumptionTime'] = cpuConsumptionTime if cpuConsumptionUnit != None: param['cpuConsumptionUnit'] = cpuConsumptionUnit if node != None: param['modificationHost'] = node[:128] if transExitCode != None: param['transExitCode'] = transExitCode if pilotErrorCode != None: param['pilotErrorCode'] = pilotErrorCode if pilotErrorDiag != None: 
param['pilotErrorDiag'] = pilotErrorDiag[:500] if jobMetrics != None: param['jobMetrics'] = jobMetrics[:500] if schedulerID != None: param['schedulerID'] = schedulerID if pilotID != None: param['pilotID'] = pilotID[:200] if batchID != None: param['batchID'] = batchID[:80] if exeErrorCode != None: param['exeErrorCode'] = exeErrorCode if exeErrorDiag != None: param['exeErrorDiag'] = exeErrorDiag[:500] if cpuConversionFactor != None: param['cpuConversion'] = cpuConversionFactor if pilotTiming != None: param['pilotTiming'] = pilotTiming if computingElement != None: param['computingElement'] = computingElement if nEvents != None: param['nEvents'] = nEvents if nInputFiles != None: param['nInputFiles'] = nInputFiles if not jobSubStatus in [None, '']: param['jobSubStatus'] = jobSubStatus if not coreCount in [None, '']: param['actualCoreCount'] = coreCount if maxRSS != None: param['maxRSS'] = maxRSS if maxVMEM != None: param['maxVMEM'] = maxVMEM if maxSWAP != None: param['maxSWAP'] = maxSWAP if maxPSS != None: param['maxPSS'] = maxPSS if avgRSS != None: param['avgRSS'] = avgRSS if avgVMEM != None: param['avgVMEM'] = avgVMEM if avgSWAP != None: param['avgSWAP'] = avgSWAP if avgPSS != None: param['avgPSS'] = avgPSS if totRCHAR is not None: totRCHAR = int(totRCHAR) / 1024 # convert to kByte totRCHAR = min(10**10 - 1, totRCHAR) # limit to 10 digit param['totRCHAR'] = totRCHAR if totWCHAR is not None: totWCHAR = int(totWCHAR) / 1024 # convert to kByte totWCHAR = min(10**10 - 1, totWCHAR) # limit to 10 digit param['totWCHAR'] = totWCHAR if totRBYTES is not None: totRBYTES = int(totRBYTES) / 1024 # convert to kByte totRBYTES = min(10**10 - 1, totRBYTES) # limit to 10 digit param['totRBYTES'] = totRBYTES if totWBYTES is not None: totWBYTES = int(totWBYTES) / 1024 # convert to kByte totWBYTES = min(10**10 - 1, totWBYTES) # limit to 10 digit param['totWBYTES'] = totWBYTES if rateRCHAR is not None: rateRCHAR = min(10**10 - 1, int(rateRCHAR)) # limit to 10 digit param['rateRCHAR'] = rateRCHAR if rateWCHAR is not None: rateWCHAR = min(10**10 - 1, int(rateWCHAR)) # limit to 10 digit param['rateWCHAR'] = rateWCHAR if rateRBYTES is not None: rateRBYTES = min(10**10 - 1, int(rateRBYTES)) # limit to 10 digit param['rateRBYTES'] = rateRBYTES if rateWBYTES is not None: rateWBYTES = min(10**10 - 1, int(rateWBYTES)) # limit to 10 digit param['rateWBYTES'] = rateWBYTES if startTime != None: try: param['startTime'] = datetime.datetime( *time.strptime(startTime, '%Y-%m-%d %H:%M:%S')[:6]) except: pass if endTime != None: try: param['endTime'] = datetime.datetime( *time.strptime(endTime, '%Y-%m-%d %H:%M:%S')[:6]) except: pass if attemptNr != None: try: attemptNr = int(attemptNr) except: attemptNr = None if stdout != '': stdout = stdout[:2048] # invoke JD tmpLog.debug('executing') return jobDispatcher.updateJob(int(jobId), state, int(timeout), xml, siteName, param, metaData, pilotLog, attemptNr, stdout, acceptJson)
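A small sketch of the normalisation updateJob applies to the I/O counters (totRCHAR, totWCHAR and friends): convert bytes to kByte and clamp to ten digits; the helper name is mine.

def clamp_io_counter(value_bytes):
    # normalisation applied to totRCHAR/totWCHAR/totRBYTES/totWBYTES above:
    # convert bytes to kByte and cap at 10 digits so it fits the DB column
    value_kb = int(value_bytes) // 1024
    return min(10 ** 10 - 1, value_kb)

assert clamp_io_counter(2048) == 2
assert clamp_io_counter(10 ** 20) == 10 ** 10 - 1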
def uploadLog(req, file): if not Protocol.isSecure(req): return False if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: return False tmpLog = LogWrapper(_logger, 'uploadLog <{0}>'.format(file.filename)) tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN'])) # size check sizeLimit = 100 * 1024 * 1024 # get file size contentLength = 0 try: contentLength = long(req.headers_in["content-length"]) except: if req.headers_in.has_key("content-length"): tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"]) else: tmpLog.error("no CL") tmpLog.debug("size %s" % contentLength) if contentLength > sizeLimit: errStr = "failed to upload log due to size limit" tmpLog.error(errStr) tmpLog.debug("end") return errStr jediLogDir = '/jedilog' retStr = '' try: fileBaseName = file.filename.split('/')[-1] fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir, jediLogDir, fileBaseName) # delete old file if os.path.exists(fileFullPath): os.remove(fileFullPath) # write fo = open(fileFullPath, 'wb') fileContent = file.file.read() fo.write(fileContent) fo.close() tmpLog.debug("written to {0}".format(fileFullPath)) retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None), jediLogDir, fileBaseName) except: errtype, errvalue = sys.exc_info()[:2] errStr = "failed to write log with {0}:{1}".format( errtype.__name__, errvalue) tmpLog.error(errStr) tmpLog.debug("end") return errStr tmpLog.debug("end") return retStr
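For completeness, a sketch of the write-and-return-URL part of uploadLog as a standalone function; the function name and arguments are mine, and the caller is assumed to supply the cache directory, base file name, raw content and server host.

import os

def store_jedi_log(cache_dir, file_base_name, file_content, server_http):
    # overwrite any old copy under <cache_dir>/jedilog and hand back the http
    # URL of the cached file, mirroring the body of uploadLog above
    jedi_log_dir = '/jedilog'
    file_full_path = '{0}{1}/{2}'.format(cache_dir, jedi_log_dir, file_base_name)
    if os.path.exists(file_full_path):
        os.remove(file_full_path)
    with open(file_full_path, 'wb') as fo:
        fo.write(file_content)
    return 'http://{0}/cache{1}/{2}'.format(server_http, jedi_log_dir, file_base_name)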
class AdderGen: # constructor def __init__(self,taskBuffer,jobID,jobStatus,xmlFile,ignoreTmpError=True,siteMapper=None): self.job = None self.jobID = jobID self.jobStatus = jobStatus self.taskBuffer = taskBuffer self.ignoreTmpError = ignoreTmpError self.lockXML = None self.siteMapper = siteMapper self.attemptNr = None self.xmlFile = xmlFile self.datasetMap = {} self.extraInfo = {'surl':{},'nevents':{},'lbnr':{},'endpoint':{}, 'guid':{}} # exstract attemptNr try: tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1] if re.search('^\d+$',tmpAttemptNr) != None: self.attemptNr = int(tmpAttemptNr) except: pass # logger self.logger = LogWrapper(_logger,str(self.jobID)) # dump file report def dumpFileReport(self,fileCatalog,attemptNr): self.logger.debug("dump file report") # dump Catalog into file if attemptNr == None: xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus, str(uuid.uuid4())) else: xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus, str(uuid.uuid4()),attemptNr) file = open(xmlFile,'w') file.write(fileCatalog) file.close() # get plugin class def getPluginClass(self, tmpVO): # instantiate concrete plugin adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO) if adderPluginClass == None: # use ATLAS plugin by default from AdderAtlasPlugin import AdderAtlasPlugin adderPluginClass = AdderAtlasPlugin self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__)) return adderPluginClass # main def run(self): try: self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr)) # lock XML self.lockXML = open(self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) except: self.logger.debug("cannot get lock : %s" % self.xmlFile) self.lockXML.close() # remove XML just in case for the final attempt if not self.ignoreTmpError: try: # remove Catalog os.remove(self.xmlFile) except: pass return # check if file exists if not os.path.exists(self.xmlFile): self.logger.debug("not exist : %s" % self.xmlFile) try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: pass return # query job self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, fromWaiting=False, forAnal=True)[0] # check if job has finished if self.job == None: self.logger.debug(': job not found in DB') elif self.job.jobStatus in ['finished','failed','unknown','merging']: self.logger.error(': invalid state -> %s' % self.job.jobStatus) elif self.attemptNr != None and self.job.attemptNr != self.attemptNr: self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr)) elif self.attemptNr is not None and self.job.jobStatus == 'transferring': errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus) self.logger.error(errMsg) # FIXME raise RuntimeError, errMsg elif self.jobStatus == EventServiceUtils.esRegStatus: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, logger=self.logger) # execute self.logger.debug('plugin is ready for ES file registration') adderPlugin.registerEventServiceFiles() else: # check file status in JEDI if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]: fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job) self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI)) if fileCheckInJEDI == None: raise RuntimeError,'failed to 
check file status in JEDI' if fileCheckInJEDI == False: # set job status to failed since some file status is wrong in JEDI self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder errStr = "inconsistent file status between Panda and JEDI. " errStr += "failed to avoid duplicated processing caused by synchronization failure" self.job.ddmErrorDiag = errStr self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus)) elif self.job.jobSubStatus in ['pilot_closed']: # terminated by the pilot self.logger.debug("going to closed since terminated by the pilot") retClosed = self.taskBuffer.killJobs([self.jobID],'pilot','60',True) if retClosed[0] == True: self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() return # check for cloned jobs if EventServiceUtils.isJobCloningJob(self.job): checkJC = self.taskBuffer.checkClonedJob(self.job) if checkJC == None: raise RuntimeError,'failed to check the cloned job' # failed to lock semaphore if checkJC['lock'] == False: self.jobStatus = 'failed' self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "failed to lock semaphore for job cloning" self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus)) # use failed for cancelled/closed jobs if self.job.isCancelled(): self.jobStatus = 'failed' # reset error codes to skip retrial module self.job.pilotErrorCode = 0 self.job.exeErrorCode = 0 self.job.ddmErrorCode = 0 # keep old status oldJobStatus = self.job.jobStatus # set job status if not self.job.jobStatus in ['transferring']: self.job.jobStatus = self.jobStatus addResult = None adderPlugin = None # parse XML parseResult = self.parseXML() if parseResult < 2: # intraction with DDM try: # instantiate concrete plugin adderPluginClass = self.getPluginClass(self.job.VO) adderPlugin = adderPluginClass(self.job, taskBuffer=self.taskBuffer, siteMapper=self.siteMapper, extraInfo=self.extraInfo, logger=self.logger) # execute self.logger.debug('plugin is ready') adderPlugin.execute() addResult = adderPlugin.result self.logger.debug('plugin done with %s' % (addResult.statusCode)) except: errtype,errvalue = sys.exc_info()[:2] self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO, errtype, errvalue)) addResult = None self.job.ddmErrorCode = ErrorCode.EC_Adder self.job.ddmErrorDiag = "AdderPlugin failure" # ignore temporary errors if self.ignoreTmpError and addResult != None and addResult.isTemporary(): self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag) self.logger.debug('escape') # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") return # failed if addResult == None or not addResult.isSucceeded(): self.job.jobStatus = 'failed' # set file status for failed jobs or failed transferring jobs self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus)) if self.job.jobStatus == 'failed' or self.jobStatus == 'failed': # First of all: check if job failed and in this case take first actions according to error table source, error_code, error_diag = None, None, None if self.job.pilotErrorCode: source = 'pilotErrorCode' error_code = self.job.pilotErrorCode error_diag 
= self.job.pilotErrorDiag elif self.job.exeErrorCode: source = 'exeErrorCode' error_code = self.job.exeErrorCode error_diag = self.job.exeErrorDiag elif self.job.ddmErrorCode: source = 'ddmErrorCode' error_code = self.job.ddmErrorCode error_diag = self.job.ddmErrorDiag elif self.job.transExitCode: source = 'transExitCode' error_code = self.job.transExitCode error_diag = '' # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag)) if source and error_code: try: self.logger.debug("AdderGen.run will call apply_retrial_rules") retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr) self.logger.debug("apply_retrial_rules is back") except Exception as e: self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc())) self.job.jobStatus = 'failed' for file in self.job.Files: if file.type in ['output','log']: if addResult != None and file.lfn in addResult.mergingFiles: file.status = 'merging' else: file.status = 'failed' else: # reset errors self.job.jobDispatcherErrorCode = 0 self.job.jobDispatcherErrorDiag = 'NULL' # set status if addResult != None and addResult.mergingFiles != []: # set status for merging: for file in self.job.Files: if file.lfn in addResult.mergingFiles: file.status = 'merging' self.job.jobStatus = 'merging' # propagate transition to prodDB self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) elif addResult != None and addResult.transferringFiles != []: # set status for transferring for file in self.job.Files: if file.lfn in addResult.transferringFiles: file.status = 'transferring' self.job.jobStatus = 'transferring' self.job.jobSubStatus = None # propagate transition to prodDB self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) else: self.job.jobStatus = 'finished' # endtime if self.job.endTime=='NULL': self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) # output size and # of outputs self.job.nOutputDataFiles = 0 self.job.outputFileBytes = 0 for tmpFile in self.job.Files: if tmpFile.type == 'output': self.job.nOutputDataFiles += 1 try: self.job.outputFileBytes += tmpFile.fsize except: pass # protection maxOutputFileBytes = 99999999999 if self.job.outputFileBytes > maxOutputFileBytes: self.job.outputFileBytes = maxOutputFileBytes # set cancelled state if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': self.job.jobStatus = 'cancelled' # update job if oldJobStatus in ['cancelled','closed']: pass else: self.logger.debug("updating DB") retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus], extraInfo=self.extraInfo) self.logger.debug("retU: %s" % retU) # failed if not retU[0]: self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID)) # unlock XML try: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() self.logger.debug(": %s %s" % (type,value)) self.logger.debug("cannot unlock XML") return try: # updateJobs was successful and it failed a job with taskBufferErrorCode self.logger.debug("AdderGen.run will peek the job") job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False, fromArchived=True, fromWaiting=False)[0] self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}".format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode, job_tmp.taskBufferErrorDiag)) if 
job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode: source = 'taskBufferErrorCode' error_code = job_tmp.taskBufferErrorCode error_diag = job_tmp.taskBufferErrorDiag self.logger.debug("AdderGen.run 2 will call apply_retrial_rules") retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code, error_diag, job_tmp.attemptNr) self.logger.debug("apply_retrial_rules 2 is back") except IndexError: pass except Exception as e: self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc())) # setup for closer if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()): destDBList = [] guidList = [] for file in self.job.Files: # ignore inputs if file.type == 'input': continue # skip pseudo datasets if file.destinationDBlock in ['',None,'NULL']: continue # start closer for output/log datasets if not file.destinationDBlock in destDBList: destDBList.append(file.destinationDBlock) # collect GUIDs if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \ self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \ and file.type == 'output': # extract base LFN since LFN was changed to full LFN for CMS baseLFN = file.lfn.split('/')[-1] guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type, 'checksum':file.checksum,'md5sum':file.md5sum, 'fsize':file.fsize,'scope':file.scope}) if guidList != []: retG = self.taskBuffer.setGUIDs(guidList) if destDBList != []: # start Closer if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}: cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap) else: cThr = Closer.Closer(self.taskBuffer,destDBList,self.job) self.logger.debug("start Closer") cThr.start() cThr.join() self.logger.debug("end Closer") # run closer for assocaiate parallel jobs if EventServiceUtils.isJobCloningJob(self.job): assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,self.job.PandaID, destDBList) for assJobID,assDBlocks in assDBlockMap.iteritems(): assJob = self.taskBuffer.peekJobs([assJobID],fromDefined=False, fromArchived=False, fromWaiting=False, forAnal=True)[0] if self.job == None: self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID)) else: cThr = Closer.Closer(self.taskBuffer,assDBlocks,assJob) self.logger.debug("start Closer for PandaID={0}".format(assJobID)) cThr.start() cThr.join() self.logger.debug("end Closer for PandaID={0}".format(assJobID)) self.logger.debug("end") try: # remove Catalog os.remove(self.xmlFile) except: pass # unlock XML if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) self.lockXML.close() except: type, value, traceBack = sys.exc_info() errStr = ": %s %s " % (type,value) errStr += traceback.format_exc() self.logger.error(errStr) self.logger.error("except") # unlock XML just in case try: if self.lockXML != None: fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) except: type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) self.logger.error("cannot unlock XML") # parse XML # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service def parseXML(self): # get LFN and GUID self.logger.debug('XML filename : %s' % self.xmlFile) # no outputs if self.job.Files == []: self.logger.debug("has no outputs") self.logger.debug("parseXML end") return 0 # get input files 
inputLFNs = [] for file in self.job.Files: if file.type == 'input': inputLFNs.append(file.lfn) # parse XML lfns = [] guids = [] fsizes = [] md5sums = [] chksums = [] surls = [] fullLfnMap = {} nEventsMap = {} guidMap = dict() try: root = xml.dom.minidom.parse(self.xmlFile) files = root.getElementsByTagName('File') for file in files: # get GUID guid = str(file.getAttribute('ID')) # get PFN and LFN nodes logical = file.getElementsByTagName('logical')[0] lfnNode = logical.getElementsByTagName('lfn')[0] # convert UTF8 to Raw lfn = str(lfnNode.getAttribute('name')) # get metadata fsize = None md5sum = None adler32 = None surl = None fullLFN = None for meta in file.getElementsByTagName('metadata'): # get fsize name = str(meta.getAttribute('att_name')) if name == 'fsize': fsize = long(meta.getAttribute('att_value')) elif name == 'md5sum': md5sum = str(meta.getAttribute('att_value')) # check if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: md5sum = None elif name == 'adler32': adler32 = str(meta.getAttribute('att_value')) elif name == 'surl': surl = str(meta.getAttribute('att_value')) elif name == 'full_lfn': fullLFN = str(meta.getAttribute('att_value')) # endpoints self.extraInfo['endpoint'][lfn] = [] for epNode in file.getElementsByTagName('endpoint'): self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data)) # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # parse json try: import json with open(self.xmlFile) as tmpF: jsonDict = json.load(tmpF) for lfn, fileData in jsonDict.iteritems(): lfn = str(lfn) fsize = None md5sum = None adler32 = None surl = None fullLFN = None guid = str(fileData['guid']) if 'fsize' in fileData: fsize = long(fileData['fsize']) if 'md5sum' in fileData: md5sum = str(fileData['md5sum']) # check if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: md5sum = None if 'adler32' in fileData: adler32 = str(fileData['adler32']) if 'surl' in fileData: surl = str(fileData['surl']) if 'full_lfn' in fileData: fullLFN = str(fileData['full_lfn']) # endpoints self.extraInfo['endpoint'][lfn] = [] if 'endpoint' in fileData: self.extraInfo['endpoint'][lfn] = fileData['endpoint'] # error check if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): if EventServiceUtils.isEventServiceMerge(self.job): continue else: raise RuntimeError, 'fsize/md5sum/adler32/surl=None' # append lfns.append(lfn) guids.append(guid) fsizes.append(fsize) md5sums.append(md5sum) surls.append(surl) if adler32 != None: # use adler32 if available chksums.append("ad:%s" % adler32) else: chksums.append("md5:%s" % md5sum) if fullLFN != None: fullLfnMap[lfn] = fullLFN except: # check if file exists if os.path.exists(self.xmlFile): type, value, traceBack = sys.exc_info() self.logger.error(": %s %s" % (type,value)) # set failed anyway self.job.jobStatus = 'failed' # XML error happens when pilot got killed due to wall-time limit or failures in wrapper if (self.job.pilotErrorCode in [0,'0','NULL']) and \ (self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_WorkerDone]) and \ (self.job.transExitCode in [0,'0','NULL']): 
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        try:
            root = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                guidMap[lfn] = guid
                # get nEvents from metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # look for the 'events' attribute
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = long(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
        except:
            pass
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = long(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except:
                        pass
                    try:
                        guid = str(jsonSubFileItem['file_guid'])
                        guidMap[lfn] = guid
                    except:
                        pass
        except:
            pass
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        self.logger.debug('guidMap=%s' % str(guidMap))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error("failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user','panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in ['managed','test'] + JobUtils.list_ptest_prod_sources:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if not file.lfn in lfns:
                    if self.job.jobStatus == 'finished' and \
                            (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(file.lfn,file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID = guids[i]
                    file.fsize = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if fullLfnMap.has_key(file.lfn):
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents
                    if nEventsMap.has_key(file.lfn):
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
                except:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type,value))
                # set lumi block number
                if lumiBlockNr != None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
        self.extraInfo['guid'] = guidMap
        # check consistency between XML and filesTable
        for lfn in lfns:
            if not lfn in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0

    # copy files for variable number of outputs
    def copyFilesForVariableNumOutputs(self,lfns):
        # get original output files
        origOutputs = {}
        updateOrig = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output','log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unknown files
        addedNewFiles = False
        for newLFN in lfns:
            if not newLFN in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs.keys():
                    tmpPatt = '^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt,newLFN) != None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(newLFN,origOutputs[origLFN],updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
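# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the PanDA sources): a minimal example of the
# pilot file catalog that parseXML above walks with xml.dom.minidom. The
# <POOLFILECATALOG> wrapper element, the LFN, the GUID and the metadata values
# are placeholder assumptions; parseXML itself only iterates over <File>
# elements and reads the same attributes accessed below.
import xml.dom.minidom

samplePilotXML = """<?xml version="1.0" encoding="UTF-8"?>
<POOLFILECATALOG>
  <File ID="6d9dcd36-a2b1-43c4-9b0e-1234567890ab">
    <logical>
      <lfn name="user.someone.mytask.AOD._000001.pool.root"/>
    </logical>
    <metadata att_name="fsize" att_value="123456789"/>
    <metadata att_name="adler32" att_value="0a1b2c3d"/>
    <metadata att_name="surl" att_value="srm://some.se.example.org/path/to/file"/>
  </File>
</POOLFILECATALOG>
"""

root = xml.dom.minidom.parseString(samplePilotXML)
for fileNode in root.getElementsByTagName('File'):
    # GUID comes from the ID attribute, the LFN from <logical>/<lfn name=...>
    guid = str(fileNode.getAttribute('ID'))
    lfnNode = fileNode.getElementsByTagName('logical')[0].getElementsByTagName('lfn')[0]
    lfn = str(lfnNode.getAttribute('name'))
    # metadata entries are (att_name, att_value) pairs
    meta = dict((str(m.getAttribute('att_name')), str(m.getAttribute('att_value')))
                for m in fileNode.getElementsByTagName('metadata'))
    print lfn, guid, meta.get('fsize'), meta.get('adler32')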
from config import panda_config
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

if __name__ == '__main__':
    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']
    def getGUIDsFromEventIndex(self,runEventList,streamName,amiTags,dataType):
        comment = ' /* DBProxy.getGUIDsFromEventIndex */'
        methodName = comment.split(' ')[-2].split('.')[-1]
        tmpLog = LogWrapper(_logger,methodName+" <streamName={0} amiTags={1} dataType={2}>".format(streamName,amiTags,dataType))
        try:
            # change to list
            if not amiTags in [None,'']:
                amiTags = amiTags.replace('*','.*').split(',')
            tmpLog.debug("start for {0} events".format(len(runEventList)))
            # check data type
            if not dataType in ['RAW','ESD','AOD']:
                return False,'dataType={0} is unsupported'.format(dataType)
            # sql to insert runs and events
            sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(panda_config.schemaEI)
            sqlRE += "VALUES (:runNumber,:eventNumber) "
            varMaps = []
            for runNumber,eventNumber in runEventList:
                varMap = {}
                varMap[':runNumber'] = runNumber
                varMap[':eventNumber'] = eventNumber
                varMaps.append(varMap)
            # begin transaction
            self.conn.begin()
            self.cur.arraysize = 100000
            # insert runs and events
            self.cur.executemany(sqlRE+comment, varMaps)
            # read GUIDs
            varMap = {}
            if amiTags in [None,'']:
                sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format(dataType)
                sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(panda_config.schemaEI)
            else:
                sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(dataType)
                sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(panda_config.schemaEI)
            if not streamName in [None,'']:
                sqlRG += "WHERE streamName=:streamName "
                varMap[':streamName'] = streamName
            self.cur.execute(sqlRG+comment, varMap)
            resRG = self.cur.fetchall()
            # commit
            if not self._commit():
                raise RuntimeError('Commit error')
            retValue = {}
            keyAmiIdxMap = {}
            for tmpItem in resRG:
                if amiTags in [None,'']:
                    runNumber,eventNumber,guid = tmpItem
                    # dummy
                    idxTag = 0
                else:
                    runNumber,eventNumber,guid,amiTag = tmpItem
                    # get index number for the AMI tag in the list
                    idxTag = self.getIndexAmiTag(amiTags,amiTag)
                    # didn't match
                    if idxTag == None:
                        continue
                tmpKey = (runNumber,eventNumber)
                # use AMI tag in a preference order
                if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                    continue
                keyAmiIdxMap[tmpKey] = idxTag
                retValue[tmpKey] = [guid]
            tmpLog.debug("found {0} events".format(len(retValue)))
            return True,retValue
        except:
            # roll back
            self._rollback()
            # error
            self.dumpErrorMessage(_logger,methodName)
            return False,None
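    # ------------------------------------------------------------------
    # Illustrative usage sketch (not part of the PanDA sources): how
    # getGUIDsFromEventIndex above might be driven. 'proxy' is assumed to
    # be an initialized DBProxy instance with an open connection; the run
    # and event numbers, stream name and AMI tag pattern are placeholders.
    #
    #   runEventList = [(358031, 1234567), (358031, 1234568)]
    #   ok, guidMap = proxy.getGUIDsFromEventIndex(runEventList, 'physics_Main',
    #                                              'f9*,r1*', 'AOD')
    #   if ok:
    #       # guidMap maps (runNumber, eventNumber) to a list with the chosen GUID
    #       for (runNumber, eventNumber), guids in guidMap.iteritems():
    #           print runNumber, eventNumber, guids
    #   else:
    #       print "event lookup failed:", guidMap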
import pandalogger.PandaLogger
from pandalogger.PandaLogger import PandaLogger
from brokerage.SiteMapper import SiteMapper
from pandautils import PandaUtils
from pandalogger.LogWrapper import LogWrapper

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('add')
tmpLog = LogWrapper(_logger, None)
tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20
# grace period
try:
    gracePeriod = int(sys.argv[1])
except:
    gracePeriod = 3
# current minute
currentMinute = datetime.datetime.utcnow().minute

# kill old process
try:
from taskbuffer.TaskBuffer import taskBuffer
from pandalogger.PandaLogger import PandaLogger
from pandalogger.LogWrapper import LogWrapper
from brokerage.SiteMapper import SiteMapper
from taskbuffer import ErrorCode

# password
from config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)
tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow()-datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "