def parseXML(self):
    """Parse the pilot's file catalog and update self.job.Files in place.

    The pilot reports produced files either as a POOL file catalog XML or,
    as a fallback, as a JSON dictionary (both read from self.data). For each
    reported file the LFN, GUID, fsize, checksum and SURL are extracted and
    copied onto the matching FileSpec in self.job.Files. Event counts and
    GUIDs are additionally taken from self.job.metadata (XML or JSON job
    report), or from the pilot-reported job-level nEvents as a last resort.

    Returns:
        0 on success (also when the job has no log/output files),
        2 on a fatal problem (catalog unparsable or inconsistent with the
          job definition); in that case the job is flagged as failed.

    Note: the original code used long(), which does not exist in Python 3
    and made every parse attempt fail silently via the broad except; all
    numeric conversions now use int().
    """
    # nothing to do when the job produced no log/output files
    log_out = [f for f in self.job.Files if f.type in ['log', 'output']]
    if not log_out:
        self.logger.debug("has no outputs")
        self.logger.debug("parseXML end")
        return 0
    # input LFNs are allowed to lack fsize/checksum metadata
    inputLFNs = []
    for file in self.job.Files:
        if file.type == 'input':
            inputLFNs.append(file.lfn)
    # accumulators filled from the catalog; index i of each list refers to
    # the same reported file
    lfns = []
    guids = []
    fsizes = []
    md5sums = []
    chksums = []
    surls = []
    fullLfnMap = {}
    nEventsMap = {}
    guidMap = {}
    try:
        # primary format: POOL file catalog XML
        root = xml.dom.minidom.parseString(self.data)
        files = root.getElementsByTagName('File')
        for file in files:
            # get GUID
            guid = str(file.getAttribute('ID'))
            # get PFN and LFN nodes
            logical = file.getElementsByTagName('logical')[0]
            lfnNode = logical.getElementsByTagName('lfn')[0]
            # convert UTF8 to Raw
            lfn = str(lfnNode.getAttribute('name'))
            # get metadata
            fsize = None
            md5sum = None
            adler32 = None
            surl = None
            fullLFN = None
            for meta in file.getElementsByTagName('metadata'):
                name = str(meta.getAttribute('att_name'))
                if name == 'fsize':
                    fsize = int(meta.getAttribute('att_value'))
                elif name == 'md5sum':
                    md5sum = str(meta.getAttribute('att_value'))
                    # discard values that are not a 32-digit hex string
                    if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                        md5sum = None
                elif name == 'adler32':
                    adler32 = str(meta.getAttribute('att_value'))
                elif name == 'surl':
                    surl = str(meta.getAttribute('att_value'))
                elif name == 'full_lfn':
                    fullLFN = str(meta.getAttribute('att_value'))
            # endpoints
            self.extraInfo['endpoint'][lfn] = []
            for epNode in file.getElementsByTagName('endpoint'):
                self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
            # outputs must carry a size and at least one checksum;
            # ES merge jobs are allowed to skip incomplete entries
            if (lfn not in inputLFNs) and (fsize is None or (md5sum is None and adler32 is None)):
                if EventServiceUtils.isEventServiceMerge(self.job):
                    continue
                else:
                    raise RuntimeError('fsize/md5sum/adler32/surl=None')
            # append
            lfns.append(lfn)
            guids.append(guid)
            fsizes.append(fsize)
            md5sums.append(md5sum)
            surls.append(surl)
            if adler32 is not None:
                # use adler32 if available
                chksums.append("ad:%s" % adler32)
            else:
                chksums.append("md5:%s" % md5sum)
            if fullLFN is not None:
                fullLfnMap[lfn] = fullLFN
    except Exception:
        # fallback format: JSON dictionary keyed by LFN
        try:
            import json
            jsonDict = json.loads(self.data)
            for lfn in jsonDict:
                fileData = jsonDict[lfn]
                lfn = str(lfn)
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                guid = str(fileData['guid'])
                if 'fsize' in fileData:
                    fsize = int(fileData['fsize'])
                if 'md5sum' in fileData:
                    md5sum = str(fileData['md5sum'])
                    # discard values that are not a 32-digit hex string
                    if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                        md5sum = None
                if 'adler32' in fileData:
                    adler32 = str(fileData['adler32'])
                if 'surl' in fileData:
                    surl = str(fileData['surl'])
                if 'full_lfn' in fileData:
                    fullLFN = str(fileData['full_lfn'])
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                if 'endpoint' in fileData:
                    self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                # same consistency requirement as in the XML branch
                if (lfn not in inputLFNs) and (fsize is None or (md5sum is None and adler32 is None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError('fsize/md5sum/adler32/surl=None')
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 is not None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN is not None:
                    fullLfnMap[lfn] = fullLFN
        except Exception:
            # neither XML nor JSON could be parsed: fail the job
            errType, errValue, _ = sys.exc_info()
            self.logger.error(": %s %s" % (errType, errValue))
            self.job.jobStatus = 'failed'
            # a broken catalog typically means the pilot was killed (e.g.
            # wall-time limit) or the wrapper failed; only set the DDM error
            # when no other error source already explains the failure
            if (self.job.pilotErrorCode in [0, '0', 'NULL']) and \
                    (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \
                    (self.job.transExitCode in [0, '0', 'NULL']):
                self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
            return 2
    # parse the job report (self.job.metadata) to get nEvents; XML first
    nEventsFrom = None
    try:
        root = xml.dom.minidom.parseString(self.job.metadata)
        files = root.getElementsByTagName('File')
        for file in files:
            # get GUID
            guid = str(file.getAttribute('ID'))
            # get PFN and LFN nodes
            logical = file.getElementsByTagName('logical')[0]
            lfnNode = logical.getElementsByTagName('lfn')[0]
            # convert UTF8 to Raw
            lfn = str(lfnNode.getAttribute('name'))
            guidMap[lfn] = guid
            # get metadata
            nevents = None
            for meta in file.getElementsByTagName('metadata'):
                name = str(meta.getAttribute('att_name'))
                if name == 'events':
                    nevents = int(meta.getAttribute('att_value'))
                    nEventsMap[lfn] = nevents
                    break
        nEventsFrom = "xml"
    except Exception:
        pass
    # then try the JSON job report
    try:
        import json
        jsonDict = json.loads(self.job.metadata)
        for jsonFileItem in jsonDict['files']['output']:
            for jsonSubFileItem in jsonFileItem['subFiles']:
                lfn = str(jsonSubFileItem['name'])
                try:
                    nevents = int(jsonSubFileItem['nentries'])
                    nEventsMap[lfn] = nevents
                except Exception:
                    pass
                try:
                    guid = str(jsonSubFileItem['file_guid'])
                    guidMap[lfn] = guid
                except Exception:
                    pass
        nEventsFrom = "json"
    except Exception:
        pass
    # use nEvents and GUIDs reported by the pilot if no job report
    if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \
            and self.job.prodSourceLabel in ['managed']:
        for file in self.job.Files:
            if file.type == 'output':
                nEventsMap[file.lfn] = self.job.nEvents
        for lfn, guid in zip(lfns, guids):
            guidMap[lfn] = guid
        nEventsFrom = "pilot"
    self.logger.debug('nEventsMap=%s' % str(nEventsMap))
    self.logger.debug('nEventsFrom=%s' % str(nEventsFrom))
    self.logger.debug('guidMap=%s' % str(guidMap))
    self.logger.debug('self.job.jobStatus=%s in parseXML' % self.job.jobStatus)
    self.logger.debug('isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob(self.job),
                                              EventServiceUtils.isJumboJob(self.job)))
    # get lumi block number
    lumiBlockNr = self.job.getLumiBlockNr()
    # copy files for variable number of outputs
    tmpStat = self.copyFilesForVariableNumOutputs(lfns)
    if not tmpStat:
        self.logger.error("failed to copy files for variable number of outputs")
        return 2
    # propagate catalog info onto the job's FileSpecs
    fileList = []
    for file in self.job.Files:
        fileList.append(file.lfn)
        if file.type == 'input':
            if file.lfn in lfns:
                if self.job.prodSourceLabel in ['user', 'panda']:
                    # skipped file
                    file.status = 'skipped'
                elif self.job.prodSourceLabel in ['managed', 'test'] + JobUtils.list_ptest_prod_sources:
                    # failed by pilot
                    file.status = 'failed'
        elif file.type == 'output' or file.type == 'log':
            # add only log file for failed jobs
            if self.jobStatus == 'failed' and file.type != 'log':
                file.status = 'failed'
                continue
            # set failed if it is missing in XML
            if file.lfn not in lfns:
                if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \
                        or EventServiceUtils.isJumboJob(self.job):
                    # unset file status for ES jobs
                    pass
                elif file.isAllowedNoOutput():
                    # allowed not to be produced
                    file.status = 'nooutput'
                    self.logger.debug('set {0} to status={1}'.format(file.lfn, file.status))
                else:
                    file.status = 'failed'
                    self.job.jobStatus = 'failed'
                    self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                    self.logger.error(self.job.ddmErrorDiag)
                continue
            # look for GUID with LFN
            try:
                i = lfns.index(file.lfn)
                file.GUID = guids[i]
                file.fsize = fsizes[i]
                file.md5sum = md5sums[i]
                file.checksum = chksums[i]
                surl = surls[i]
                # status
                file.status = 'ready'
                # change to full LFN
                if file.lfn in fullLfnMap:
                    file.lfn = fullLfnMap[file.lfn]
                # add SURL to extraInfo
                self.extraInfo['surl'][file.lfn] = surl
                # add nevents
                if file.lfn in nEventsMap:
                    self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
            except Exception:
                # status
                file.status = 'failed'
                errType, errValue, _ = sys.exc_info()
                self.logger.error(": %s %s" % (errType, errValue))
            # set lumi block number
            if lumiBlockNr is not None and file.status != 'failed':
                self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
    self.extraInfo['guid'] = guidMap
    # check consistency between XML and filesTable
    for lfn in lfns:
        if lfn not in fileList:
            self.logger.error("%s is not found in filesTable" % lfn)
            self.job.jobStatus = 'failed'
            for tmpFile in self.job.Files:
                tmpFile.status = 'failed'
            self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
            self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
            return 2
    # return
    self.logger.debug("parseXML end")
    return 0
def run(self):
    """Freeze finished datasets.

    For each (vuid, name, modDate) in self.datasets: if every file destined
    for the dataset has reached a terminal status, close ("freeze") the
    dataset in Rucio, mark it 'completed' in the DB, and clean up; otherwise
    just refresh its modification date so it is revisited later.

    NOTE(review): all taskBuffer calls are bracketed by proxyLock
    acquire/release — presumably serializing access to a shared DB proxy;
    self.lock/self.pool appear to implement a worker-pool slot that is
    released at the end. Confirm against the surrounding module.
    """
    self.lock.acquire()
    try:
        for vuid,name,modDate in self.datasets:
            _logger.debug("Freezer start %s %s" % (modDate,name))
            # fetch the status of every file heading to this dataset
            self.proxyLock.acquire()
            retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ",
                                             {':destinationDBlock':name})
            self.proxyLock.release()
            if retF < 0:
                _logger.error("SQL error")
            else:
                # the dataset is freezable only when every file is in a
                # terminal state
                allFinished = True
                onePandaID = None
                for tmpPandaID,tmpFileStatus in resF:
                    onePandaID = tmpPandaID
                    if not tmpFileStatus in ['ready', 'failed', 'skipped',
                                             'merging', 'finished']:
                        allFinished = False
                        break
                # check sub datasets in the jobset for event service job
                if allFinished:
                    self.proxyLock.acquire()
                    tmpJobs = taskBuffer.getFullJobStatus([onePandaID])
                    self.proxyLock.release()
                    if len(tmpJobs) > 0 and tmpJobs[0] is not None:
                        if EventServiceUtils.isEventServiceMerge(tmpJobs[0]):
                            # for ES merge, all sub datasets in the jobset
                            # must be done before freezing
                            self.proxyLock.acquire()
                            cThr = Closer(taskBuffer, [], tmpJobs[0])
                            allFinished = cThr.checkSubDatasetsInJobset()
                            self.proxyLock.release()
                            _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished))
                # no files in filesTable
                if allFinished:
                    _logger.debug("freeze %s " % name)
                    # these name prefixes are not registered in Rucio
                    dsExists = True
                    if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                            or name.startswith('hc_test.') or name.startswith('panda.um.'):
                        dsExists = False
                    if name.startswith('panda.um.'):
                        # pre-merged dataset: hand it to a JEDI Closer via
                        # one of its merging/failed files
                        self.proxyLock.acquire()
                        retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                             {':destinationDBlock':name,
                                                              ':statusM':'merging',
                                                              ':statusF':'failed'})
                        self.proxyLock.release()
                        if resMer is not None and len(resMer)>0:
                            mergeID = resMer[0][0]
                            # get merging jobs
                            self.proxyLock.acquire()
                            mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                            self.proxyLock.release()
                            mergeJob = mergingJobs[0]
                            if mergeJob is not None:
                                tmpDestDBlocks = []
                                # get destDBlock
                                for tmpFile in mergeJob.Files:
                                    if tmpFile.type in ['output','log']:
                                        if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                            tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                # run
                                _logger.debug("start JEDI closer for %s " % name)
                                self.proxyLock.acquire()
                                cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                cThr.start()
                                cThr.join()
                                self.proxyLock.release()
                                _logger.debug("end JEDI closer for %s " % name)
                                # the Closer handles everything; skip the
                                # generic freeze/cleanup below
                                continue
                            else:
                                _logger.debug("failed to get merging job for %s " % name)
                        else:
                            _logger.debug("failed to get merging file for %s " % name)
                        status,out = True,''
                    elif dsExists:
                        # check if dataset exists
                        status,out = rucioAPI.getMetaData(name)
                        if status == True:
                            if out is not None:
                                try:
                                    rucioAPI.closeDataset(name)
                                    status = True
                                except Exception:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    out = 'failed to freeze : {0} {1}'.format(errtype,errvalue)
                                    status = False
                            else:
                                # dataset not exist
                                status,out = True,''
                                dsExists = False
                    else:
                        status,out = True,''
                    if not status:
                        _logger.error('{0} failed to freeze with {1}'.format(name,out))
                    else:
                        # record completion in the Datasets table
                        self.proxyLock.acquire()
                        varMap = {}
                        varMap[':vuid'] = vuid
                        varMap[':status'] = 'completed'
                        taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap)
                        self.proxyLock.release()
                        # no Rucio-side cleanup for unregistered datasets
                        if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                            continue
                        # set tobedeleted to dis
                        setTobeDeletedToDis(name)
                        # count # of files
                        status,out = rucioAPI.getNumberOfFiles(name)
                        if status is not True:
                            if status is False:
                                _logger.error(out)
                            else:
                                _logger.debug(out)
                        try:
                            nFile = int(out)
                            _logger.debug(nFile)
                            if nFile == 0:
                                # erase dataset
                                _logger.debug('erase %s' % name)
                                status,out = rucioAPI.eraseDataset(name)
                                _logger.debug('OK with %s' % name)
                        except Exception:
                            # best effort: a non-numeric count just skips
                            # the erase step
                            pass
                else:
                    # not all files done yet: touch the dataset so the next
                    # Freezer cycle picks it up again
                    _logger.debug("wait %s " % name)
                    self.proxyLock.acquire()
                    taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                    self.proxyLock.release()
            _logger.debug("end %s " % name)
    except Exception:
        errStr = traceback.format_exc()
        _logger.error(errStr)
    # release the worker-pool slot
    self.pool.remove(self)
    self.lock.release()
def run(self):
    """Close the destination datasets of a finished/failed job.

    For every destinationDBlock of the job, decide the dataset's final
    status ('closed', 'tobemerged', 'doing' or 'tobeclosed'), apply it once
    all files are accounted for, propagate the status to top-level user /
    parent datasets, and trigger downstream actors (Activator, VO closer
    plugin, pending-job finalization, Notifier) as appropriate.
    """
    try:
        _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
        flagComplete = True
        topUserDsList = []
        usingMerger = False
        disableNotifier = False
        firstIndvDS = True
        finalStatusDS = []
        for destinationDBlock in self.destinationDBlocks:
            dsList = []
            _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
            # ignore tid datasets
            if re.search('_tid[\d_]+$',destinationDBlock):
                _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))
                continue
            # ignore HC datasets (except their _sub/.lib companions)
            if re.search('^hc_test\.',destinationDBlock) is not None or re.search('^user\.gangarbt\.',destinationDBlock) is not None:
                if re.search('_sub\d+$',destinationDBlock) is None and re.search('\.lib$',destinationDBlock) is None:
                    _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))
                    continue
            # query dataset (cached map first, then DB)
            if destinationDBlock in self.datasetMap:
                dataset = self.datasetMap[destinationDBlock]
            else:
                dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
            if dataset is None:
                _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                flagComplete = False
                continue
            # skip tobedeleted/tobeclosed
            if dataset.status in ['cleanup','tobeclosed','completed','deleted']:
                _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                continue
            dsList.append(dataset)
            # sort
            dsList.sort()
            # count number of completed files
            notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                           'status':'unknown'})
            if notFinish < 0:
                _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                flagComplete = False
                continue
            # check if completed
            _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
            if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                # close non-DQ2 destinationDBlock immediately
                finalStatus = 'closed'
            elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                finalStatus = 'closed'
            elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                    and self.job.processingType != 'usermerge':
                # merge output files
                if firstIndvDS:
                    # set 'tobemerged' to only the first dataset to avoid
                    # triggering many Mergers for --individualOutDS
                    finalStatus = 'tobemerged'
                    firstIndvDS = False
                else:
                    finalStatus = 'tobeclosed'
                # set merging to top dataset
                usingMerger = True
                # disable Notifier
                disableNotifier = True
            elif self.job.produceUnMerge():
                finalStatus = 'doing'
            else:
                # set status to 'tobeclosed' to trigger DQ2 closing
                finalStatus = 'tobeclosed'
            # for ES merge, every sub dataset in the jobset must be done too
            if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job):
                allInJobsetFinished = self.checkSubDatasetsInJobset()
            else:
                allInJobsetFinished = True
            if notFinish == 0 and allInJobsetFinished:
                _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                # set status
                dataset.status = finalStatus
                # update dataset in DB; the criteria guard against racing
                # with another Closer or a 'locked' dataset
                retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                      criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                if len(retT) > 0 and retT[0]==1:
                    finalStatusDS += dsList
                    # close user datasets
                    if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                            and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                        # get top-level user dataset
                        topUserDsName = re.sub('_sub\d+$','',dataset.name)
                        # update if it is the first attempt
                        if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                            topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                            if topUserDs is not None:
                                # check status
                                if topUserDs.status in ['completed','cleanup','tobeclosed','deleted',
                                                        'tobemerged','merging']:
                                    _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                else:
                                    # set status
                                    if self.job.processingType.startswith('gangarobot') or \
                                            self.job.processingType.startswith('hammercloud'):
                                        # not trigger freezing for HC datasets so that files can be appended
                                        topUserDs.status = 'completed'
                                    elif not usingMerger:
                                        topUserDs.status = finalStatus
                                    else:
                                        topUserDs.status = 'merging'
                                    # append to avoid repetition
                                    topUserDsList.append(topUserDsName)
                                    # update DB
                                    retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                             criteriaMap={':crStatus':topUserDs.status})
                                    if len(retTopT) > 0 and retTopT[0]==1:
                                        _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                    else:
                                        _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                        # get parent dataset for merge job
                        if self.job.processingType == 'usermerge':
                            tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                            if tmpMatch is None:
                                _logger.error('%s failed to extract parentDS' % self.pandaID)
                            else:
                                unmergedDsName = tmpMatch.group(1)
                                # update if it is the first attempt
                                if not unmergedDsName in topUserDsList:
                                    unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                    if unmergedDs is None:
                                        _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                    else:
                                        # check status
                                        if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                            _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                        else:
                                            # set status
                                            unmergedDs.status = finalStatus
                                            # append to avoid repetition
                                            topUserDsList.append(unmergedDsName)
                                            # update DB
                                            retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                     criteriaMap={':crStatus':unmergedDs.status})
                                            if len(retTopT) > 0 and retTopT[0]==1:
                                                _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                            else:
                                                _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                    # start Activator for non-sub datasets
                    if re.search('_sub\d+$',dataset.name) is None:
                        if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                            # don't trigger Activator for merge jobs
                            pass
                        else:
                            if self.job.jobStatus == 'finished':
                                aThr = Activator(self.taskBuffer,dataset)
                                aThr.start()
                                aThr.join()
                else:
                    # unset flag since another thread already updated
                    #flagComplete = False
                    pass
            else:
                # files still pending: touch the dataset in DB and mark the
                # job's closing as incomplete
                self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                               criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                # unset flag
                flagComplete = False
            # end
            _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
        # special actions for vo
        if flagComplete:
            closerPluginClass = panda_config.getPlugin('closer_plugins',self.job.VO)
            if closerPluginClass is None and self.job.VO == 'atlas':
                # use ATLAS plugin for ATLAS
                from pandaserver.dataservice.CloserAtlasPlugin import CloserAtlasPlugin
                closerPluginClass = CloserAtlasPlugin
            if closerPluginClass is not None:
                closerPlugin = closerPluginClass(self.job,finalStatusDS,_logger)
                closerPlugin.execute()
        # change pending jobs to failed
        finalizedFlag = True
        if flagComplete and self.job.prodSourceLabel=='user':
            _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
            finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID)
            _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
        # update unmerged datasets in JEDI to trigger merging
        if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
            if finalizedFlag:
                tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
                _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID,tmpStat))
        # start notifier
        _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
        if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
                (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
                self.job.lockedby != 'jedi':
            # don't send email for merge jobs
            if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']:
                useNotifier = True
                summaryInfo = {}
                # check all jobDefIDs in jobsetID
                if not self.job.jobsetID in [0,None,'NULL']:
                    useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                            self.job.prodUserName)
                    _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                if useNotifier:
                    _logger.debug('%s start Notifier' % self.pandaID)
                    nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                    nThr.run()
                    _logger.debug('%s end Notifier' % self.pandaID)
        _logger.debug('%s End' % self.pandaID)
    except Exception:
        errType,errValue = sys.exc_info()[:2]
        _logger.error("%s %s" % (errType,errValue))
def updateJobs(self, jobList, tmpLog):
    """Sort jobs by status and persist the result through taskBuffer.

    Jobs are routed into four buckets: failed/cancelled -> updateJobs,
    waiting -> keepJobs, no input (dispatchDBlock == 'NULL') -> activateJobs,
    everything else -> status set to 'assigned' and updated. Before the bulk
    DB calls, any non-ES-merge job whose events are all done is individually
    finalized as 'finished' to trigger merge generation.

    :param jobList: jobs to dispatch (JobSpec-like objects)
    :param tmpLog: logger with a debug() method
    """
    # NOTE: the original code named this local list 'updateJobs', shadowing
    # the method itself; renamed to assignedJobs for clarity. The unused
    # 'closeJobs' list and the trailing 'del' statements were removed.
    assignedJobs = []
    failedJobs = []
    activateJobs = []
    waitingJobs = []
    # sort out jobs
    for job in jobList:
        if job.jobStatus in ['failed', 'cancelled']:
            # failed jobs
            failedJobs.append(job)
        elif job.jobStatus == 'waiting':
            # waiting
            waitingJobs.append(job)
        elif job.dispatchDBlock == 'NULL':
            # no input jobs
            activateJobs.append(job)
        else:
            # normal jobs: change status
            job.jobStatus = "assigned"
            assignedJobs.append(job)
    # trigger merge generation if all events are done
    newActivateJobs = []
    nFinished = 0
    for job in activateJobs:
        if job.notDiscardEvents() and job.allOkEvents() and not EventServiceUtils.isEventServiceMerge(job):
            # activate first, then mark finished (order matters for the DB)
            self.taskBuffer.activateJobs([job])
            job.jobStatus = "finished"
            self.taskBuffer.updateJobs([job], False)
            nFinished += 1
        else:
            newActivateJobs.append(job)
    activateJobs = newActivateJobs
    tmpLog.debug('# of finished jobs in activated : {0}'.format(nFinished))
    newUpdateJobs = []
    nFinished = 0
    for job in assignedJobs:
        if job.notDiscardEvents() and job.allOkEvents() and not EventServiceUtils.isEventServiceMerge(job):
            # persist current state, then mark finished and persist again
            self.taskBuffer.updateJobs([job], True)
            job.jobStatus = "finished"
            self.taskBuffer.updateJobs([job], True)
            nFinished += 1
        else:
            newUpdateJobs.append(job)
    assignedJobs = newUpdateJobs
    tmpLog.debug('# of finished jobs in defined : {0}'.format(nFinished))
    # update DB
    tmpLog.debug('# of activated jobs : {0}'.format(len(activateJobs)))
    self.taskBuffer.activateJobs(activateJobs)
    tmpLog.debug('# of updated jobs : {0}'.format(len(assignedJobs)))
    self.taskBuffer.updateJobs(assignedJobs, True)
    tmpLog.debug('# of failed jobs : {0}'.format(len(failedJobs)))
    self.taskBuffer.updateJobs(failedJobs, True)
    tmpLog.debug('# of waiting jobs : {0}'.format(len(waitingJobs)))
    self.taskBuffer.keepJobs(waitingJobs)
def appendJob(self, job, siteMapperCache=None):
    """Serialize a job into self.data for delivery to the pilot.

    Flattens the JobSpec-like object 'job' into the flat key/value mapping
    self.data: scalar attributes are copied directly, per-file attributes
    are joined into comma-separated strings (most accumulators append a
    trailing comma that is stripped with [:-1] at assignment time).

    :param job: job to serialize; job.Files drives all file-list fields
    :param siteMapperCache: optional cached SiteMapper used to resolve the
        site spec and nucleus destinationSE
    """
    # event service merge
    if EventServiceUtils.isEventServiceMerge(job):
        isEventServiceMerge = True
    else:
        isEventServiceMerge = False
    # PandaID
    self.data['PandaID'] = job.PandaID
    # prodSourceLabel
    self.data['prodSourceLabel'] = job.prodSourceLabel
    # swRelease
    self.data['swRelease'] = job.AtlasRelease
    # homepackage
    self.data['homepackage'] = job.homepackage
    # transformation
    self.data['transformation'] = job.transformation
    # job name
    self.data['jobName'] = job.jobName
    # job definition ID
    self.data['jobDefinitionID'] = job.jobDefinitionID
    # cloud
    self.data['cloud'] = job.cloud
    # per-file accumulators; 'str*' variables are comma-joined strings
    strIFiles = ''
    strOFiles = ''
    strDispatch = ''
    strDisToken = ''
    strDisTokenForOutput = ''
    strDestination = ''
    strRealDataset = ''
    strRealDatasetIn = ''
    strProdDBlock = ''
    strDestToken = ''
    strProdToken = ''
    strProdTokenForOutput = ''
    strGUID = ''
    strFSize = ''
    strCheckSum = ''
    strFileDestinationSE = ''
    strScopeIn = ''
    strScopeOut = ''
    strScopeLog = ''
    logFile = ''
    logGUID = ''
    ddmEndPointIn = []
    ddmEndPointOut = []
    noOutput = []
    siteSpec = None
    inDsLfnMap = {}
    inLFNset = set()
    if siteMapperCache is not None:
        siteMapper = siteMapperCache.getObj()
        siteSpec = siteMapper.getSite(job.computingSite)
        # resolve destSE
        try:
            job.destinationSE = siteMapper.resolveNucleus(job.destinationSE)
            for tmpFile in job.Files:
                tmpFile.destinationSE = siteMapper.resolveNucleus(tmpFile.destinationSE)
        except Exception:
            pass
        siteMapperCache.releaseObj()
    for file in job.Files:
        if file.type == 'input':
            # jumbo jobs may list the same input LFN many times; keep one
            if EventServiceUtils.isJumboJob(job) and file.lfn in inLFNset:
                pass
            else:
                inLFNset.add(file.lfn)
                if strIFiles != '':
                    strIFiles += ','
                strIFiles += file.lfn
                if strDispatch != '':
                    strDispatch += ','
                strDispatch += file.dispatchDBlock
                if strDisToken != '':
                    strDisToken += ','
                strDisToken += file.dispatchDBlockToken
                strProdDBlock += '%s,' % file.prodDBlock
                if not isEventServiceMerge:
                    strProdToken += '%s,' % file.prodDBlockToken
                else:
                    # for ES merge the token comes from the job metadata;
                    # NOTE(review): job.metadata[1] looks like a LFN->token
                    # map here — confirm against the JobSpec producer
                    strProdToken += '%s,' % job.metadata[1][file.lfn]
                if strGUID != '':
                    strGUID += ','
                strGUID += file.GUID
                strRealDatasetIn += '%s,' % file.dataset
                strFSize += '%s,' % file.fsize
                if file.checksum not in ['', 'NULL', None]:
                    strCheckSum += '%s,' % file.checksum
                else:
                    strCheckSum += '%s,' % file.md5sum
                strScopeIn += '%s,' % file.scope
                ddmEndPointIn.append(
                    self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken,
                                        'input', job.prodSourceLabel,
                                        job.job_label))
                if file.dataset not in inDsLfnMap:
                    inDsLfnMap[file.dataset] = []
                inDsLfnMap[file.dataset].append(file.lfn)
        if file.type == 'output' or file.type == 'log':
            if strOFiles != '':
                strOFiles += ','
            strOFiles += file.lfn
            if strDestination != '':
                strDestination += ','
            strDestination += file.destinationDBlock
            if strRealDataset != '':
                strRealDataset += ','
            strRealDataset += file.dataset
            strFileDestinationSE += '%s,' % file.destinationSE
            if file.type == 'log':
                # the (single) log file is reported via dedicated fields
                logFile = file.lfn
                logGUID = file.GUID
                strScopeLog = file.scope
            else:
                strScopeOut += '%s,' % file.scope
            if strDestToken != '':
                strDestToken += ','
            # only the first token is used; 'ddd:' is mapped to 'dst:'
            strDestToken += re.sub(
                '^ddd:', 'dst:',
                file.destinationDBlockToken.split(',')[0])
            strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
            strProdTokenForOutput += '%s,' % file.prodDBlockToken
            ddmEndPointOut.append(
                self.getDdmEndpoint(
                    siteSpec,
                    file.destinationDBlockToken.split(',')[0],
                    'output', job.prodSourceLabel, job.job_label))
            if file.isAllowedNoOutput():
                noOutput.append(file.lfn)
    # inFiles
    self.data['inFiles'] = strIFiles
    # dispatch DBlock
    self.data['dispatchDblock'] = strDispatch
    # dispatch DBlock space token
    self.data['dispatchDBlockToken'] = strDisToken
    # dispatch DBlock space token for output
    self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
    # outFiles
    self.data['outFiles'] = strOFiles
    # destination DBlock
    self.data['destinationDblock'] = strDestination
    # destination DBlock space token
    self.data['destinationDBlockToken'] = strDestToken
    # prod DBlocks
    self.data['prodDBlocks'] = strProdDBlock[:-1]
    # prod DBlock space token
    self.data['prodDBlockToken'] = strProdToken[:-1]
    # real output datasets
    self.data['realDatasets'] = strRealDataset
    # real output datasets
    self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
    # file's destinationSE
    self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
    # log filename
    self.data['logFile'] = logFile
    # log GUID
    self.data['logGUID'] = logGUID
    # jobPars
    self.data['jobPars'], ppSteps = job.extractMultiStepExec()
    if ppSteps is not None:
        self.data.update(ppSteps)
    if job.to_encode_job_params():
        self.data['jobPars'] = base64.b64encode(
            self.data['jobPars'].encode()).decode()
    # attempt number
    self.data['attemptNr'] = job.attemptNr
    # GUIDs
    self.data['GUID'] = strGUID
    # checksum
    self.data['checksum'] = strCheckSum[:-1]
    # fsize
    self.data['fsize'] = strFSize[:-1]
    # scope
    self.data['scopeIn'] = strScopeIn[:-1]
    self.data['scopeOut'] = strScopeOut[:-1]
    self.data['scopeLog'] = strScopeLog
    # DDM endpoints; a None entry makes join raise TypeError -> empty string
    try:
        self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
    except TypeError:
        self.data['ddmEndPointIn'] = ''
    try:
        self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
    except TypeError:
        self.data['ddmEndPointOut'] = ''
    # destinationSE
    self.data['destinationSE'] = job.destinationSE
    # user ID
    self.data['prodUserID'] = job.prodUserID
    # CPU count
    self.data['maxCpuCount'] = job.maxCpuCount
    # RAM count
    self.data['minRamCount'] = job.minRamCount
    # disk count
    self.data['maxDiskCount'] = job.maxDiskCount
    # cmtconfig
    if ppSteps is None:
        self.data['cmtConfig'] = job.cmtConfig
    else:
        self.data['cmtConfig'] = ''
    # processingType
    self.data['processingType'] = job.processingType
    # transferType
    self.data['transferType'] = job.transferType
    # sourceSite
    self.data['sourceSite'] = job.sourceSite
    # current priority
    self.data['currentPriority'] = job.currentPriority
    # taskID
    if job.lockedby == 'jedi':
        self.data['taskID'] = job.jediTaskID
    else:
        self.data['taskID'] = job.taskID
    # core count
    if job.coreCount in ['NULL', None]:
        self.data['coreCount'] = 1
    else:
        self.data['coreCount'] = job.coreCount
    # jobsetID
    self.data['jobsetID'] = job.jobsetID
    # nucleus
    self.data['nucleus'] = job.nucleus
    # walltime
    self.data['maxWalltime'] = job.maxWalltime
    # looping check
    if job.is_no_looping_check():
        self.data['loopingCheck'] = False
    # debug mode
    if job.specialHandling is not None and 'debug' in job.specialHandling:
        self.data['debug'] = 'True'
    # event service or job cloning
    if EventServiceUtils.isJobCloningJob(job):
        self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
    elif EventServiceUtils.isEventServiceJob(job) or EventServiceUtils.isJumboJob(job):
        self.data['eventService'] = 'True'
        # prod DBlock space token for pre-merging output
        self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
    # event service merge
    if isEventServiceMerge:
        self.data['eventServiceMerge'] = 'True'
        # write to file for ES merge: 'inputFor_<out>:<in1>,<in2>^...'
        writeToFileStr = ''
        try:
            for outputName in job.metadata[0]:
                inputList = job.metadata[0][outputName]
                writeToFileStr += 'inputFor_{0}:'.format(outputName)
                for tmpInput in inputList:
                    writeToFileStr += '{0},'.format(tmpInput)
                writeToFileStr = writeToFileStr[:-1]
                writeToFileStr += '^'
            writeToFileStr = writeToFileStr[:-1]
        except Exception:
            pass
        self.data['writeToFile'] = writeToFileStr
    elif job.writeInputToFile():
        try:
            # write input to file: 'tmpin_<ds>:<lfn1>,<lfn2>^...'
            writeToFileStr = ''
            for inDS in inDsLfnMap:
                inputList = inDsLfnMap[inDS]
                inDS = re.sub('/$', '', inDS)
                inDS = inDS.split(':')[-1]
                writeToFileStr += 'tmpin_{0}:'.format(inDS)
                writeToFileStr += ','.join(inputList)
                writeToFileStr += '^'
            writeToFileStr = writeToFileStr[:-1]
            self.data['writeToFile'] = writeToFileStr
        except Exception:
            pass
    # replace placeholder in jobPars with the actual input list
    if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob(job):
        try:
            for inDS in inDsLfnMap:
                inputList = inDsLfnMap[inDS]
                inDS = re.sub('/$', '', inDS)
                inDS = inDS.split(':')[-1]
                srcStr = 'tmpin__cnt_{0}'.format(inDS)
                dstStr = ','.join(inputList)
                self.data['jobPars'] = self.data['jobPars'].replace(srcStr, dstStr)
        except Exception:
            pass
    # no output
    if noOutput != []:
        self.data['allowNoOutput'] = ','.join(noOutput)
    # alternative stage-out
    if job.getAltStgOut() is not None:
        self.data['altStageOut'] = job.getAltStgOut()
    # log to OS
    if job.putLogToOS():
        self.data['putLogToOS'] = 'True'
    # suppress execute string conversion
    if job.noExecStrCnv():
        self.data['noExecStrCnv'] = 'True'
    # in-file positional event number
    if job.inFilePosEvtNum():
        self.data['inFilePosEvtNum'] = 'True'
    # use prefetcher
    if job.usePrefetcher():
        self.data['usePrefetcher'] = 'True'
    # image name
    if job.container_name not in ['NULL', None]:
        self.data['container_name'] = job.container_name
    # IO
    self.data['ioIntensity'] = job.get_task_attribute('ioIntensity')
    self.data['ioIntensityUnit'] = job.get_task_attribute('ioIntensityUnit')
    # HPO
    if job.is_hpo_workflow():
        self.data['isHPO'] = 'True'
    # VP
    if siteSpec is not None:
        scope_input, scope_output = DataServiceUtils.select_scope(
            siteSpec, job.prodSourceLabel, job.job_label)
        if siteSpec.use_vp(scope_input):
            self.data['useVP'] = 'True'