def run(self):
    """Finalize file-transfer status for a dataset or a single job.

    Two modes, selected on whether self.job was supplied:
      * self.job is None: callback/dataset mode — update the dataset's
        transfer bitmap in the DB and, once every required token has
        reported (bitmap complete), collect the affected PandaIDs.
      * self.job given: operate on that single job directly.
    For each resulting job in 'transferring' state whose output/log files
    are all 'ready', a POOLFILECATALOG XML file is written under
    panda_config.logdir for downstream processing to pick up.
    """
    # start
    try:
        byCallback = False
        if self.job == None:
            # callback/dataset mode
            byCallback = True
            _logger.debug("start: %s" % self.dataset.name)
            _logger.debug("callback from %s" % self.site)
            # FIXME when callback from BNLPANDA disappeared
            if self.site == 'BNLPANDA':
                self.site = 'BNL-OSG2_ATLASMCDISK'
            # instantiate site mapper
            siteMapper = SiteMapper(self.taskBuffer)
            # get computingSite/destinationSE
            computingSite, destinationSE = self.taskBuffer.getDestSE(
                self.dataset.name)
            if destinationSE == None:
                # try to get computingSite/destinationSE from ARCH to delete sub
                # even if no active jobs left
                computingSite, destinationSE = self.taskBuffer.getDestSE(
                    self.dataset.name, True)
                if destinationSE == None:
                    _logger.error("cannot get source/destination for %s" % self.dataset.name)
                    _logger.debug("end: %s" % self.dataset.name)
                    return
            _logger.debug("src: %s" % computingSite)
            _logger.debug("dst: %s" % destinationSE)
            # get corresponding token
            tmpSrcSiteSpec = siteMapper.getSite(computingSite)
            tmpDstSiteSpec = siteMapper.getSite(destinationSE)
            _logger.debug(tmpDstSiteSpec.setokens)
            destToken = None
            for tmpToken, tmpDdmId in tmpDstSiteSpec.setokens.iteritems():
                if self.site == tmpDdmId:
                    destToken = tmpToken
                    break
            _logger.debug("use Token=%s" % destToken)
            # get required tokens
            reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
            if reqTokens == None:
                _logger.error("cannot get required token for %s" % self.dataset.name)
                _logger.debug("end: %s" % self.dataset.name)
                return
            _logger.debug("req Token=%s" % reqTokens)
            # make bitmap for the token
            # each required token owns one bit; the callback's token selects
            # which bit to set. NOTE(review): if destToken is not among the
            # required tokens the loop shifts past the last bit — confirm
            # that is intended.
            bitMap = 1
            if len(reqTokens.split(',')) > 1:
                for tmpReqToken in reqTokens.split(','):
                    if tmpReqToken == destToken:
                        break
                    # shift one bit
                    bitMap <<= 1
            # completed bitmap
            compBitMap = (1 << len(reqTokens.split(','))) - 1
            # ignore the lowest bit for T1, file on DISK is already there
            if tmpSrcSiteSpec.ddm == tmpDstSiteSpec.ddm:
                compBitMap = compBitMap & 0xFFFE
            # update bitmap in DB
            updatedBitMap = self.taskBuffer.updateTransferStatus(
                self.dataset.name,
                bitMap)
            _logger.debug(
                "transfer status:%s - comp:%s - bit:%s" %
                (hex(updatedBitMap), hex(compBitMap), hex(bitMap)))
            # update output files
            if (updatedBitMap & compBitMap) == compBitMap:
                # all required tokens reported: collect jobs to finish
                ids = self.taskBuffer.updateOutFilesReturnPandaIDs(
                    self.dataset.name)
                # set flag for T2 cleanup
                self.dataset.status = 'cleanup'
                self.taskBuffer.updateDatasets([self.dataset])
            else:
                # some token has not reported yet; nothing more to do
                _logger.debug("end: %s" % self.dataset.name)
                return
        else:
            _logger.debug("start: %s" % self.job.PandaID)
            # update input files
            ids = [self.job.PandaID]
        _logger.debug("IDs: %s" % ids)
        if len(ids) != 0:
            # get job
            if self.job == None:
                jobs = self.taskBuffer.peekJobs(ids,
                                                fromDefined=False,
                                                fromArchived=False,
                                                fromWaiting=False)
            else:
                jobs = [self.job]
            # loop over all jobs
            for job in jobs:
                if job == None:
                    continue
                _logger.debug("Job: %s" % job.PandaID)
                if job.jobStatus == 'transferring':
                    jobReady = True
                    failedFiles = []
                    noOutFiles = []
                    # check file status
                    # (legacy py2 style: 'file' shadows the builtin)
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            if file.status == 'failed':
                                failedFiles.append(file.lfn)
                            elif file.status == 'nooutput':
                                noOutFiles.append(file.lfn)
                            elif file.status != 'ready':
                                # at least one file still in flight; give up on
                                # this job for now
                                _logger.debug(
                                    "Job: %s file:%s %s != ready" %
                                    (job.PandaID, file.lfn, file.status))
                                jobReady = False
                                break
                    # finish job
                    if jobReady:
                        if byCallback:
                            _logger.debug("Job: %s all files ready" % job.PandaID)
                        else:
                            _logger.debug(
                                "Job: %s all files checked with catalog" % job.PandaID)
                        # create XML
                        try:
                            import xml.dom.minidom
                            dom = xml.dom.minidom.getDOMImplementation()
                            doc = dom.createDocument(None, 'xml', None)
                            topNode = doc.createElement("POOLFILECATALOG")
                            for file in job.Files:
                                if file.type in ['output', 'log']:
                                    # skip failed or no-output files
                                    if file.lfn in failedFiles + noOutFiles:
                                        continue
                                    # File
                                    fileNode = doc.createElement("File")
                                    fileNode.setAttribute("ID", file.GUID)
                                    # LFN
                                    logNode = doc.createElement("logical")
                                    lfnNode = doc.createElement("lfn")
                                    lfnNode.setAttribute('name', file.lfn)
                                    # metadata
                                    fsizeNode = doc.createElement(
                                        "metadata")
                                    fsizeNode.setAttribute(
                                        "att_name", "fsize")
                                    fsizeNode.setAttribute(
                                        "att_value", str(file.fsize))
                                    # checksum: 'ad:' prefix marks adler32,
                                    # otherwise an 'md5:' prefix is stripped
                                    if file.checksum.startswith('ad:'):
                                        # adler32
                                        chksumNode = doc.createElement(
                                            "metadata")
                                        chksumNode.setAttribute(
                                            "att_name", "adler32")
                                        chksumNode.setAttribute(
                                            "att_value",
                                            re.sub('^ad:', '', file.checksum))
                                    else:
                                        # md5sum
                                        chksumNode = doc.createElement(
                                            "metadata")
                                        chksumNode.setAttribute(
                                            "att_name", "md5sum")
                                        chksumNode.setAttribute(
                                            "att_value",
                                            re.sub('^md5:', '', file.checksum))
                                    # append nodes
                                    logNode.appendChild(lfnNode)
                                    fileNode.appendChild(logNode)
                                    fileNode.appendChild(fsizeNode)
                                    fileNode.appendChild(chksumNode)
                                    topNode.appendChild(fileNode)
                            # status in file name
                            if failedFiles == []:
                                statusFileName = 'finished'
                            else:
                                statusFileName = 'failed'
                            # write to file (uuidgen suffix keeps names unique)
                            xmlFile = '%s/%s_%s_%s' % (
                                panda_config.logdir, job.PandaID,
                                statusFileName, commands.getoutput('uuidgen'))
                            oXML = open(xmlFile, "w")
                            oXML.write(topNode.toxml())
                            oXML.close()
                        except:
                            # best-effort: XML failure is logged, job loop continues
                            # (legacy py2 style: 'type' shadows the builtin)
                            type, value, traceBack = sys.exc_info()
                            _logger.error("%s : %s %s" % (job.PandaID, type, value))
                _logger.debug("Job: %s status: %s" % (job.PandaID, job.jobStatus))
        # end
        if self.job == None:
            _logger.debug("end: %s" % self.dataset.name)
        else:
            _logger.debug("end: %s" % self.job.PandaID)
    except:
        # top-level guard: never let the worker die on an unexpected error
        type, value, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (type, value))
xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?> <!-- ATLAS file meta-data catalog --> <!DOCTYPE POOLFILECATALOG SYSTEM "InMemory"> <POOLFILECATALOG> """ try: att = sys.argv[2] except: att = job.attemptNr if job.computingSite in ['',None,'NULL']: print 'computingSite is not yet defined' sys.exit(0) siteSpec = siteMapper.getSite(job.computingSite) for file in job.Files: if file.type in ['output','log']: file.GUID = commands.getoutput('uuidgen') if job.computingSite == file.destinationSE and \ siteSpec.setokens.has_key(file.destinationDBlockToken): tmpSrcDDM = siteSpec.setokens[file.destinationDBlockToken] else: tmpSrcDDM = siteMapper.getSite(job.computingSite).ddm srm = TiersOfATLAS.getSiteProperty(tmpSrcDDM,'srm') srm = re.sub('^token:[^:]+:','',srm) xml += """ <File ID="%s"> <logical> <lfn name="%s"/>
# NOTE(review): fragment of a larger routine (enclosing def not visible in
# this excerpt); wanMX, siteMapper and tmpLog come from the surrounding scope.
throttleForSink = {}        # sinkSite -> True when inbound WAN flow exceeds the site limit
throttleForSource = {}      # sourceSite -> throttle decision (filled after this fragment)
totalFlowFromSource = {}    # sourceSite -> outbound flow summed over all sinks
# loop over all sources to get total flows
tmpLog.debug(" >>> checking limits")
for sinkSite, sinkMap in wanMX.iteritems():
    totalFlowToSink = 0
    # loop over all sinks
    for sourceSite, sourceMap in sinkMap.iteritems():
        # get total flows
        totalFlowToSink += sourceMap['flow']
        if not totalFlowFromSource.has_key(sourceSite):
            totalFlowFromSource[sourceSite] = 0
        totalFlowFromSource[sourceSite] += sourceMap['flow']
    # check limit for sink
    tmpSiteSpec = siteMapper.getSite(sinkSite)
    # wansinklimit appears to be in Gbps while flows are in bps (per the log
    # messages below), hence the *1024^3 conversion
    if siteMapper.checkSite( sinkSite ) and tmpSiteSpec.wansinklimit * 1024 * 1024 * 1024 > totalFlowToSink:
        throttleForSink[sinkSite] = False
        tmpLog.debug(
            " release Sink {0} : {1}bps (total) < {2}Gbps (limit)".format(
                sinkSite, totalFlowToSink, tmpSiteSpec.wansinklimit))
    else:
        throttleForSink[sinkSite] = True
        tmpLog.debug(
            " throttle Sink {0} : {1}bps (total) > {2}Gbps (limit)".format(
                sinkSite, totalFlowToSink, tmpSiteSpec.wansinklimit))
# check limit for source
for sourceSite, totalFlow in totalFlowFromSource.iteritems():
    tmpSiteSpec = siteMapper.getSite(sourceSite)
    # NOTE(review): the excerpt ends here; the per-source limit check
    # continues beyond this fragment.
# NOTE(review): fragment from inside a loop over (user, jobDef, site)
# combinations — the enclosing def/for, plus varMap, sql, iComb, nComb,
# recentRuntimeLimit, normalTimeLimit, maxModificationTime and taskBuffer,
# are defined outside this excerpt.
varMap[':computingSite'] = computingSite
varMap[':prodUserName'] = prodUserName
varMap[':jobDefinitionID'] = jobDefinitionID
varMap[':modificationTime'] = recentRuntimeLimit
varMap[':jobStatus1'] = 'starting'
_logger.debug(" rebro:%s/%s:ID=%s:%s jediTaskID=%s site=%s" % (iComb, nComb, jobDefinitionID,
                                                               prodUserName, jediTaskID,
                                                               computingSite))
iComb += 1
hasRecentJobs = False
# check site
if not siteMapper.checkSite(computingSite):
    _logger.debug(" -> skip unknown site=%s" % computingSite)
    continue
# check site status
tmpSiteStatus = siteMapper.getSite(computingSite).status
if not tmpSiteStatus in ['offline','test']:
    # use normal time limit for nornal site status
    if maxModificationTime > normalTimeLimit:
        _logger.debug(" -> skip wait for normal timelimit=%s<maxModTime=%s" % (normalTimeLimit,maxModificationTime))
        continue
# look for recent jobs in both active and archived tables
for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
    retU,resU = taskBuffer.querySQLS(sql % tableName, varMap)
    if resU == None:
        # database error
        raise RuntimeError,"failed to check modTime"
    if resU != []:
        # found recent jobs
        hasRecentJobs = True
        _logger.debug(" -> skip %s ran recently at %s" % (resU[0][0],resU[0][1]))
        break
# NOTE(review): fragment from inside a loop over (user, jobDef, site)
# combinations — a variant of the sibling fragment that also binds
# :jediTaskID and omits :jobStatus1. The enclosing def/for, plus varMap,
# sql, iComb, nComb, recentRuntimeLimit, normalTimeLimit,
# maxModificationTime and taskBuffer, are defined outside this excerpt.
varMap[':jediTaskID'] = jediTaskID
varMap[':computingSite'] = computingSite
varMap[':prodUserName'] = prodUserName
varMap[':jobDefinitionID'] = jobDefinitionID
varMap[':modificationTime'] = recentRuntimeLimit
_logger.debug(" rebro:%s/%s:ID=%s:%s jediTaskID=%s site=%s" % (iComb, nComb, jobDefinitionID,
                                                               prodUserName, jediTaskID,
                                                               computingSite))
iComb += 1
hasRecentJobs = False
# check site
if not siteMapper.checkSite(computingSite):
    _logger.debug(" -> skip unknown site=%s" % computingSite)
    continue
# check site status
tmpSiteStatus = siteMapper.getSite(computingSite).status
if not tmpSiteStatus in ['offline','test']:
    # use normal time limit for nornal site status
    if maxModificationTime > normalTimeLimit:
        _logger.debug(" -> skip wait for normal timelimit=%s<maxModTime=%s" % (normalTimeLimit,maxModificationTime))
        continue
# look for recent jobs in both active and archived tables
for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
    retU,resU = taskBuffer.querySQLS(sql % tableName, varMap)
    if resU == None:
        # database error
        raise RuntimeError,"failed to check modTime"
    if resU != []:
        # found recent jobs
        hasRecentJobs = True
        _logger.debug(" -> skip %s ran recently at %s" % (resU[0][0],resU[0][1]))
        break
def run(self):
    """Re-brokerage of an analysis job: pick a new site and reassign.

    Flow: fetch the job for self.rPandaID, run a long series of guard
    checks (output container, processingType, jobsetID, metadata flags such
    as --disableRebrokerage/--site/--libDS/--workingGroup, rebrokerage
    frequency, shared libDS usage), collect dataset replica locations,
    build the list of candidate ANALY_ sites, run the brokerage on a dummy
    JobSpec, and finally call prepareJob/runSetUpper for the chosen site.
    Every early-exit path logs the reason and returns.
    """
    try:
        # get job
        tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID])
        if tmpJobs == [] or tmpJobs[0] == None:
            _logger.debug("cannot find job for PandaID=%s" % self.rPandaID)
            return
        self.job = tmpJobs[0]
        _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,self.job.prodUserName,self.job.computingSite))
        # using output container
        if not self.job.destinationDBlock.endswith('/'):
            _logger.debug("%s ouput dataset container is required" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # FIXEME : dont' touch group jobs for now
        if self.job.destinationDBlock.startswith('group') and (not self.userRequest):
            _logger.debug("%s skip group jobs" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check processingType
        typesForRebro = ['pathena','prun','ganga','ganga-rbtest']
        if not self.job.processingType in typesForRebro:
            _logger.debug("%s skip processingType=%s not in %s" % \
                          (self.token,self.job.processingType,str(typesForRebro)))
            _logger.debug("%s end" % self.token)
            return
        # check jobsetID
        if self.job.jobsetID in [0,'NULL',None]:
            _logger.debug("%s jobsetID is undefined" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check metadata
        if self.job.metadata in [None,'NULL']:
            _logger.debug("%s metadata is unavailable" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --disableRebrokerage
        match = re.search("--disableRebrokerage",self.job.metadata)
        if match != None and (not self.simulation) and (not self.forceOpt) \
               and (not self.userRequest):
            _logger.debug("%s diabled rebrokerage" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --site
        match = re.search("--site",self.job.metadata)
        if match != None and (not self.simulation) and (not self.forceOpt) \
               and (not self.userRequest):
            _logger.debug("%s --site is used" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --libDS
        match = re.search("--libDS",self.job.metadata)
        if match != None:
            _logger.debug("%s --libDS is used" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check --workingGroup since it is site-specific
        match = re.search("--workingGroup",self.job.metadata)
        if match != None:
            _logger.debug("%s workingGroup is specified" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # avoid too many rebrokerage
        if not self.checkRev():
            _logger.debug("%s avoid too many rebrokerage" % self.token)
            _logger.debug("%s end" % self.token)
            return
        # check if multiple JobIDs use the same libDS
        if self.bPandaID != None and self.buildStatus not in ['finished','failed']:
            if self.minPandaIDlibDS == None or self.maxPandaIDlibDS == None:
                _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token)
                _logger.debug("%s end" % self.token)
                return
            tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,self.maxPandaIDlibDS])
            if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] == None or tmpPandaIDsForLibDS[1] == None:
                _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token)
                _logger.debug("%s end" % self.token)
                return
            # check
            if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID:
                _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID,
                                                                                self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID,
                                                                                self.maxPandaIDlibDS))
                _logger.debug("%s end" % self.token)
                return
        # check excludedSite
        if self.excludedSite == None:
            self.excludedSite = []
            # parse "--excludedSite[ =]value" out of the job metadata
            match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
            if match != None:
                self.excludedSite = match.group(3).split(',')
            # remove empty
            try:
                self.excludedSite.remove('')
            except:
                pass
        _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite)))
        # check cloud
        if self.cloud == None:
            match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
            if match != None:
                self.cloud = match.group(3)
        _logger.debug("%s cloud=%s" % (self.token,self.cloud))
        # get inDS/LFNs
        status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,self.userName)
        if not status:
            # failed
            _logger.error("%s failed to get inDS/LFN from DB" % self.token)
            return
        status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS)
        if not status:
            # failed
            _logger.error("%s failed" % self.token)
            return
        # get relicas
        replicaMap = {}
        unknownSites = {}
        for tmpDS in inputDS:
            if tmpDS.endswith('/'):
                # container
                status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS)
            else:
                # normal dataset
                status,tmpRepMap = self.getListDatasetReplicas(tmpDS)
                tmpRepMaps = {tmpDS:tmpRepMap}
            if not status:
                # failed
                _logger.debug("%s failed" % self.token)
                return
            # make map per site
            for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
                for tmpSite,tmpStat in tmpRepMap.iteritems():
                    # ignore special sites
                    if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']:
                        continue
                    # ignore tape sites
                    if tmpSite.endswith('TAPE'):
                        continue
                    # keep sites with unknown replica info
                    if tmpStat[-1]['found'] == None:
                        if not unknownSites.has_key(tmpDS):
                            unknownSites[tmpDS] = []
                        unknownSites[tmpDS].append(tmpSite)
                    # ignore ToBeDeleted
                    if tmpStat[-1]['archived'] in ['ToBeDeleted',]:
                        continue
                    # change EOS
                    if tmpSite.startswith('CERN-PROD_EOS'):
                        tmpSite = 'CERN-PROD_EOS'
                    # change EOS TMP
                    if tmpSite.startswith('CERN-PROD_TMP'):
                        tmpSite = 'CERN-PROD_TMP'
                    # change DISK to SCRATCHDISK
                    tmpSite = re.sub('_[^_-]+DISK$','',tmpSite)
                    # change PERF-XYZ to SCRATCHDISK
                    tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite)
                    # change PHYS-XYZ to SCRATCHDISK
                    tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite)
                    # patch for BNLPANDA
                    if tmpSite in ['BNLPANDA']:
                        tmpSite = 'BNL-OSG2'
                    # add to map
                    if not replicaMap.has_key(tmpSite):
                        replicaMap[tmpSite] = {}
                    replicaMap[tmpSite][tmpDS] = tmpStat[-1]
        _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap)))
        # refresh replica info in needed
        self.refreshReplicaInfo(unknownSites)
        # instantiate SiteMapper
        siteMapper = SiteMapper(self.taskBuffer)
        # get original DDM
        origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm)
        # check all datasets
        maxDQ2Sites = []
        if inputDS != []:
            # loop over all sites
            for tmpSite,tmpDsVal in replicaMap.iteritems():
                # loop over all datasets
                appendFlag = True
                for tmpOrigDS in inputDS:
                    # check completeness
                    if tmpDsVal.has_key(tmpOrigDS) and tmpDsVal[tmpOrigDS]['found'] != None and \
                           tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']:
                        pass
                    else:
                        appendFlag = False
                # append only sites holding a complete replica of every input DS
                if appendFlag:
                    if not tmpSite in maxDQ2Sites:
                        maxDQ2Sites.append(tmpSite)
        _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites)))
        if inputDS != [] and maxDQ2Sites == []:
            _logger.debug("%s no DQ2 candidate" % self.token)
        else:
            maxPandaSites = []
            # original maxinputsize
            origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize
            # look for Panda siteIDs
            for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems():
                # use ANALY_ only
                if not tmpSiteID.startswith('ANALY_'):
                    continue
                # remove test and local
                if re.search('_test',tmpSiteID,re.I) != None:
                    continue
                if re.search('_local',tmpSiteID,re.I) != None:
                    continue
                # avoid same site
                if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM:
                    continue
                # check DQ2 ID
                if self.cloud in [None,tmpSiteSpec.cloud] \
                       and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []):
                    # excluded sites (patterns, matched with re.search)
                    excludedFlag = False
                    for tmpExcSite in self.excludedSite:
                        if re.search(tmpExcSite,tmpSiteID) != None:
                            excludedFlag = True
                            break
                    if excludedFlag:
                        _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID))
                        continue
                    # use online only
                    if tmpSiteSpec.status != 'online':
                        _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,tmpSiteSpec.status))
                        continue
                    # check maxinputsize
                    if (maxFileSize == None and origMaxInputSize > siteMapper.getSite(tmpSiteID).maxinputsize) or \
                           maxFileSize > siteMapper.getSite(tmpSiteID).maxinputsize:
                        _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID))
                        continue
                    # append
                    if not tmpSiteID in maxPandaSites:
                        maxPandaSites.append(tmpSiteID)
            # choose at most 20 sites randomly to avoid too many lookup
            random.shuffle(maxPandaSites)
            maxPandaSites = maxPandaSites[:20]
            _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites)))
            # no Panda siteIDs
            if maxPandaSites == []:
                _logger.debug("%s no Panda site candidate" % self.token)
            else:
                # set AtlasRelease and cmtConfig to dummy job
                tmpJobForBrokerage = JobSpec()
                if self.job.AtlasRelease in ['NULL',None]:
                    tmpJobForBrokerage.AtlasRelease = ''
                else:
                    tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease
                # use nightlies
                matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage)
                if matchNight != None:
                    tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1)
                # use cache
                else:
                    matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage)
                    if matchCache != None:
                        tmpJobForBrokerage.AtlasRelease = matchCache.group(1).replace('_','-')
                if not self.job.cmtConfig in ['NULL',None]:
                    tmpJobForBrokerage.cmtConfig = self.job.cmtConfig
                # memory size
                if not self.job.minRamCount in ['NULL',None,0]:
                    tmpJobForBrokerage.minRamCount = self.job.minRamCount
                # CPU count
                if not self.job.maxCpuCount in ['NULL',None,0]:
                    tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount
                # run brokerage
                brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,forAnalysis=True,
                                          setScanSiteList=maxPandaSites,trustIS=True,reportLog=True)
                newSiteID = tmpJobForBrokerage.computingSite
                self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag
                _logger.debug("%s runBrokerage - > %s" % (self.token,newSiteID))
                # unknown site
                if not siteMapper.checkSite(newSiteID):
                    _logger.error("%s unknown site" % self.token)
                    _logger.debug("%s failed" % self.token)
                    return
                # get new site spec
                newSiteSpec = siteMapper.getSite(newSiteID)
                # avoid repetition
                if self.getAggName(newSiteSpec.ddm) == origSiteDDM:
                    _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID))
                    _logger.debug("%s end" % self.token)
                    return
                # simulation mode
                if self.simulation:
                    _logger.debug("%s end simulation" % self.token)
                    return
                # prepare jobs
                status = self.prepareJob(newSiteID,newSiteSpec)
                if status:
                    # run SetUpper
                    statusSetUp = self.runSetUpper()
                    if not statusSetUp:
                        _logger.debug("%s runSetUpper failed" % self.token)
                    else:
                        _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID))
                _logger.debug("%s end" % self.token)
    except:
        # top-level guard: log and swallow any unexpected error
        errType,errValue,errTraceBack = sys.exc_info()
        _logger.error("%s run() : %s %s" % (self.token,errType,errValue))
# NOTE(review): fragment of a larger routine (enclosing def not visible in
# this excerpt); wanMX, siteMapper and tmpLog come from the surrounding scope.
throttleForSink = {}        # sinkSite -> True when inbound WAN flow exceeds the site limit
throttleForSource = {}      # sourceSite -> True when outbound WAN flow exceeds the site limit
totalFlowFromSource = {}    # sourceSite -> outbound flow summed over all sinks
# loop over all sources to get total flows
tmpLog.debug(" >>> checking limits")
for sinkSite,sinkMap in wanMX.iteritems():
    totalFlowToSink = 0
    # loop over all sinks
    for sourceSite,sourceMap in sinkMap.iteritems():
        # get total flows
        totalFlowToSink += sourceMap['flow']
        if not totalFlowFromSource.has_key(sourceSite):
            totalFlowFromSource[sourceSite] = 0
        totalFlowFromSource[sourceSite] += sourceMap['flow']
    # check limit for sink
    tmpSiteSpec = siteMapper.getSite(sinkSite)
    # wansinklimit appears to be in Gbps while flows are in bps (per the log
    # messages below), hence the *1024^3 conversion
    if siteMapper.checkSite(sinkSite) and tmpSiteSpec.wansinklimit*1024*1024*1024 > totalFlowToSink:
        throttleForSink[sinkSite] = False
        tmpLog.debug(" release Sink {0} : {1}bps (total) < {2}Gbps (limit)".format(sinkSite,totalFlowToSink,
                                                                                  tmpSiteSpec.wansinklimit))
    else:
        throttleForSink[sinkSite] = True
        tmpLog.debug(" throttle Sink {0} : {1}bps (total) > {2}Gbps (limit)".format(sinkSite,totalFlowToSink,
                                                                                   tmpSiteSpec.wansinklimit))
# check limit for source
for sourceSite,totalFlow in totalFlowFromSource.iteritems():
    tmpSiteSpec = siteMapper.getSite(sourceSite)
    if siteMapper.checkSite(sourceSite) and tmpSiteSpec.wansourcelimit*1024*1024*1024 > totalFlow:
        throttleForSource[sourceSite] = False
        tmpLog.debug(" release Src {0} : {1}bps (total) < {2}Gbps (limit)".format(sourceSite,totalFlow,
                                                                                 tmpSiteSpec.wansourcelimit))
        # NOTE(review): the excerpt ends here; the throttle (else) branch for
        # sources continues beyond this fragment.
def run(self):
    """Finalize file-transfer status for a dataset or a single job.

    Newer variant of the sibling method above: uses the setokens_output /
    ddm_output site attributes instead of setokens / ddm.  Two modes,
    selected on whether self.job was supplied:
      * self.job is None: callback/dataset mode — update the dataset's
        transfer bitmap in the DB and, once every required token has
        reported (bitmap complete), collect the affected PandaIDs.
      * self.job given: operate on that single job directly.
    Jobs in 'transferring' state with all output/log files 'ready' get a
    POOLFILECATALOG XML written under panda_config.logdir.
    """
    # start
    try:
        byCallback = False
        if self.job == None:
            # callback/dataset mode
            byCallback = True
            _logger.debug("start: %s" % self.dataset.name)
            _logger.debug("callback from %s" % self.site)
            # FIXME when callback from BNLPANDA disappeared
            if self.site == 'BNLPANDA':
                self.site = 'BNL-OSG2_ATLASMCDISK'
            # instantiate site mapper
            siteMapper = SiteMapper(self.taskBuffer)
            # get computingSite/destinationSE
            computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name)
            if destinationSE == None:
                # try to get computingSite/destinationSE from ARCH to delete sub
                # even if no active jobs left
                computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name,True)
                if destinationSE == None:
                    _logger.error("cannot get source/destination for %s" % self.dataset.name)
                    _logger.debug("end: %s" % self.dataset.name)
                    return
            _logger.debug("src: %s" % computingSite)
            _logger.debug("dst: %s" % destinationSE)
            # get corresponding token
            tmpSrcSiteSpec = siteMapper.getSite(computingSite)
            tmpDstSiteSpec = siteMapper.getSite(destinationSE)
            _logger.debug(tmpDstSiteSpec.setokens_output)
            destToken = None
            for tmpToken,tmpDdmId in tmpDstSiteSpec.setokens_output.iteritems():
                if self.site == tmpDdmId:
                    destToken = tmpToken
                    break
            _logger.debug("use Token=%s" % destToken)
            # get required tokens
            reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
            if reqTokens == None:
                _logger.error("cannot get required token for %s" % self.dataset.name)
                _logger.debug("end: %s" % self.dataset.name)
                return
            _logger.debug("req Token=%s" % reqTokens)
            # make bitmap for the token
            # each required token owns one bit; the callback's token selects
            # which bit to set. NOTE(review): if destToken is not among the
            # required tokens the loop shifts past the last bit — confirm
            # that is intended.
            bitMap = 1
            if len(reqTokens.split(','))>1:
                for tmpReqToken in reqTokens.split(','):
                    if tmpReqToken == destToken:
                        break
                    # shift one bit
                    bitMap <<= 1
            # completed bitmap
            compBitMap = (1 << len(reqTokens.split(',')))-1
            # ignore the lowest bit for T1, file on DISK is already there
            if tmpSrcSiteSpec.ddm_output == tmpDstSiteSpec.ddm_output:
                compBitMap = compBitMap & 0xFFFE
            # update bitmap in DB
            updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name,bitMap)
            _logger.debug("transfer status:%s - comp:%s - bit:%s" % (hex(updatedBitMap),hex(compBitMap),hex(bitMap)))
            # update output files
            if (updatedBitMap & compBitMap) == compBitMap:
                # all required tokens reported: collect jobs to finish
                ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name)
                # set flag for T2 cleanup
                self.dataset.status = 'cleanup'
                self.taskBuffer.updateDatasets([self.dataset])
            else:
                # some token has not reported yet; nothing more to do
                _logger.debug("end: %s" % self.dataset.name)
                return
        else:
            _logger.debug("start: %s" % self.job.PandaID)
            # update input files
            ids = [self.job.PandaID]
        _logger.debug("IDs: %s" % ids)
        if len(ids) != 0:
            # get job
            if self.job == None:
                jobs = self.taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
            else:
                jobs = [self.job]
            # loop over all jobs
            for job in jobs:
                if job == None:
                    continue
                _logger.debug("Job: %s" % job.PandaID)
                if job.jobStatus == 'transferring':
                    jobReady = True
                    failedFiles = []
                    noOutFiles = []
                    # check file status
                    # (legacy py2 style: 'file' shadows the builtin)
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            if file.status == 'failed':
                                failedFiles.append(file.lfn)
                            elif file.status == 'nooutput':
                                noOutFiles.append(file.lfn)
                            elif file.status != 'ready':
                                # at least one file still in flight; give up
                                # on this job for now
                                _logger.debug("Job: %s file:%s %s != ready" % (job.PandaID,file.lfn,file.status))
                                jobReady = False
                                break
                    # finish job
                    if jobReady:
                        if byCallback:
                            _logger.debug("Job: %s all files ready" % job.PandaID)
                        else:
                            _logger.debug("Job: %s all files checked with catalog" % job.PandaID)
                        # create XML
                        try:
                            import xml.dom.minidom
                            dom = xml.dom.minidom.getDOMImplementation()
                            doc = dom.createDocument(None,'xml',None)
                            topNode = doc.createElement("POOLFILECATALOG")
                            for file in job.Files:
                                if file.type in ['output','log']:
                                    # skip failed or no-output files
                                    if file.lfn in failedFiles+noOutFiles:
                                        continue
                                    # File
                                    fileNode = doc.createElement("File")
                                    fileNode.setAttribute("ID",file.GUID)
                                    # LFN
                                    logNode = doc.createElement("logical")
                                    lfnNode = doc.createElement("lfn")
                                    lfnNode.setAttribute('name',file.lfn)
                                    # metadata
                                    fsizeNode = doc.createElement("metadata")
                                    fsizeNode.setAttribute("att_name","fsize")
                                    fsizeNode.setAttribute("att_value",str(file.fsize))
                                    # checksum: 'ad:' prefix marks adler32,
                                    # otherwise an 'md5:' prefix is stripped
                                    if file.checksum.startswith('ad:'):
                                        # adler32
                                        chksumNode = doc.createElement("metadata")
                                        chksumNode.setAttribute("att_name","adler32")
                                        chksumNode.setAttribute("att_value",re.sub('^ad:','',file.checksum))
                                    else:
                                        # md5sum
                                        chksumNode = doc.createElement("metadata")
                                        chksumNode.setAttribute("att_name","md5sum")
                                        chksumNode.setAttribute("att_value",re.sub('^md5:','',file.checksum))
                                    # append nodes
                                    logNode.appendChild(lfnNode)
                                    fileNode.appendChild(logNode)
                                    fileNode.appendChild(fsizeNode)
                                    fileNode.appendChild(chksumNode)
                                    topNode.appendChild(fileNode)
                            # status in file name
                            if failedFiles == []:
                                statusFileName = 'finished'
                            else:
                                statusFileName = 'failed'
                            # write to file (uuidgen suffix keeps names unique)
                            xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,job.PandaID,statusFileName,commands.getoutput('uuidgen'))
                            oXML = open(xmlFile,"w")
                            oXML.write(topNode.toxml())
                            oXML.close()
                        except:
                            # best-effort: XML failure is logged, job loop continues
                            # (legacy py2 style: 'type' shadows the builtin)
                            type, value, traceBack = sys.exc_info()
                            _logger.error("Job: %s %s %s" % (job.PandaID,type,value))
                _logger.debug("Job: %s status: %s" % (job.PandaID,job.jobStatus))
        # end
        if self.job == None:
            _logger.debug("end: %s" % self.dataset.name)
        else:
            _logger.debug("end: %s" % self.job.PandaID)
    except:
        # top-level guard: never let the worker die on an unexpected error
        type, value, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (type,value))
xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?> <!-- ATLAS file meta-data catalog --> <!DOCTYPE POOLFILECATALOG SYSTEM "InMemory"> <POOLFILECATALOG> """ try: att = sys.argv[2] except: att = job.attemptNr if job.computingSite in ['', None, 'NULL']: print 'computingSite is not yet defined' sys.exit(0) siteSpec = siteMapper.getSite(job.computingSite) for file in job.Files: if file.type in ['output', 'log']: file.GUID = commands.getoutput('uuidgen') if job.computingSite == file.destinationSE and \ siteSpec.setokens_output.has_key(file.destinationDBlockToken): tmpSrcDDM = siteSpec.setokens_output[file.destinationDBlockToken] else: tmpSrcDDM = siteMapper.getSite(job.computingSite).ddm_output srm = TiersOfATLAS.getSiteProperty(tmpSrcDDM, 'srm') srm = re.sub('^token:[^:]+:', '', srm) xml += """ <File ID="%s"> <logical> <lfn name="%s"/>
if startTime < timeLimit: siteJobsMap[siteName]['running'].append(pandaID) # sql to get number of high priority jobs sqlHiJobs = "SELECT count(*) FROM {0}.jobsActive4 ".format( panda_config.schemaPANDA) sqlHiJobs += "WHERE prodSourceLabel=:label AND jobStatus IN (:jobStat1,:jobStat2) " sqlHiJobs += "AND currentPriority>=:prio AND computingSite=:site AND eventService IS NULL " sqlHiJobs += "AND startTime<:timeLimit " # sql to kill job sqlKill = "UPDATE {0}.jobsActive4 ".format(panda_config.schemaPANDA) sqlKill += "SET commandToPilot=:com,supErrorCode=:code,supErrorDiag=:diag " sqlKill += "WHERE PandaID=:pandaID AND jobStatus=:jobStatus " # check all sites for siteName, jobsMap in siteJobsMap.iteritems(): # check jobseed siteSpec = siteMapper.getSite(siteName) # skip ES-only sites if siteSpec.getJobSeed() == 'es': continue # get number of high priority jobs varMap = {} varMap[':label'] = 'managed' varMap[':jobStat1'] = 'activated' varMap[':jobStat2'] = 'starting' varMap[':prio'] = 800 varMap[':timeLimit'] = timeLimit status, res = taskBuffer.querySQLS(sqlHiJobs, varMap) if res != None: nJobs = res[0][0] nJobsToKill = nJobs - len(siteJobsMap[siteName]['killing']) tmpLog.debug(
class Closer:
    """Close the destination datasets of a finished/failed job.

    For every destination dataset block of the job this agent decides a final
    dataset status ('closed', 'tobemerged', 'tobeclosed' or 'doing'), updates
    the dataset records through the TaskBuffer, optionally submits PandaDDM
    transfer jobs, triggers the Activator for downstream jobs, and finally
    starts the Notifier to send the completion email for user jobs.
    """

    # constructor
    def __init__(self,taskBuffer,destinationDBlocks,job,pandaDDM=False,datasetMap={}):
        # NOTE(review): datasetMap uses a mutable default argument; this is
        # safe only while no caller or method mutates the shared default -- confirm
        self.taskBuffer = taskBuffer
        self.destinationDBlocks = destinationDBlocks
        self.job = job
        self.pandaID = job.PandaID
        self.pandaDDM = pandaDDM
        # SiteMapper is instantiated lazily, only when PandaDDM jobs are created
        self.siteMapper = None
        # cache of dataset specs keyed by dataset name, to avoid DB queries
        self.datasetMap = datasetMap

    # to keep backward compatibility
    def start(self):
        # formerly a Thread subclass; now runs synchronously
        self.run()

    def join(self):
        # no-op kept so callers using the old threaded interface still work
        pass

    # main
    def run(self):
        """Process every destination dataset block of the job (see class doc)."""
        try:
            _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
            flagComplete    = True
            ddmJobs         = []
            topUserDsList   = []
            usingMerger     = False
            disableNotifier = False
            firstIndvDS     = True
            finalStatusDS   = []
            for destinationDBlock in self.destinationDBlocks:
                dsList = []
                _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
                # ignore tid datasets
                if re.search('_tid[\d_]+$',destinationDBlock):
                    _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))
                    continue
                # ignore HC datasets
                if re.search('^hc_test\.',destinationDBlock) != None or \
                        re.search('^user\.gangarbt\.',destinationDBlock) != None:
                    if re.search('_sub\d+$',destinationDBlock) == None and \
                            re.search('\.lib$',destinationDBlock) == None:
                        _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))
                        continue
                # query dataset
                if self.datasetMap.has_key(destinationDBlock):
                    dataset = self.datasetMap[destinationDBlock]
                else:
                    dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
                if dataset == None:
                    _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                    flagComplete = False
                    continue
                # skip tobedeleted/tobeclosed
                if dataset.status in ['cleanup','tobeclosed','completed']:
                    _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                    continue
                dsList.append(dataset)
                # sort
                dsList.sort()
                # count number of completed files
                notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                               'status':'unknown'})
                if notFinish < 0:
                    _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                    flagComplete = False
                    continue
                # check if completed
                _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
                if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                    # close non-DQ2 destinationDBlock immediately
                    finalStatus = 'closed'
                elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                    # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                    finalStatus = 'closed'
                elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                        and self.job.processingType != 'usermerge':
                    # merge output files
                    if firstIndvDS:
                        # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                        finalStatus = 'tobemerged'
                        firstIndvDS = False
                    else:
                        finalStatus = 'tobeclosed'
                    # set merging to top dataset
                    usingMerger = True
                    # disable Notifier
                    disableNotifier = True
                elif self.job.produceUnMerge():
                    finalStatus = 'doing'
                else:
                    # set status to 'tobeclosed' to trigger DQ2 closing
                    finalStatus = 'tobeclosed'
                if notFinish==0:
                    _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                    # set status
                    dataset.status = finalStatus
                    # update dataset in DB
                    retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                          criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                    if len(retT) > 0 and retT[0]==1:
                        finalStatusDS += dsList
                        # close user datasets
                        if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                                and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                            # get top-level user dataset
                            topUserDsName = re.sub('_sub\d+$','',dataset.name)
                            # update if it is the first attempt
                            if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                                topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                                if topUserDs != None:
                                    # check status
                                    if topUserDs.status in ['completed','cleanup','tobeclosed',
                                                            'tobemerged','merging']:
                                        _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                    else:
                                        # set status
                                        if self.job.processingType.startswith('gangarobot') or \
                                                self.job.processingType.startswith('hammercloud'):
                                            # not trigger freezing for HC datasets so that files can be appended
                                            topUserDs.status = 'completed'
                                        elif not usingMerger:
                                            topUserDs.status = finalStatus
                                        else:
                                            topUserDs.status = 'merging'
                                        # append to avoid repetition
                                        topUserDsList.append(topUserDsName)
                                        # update DB
                                        retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                 criteriaMap={':crStatus':topUserDs.status})
                                        if len(retTopT) > 0 and retTopT[0]==1:
                                            _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                        else:
                                            _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                            # get parent dataset for merge job
                            if self.job.processingType == 'usermerge':
                                tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                                if tmpMatch == None:
                                    _logger.error('%s failed to extract parentDS' % self.pandaID)
                                else:
                                    unmergedDsName = tmpMatch.group(1)
                                    # update if it is the first attempt
                                    if not unmergedDsName in topUserDsList:
                                        unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                        if unmergedDs == None:
                                            _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                        else:
                                            # check status
                                            if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                                _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                            else:
                                                # set status
                                                unmergedDs.status = finalStatus
                                                # append to avoid repetition
                                                topUserDsList.append(unmergedDsName)
                                                # update DB
                                                retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                         criteriaMap={':crStatus':unmergedDs.status})
                                                if len(retTopT) > 0 and retTopT[0]==1:
                                                    _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                                else:
                                                    _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                        # create PandaDDM transfer jobs for managed production
                        if self.pandaDDM and self.job.prodSourceLabel=='managed':
                            # instantiate SiteMapper
                            if self.siteMapper == None:
                                self.siteMapper = SiteMapper(self.taskBuffer)
                            # get file list for PandaDDM
                            retList = self.taskBuffer.queryFilesWithMap({'destinationDBlock':destinationDBlock})
                            lfnsStr = ''
                            guidStr = ''
                            for tmpFile in retList:
                                if tmpFile.type in ['log','output']:
                                    lfnsStr += '%s,' % tmpFile.lfn
                                    guidStr += '%s,' % tmpFile.GUID
                            if lfnsStr != '':
                                # drop trailing commas from the comma-joined lists
                                guidStr = guidStr[:-1]
                                lfnsStr = lfnsStr[:-1]
                                # create a DDM job
                                ddmjob = JobSpec()
                                ddmjob.jobDefinitionID   = int(time.time()) % 10000
                                ddmjob.jobName           = "%s" % commands.getoutput('uuidgen')
                                ddmjob.transformation    = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr'
                                ddmjob.destinationDBlock = 'testpanda.%s' % ddmjob.jobName
                                ddmjob.computingSite     = "BNL_ATLAS_DDM"
                                ddmjob.destinationSE     = ddmjob.computingSite
                                ddmjob.currentPriority   = 200000
                                ddmjob.prodSourceLabel   = 'ddm'
                                ddmjob.transferType      = 'sub'
                                # append log file
                                fileOL = FileSpec()
                                fileOL.lfn = "%s.job.log.tgz" % ddmjob.jobName
                                fileOL.destinationDBlock = ddmjob.destinationDBlock
                                fileOL.destinationSE     = ddmjob.destinationSE
                                fileOL.dataset           = ddmjob.destinationDBlock
                                fileOL.type = 'log'
                                ddmjob.addFile(fileOL)
                                # make arguments
                                dstDQ2ID = 'BNLPANDA'
                                srcDQ2ID = self.siteMapper.getSite(self.job.computingSite).ddm
                                callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \
                                              (panda_config.pserverhost,panda_config.pserverport,
                                               dataset.vuid,dstDQ2ID)
                                _logger.debug(callBackURL)
                                # set src/dest
                                ddmjob.sourceSite      = srcDQ2ID
                                ddmjob.destinationSite = dstDQ2ID
                                # if src==dst, send callback without ddm job
                                if dstDQ2ID == srcDQ2ID:
                                    comout = commands.getoutput('curl -k %s' % callBackURL)
                                    _logger.debug(comout)
                                else:
                                    # run dq2_cr
                                    callBackURL = urllib.quote(callBackURL)
                                    # get destination dir
                                    destDir = brokerage.broker_util._getDefaultStorage(self.siteMapper.getSite(self.job.computingSite).dq2url)
                                    argStr = "-s %s -r %s --guids %s --lfns %s --callBack %s -d %s/%s %s" % \
                                             (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,callBackURL,destDir,
                                              destinationDBlock,destinationDBlock)
                                    # set job parameters
                                    ddmjob.jobParameters = argStr
                                    _logger.debug('%s pdq2_cr %s' % (self.pandaID,ddmjob.jobParameters))
                                    ddmJobs.append(ddmjob)
                        # start Activator
                        if re.search('_sub\d+$',dataset.name) == None:
                            if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                                # don't trigger Activator for merge jobs
                                pass
                            else:
                                if self.job.jobStatus == 'finished':
                                    aThr = Activator(self.taskBuffer,dataset)
                                    aThr.start()
                                    aThr.join()
                    else:
                        # unset flag since another thread already updated
                        #flagComplete = False
                        pass
                else:
                    # update dataset in DB
                    self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                   criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                    # unset flag
                    flagComplete = False
                # end
                _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
            # start DDM jobs
            if ddmJobs != []:
                self.taskBuffer.storeJobs(ddmJobs,self.job.prodUserID,joinThr=True)
            # change pending jobs to failed
            finalizedFlag = True
            if flagComplete and self.job.prodSourceLabel=='user':
                _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
                finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID,waitLock=True)
                _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
            # update unmerged datasets in JEDI to trigger merging
            if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
                if finalizedFlag:
                    self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
            # start notifier
            _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
            if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
                (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
                self.job.lockedby != 'jedi':
                # don't send email for merge jobs
                if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']:
                    useNotifier = True
                    summaryInfo = {}
                    # check all jobDefIDs in jobsetID
                    if not self.job.jobsetID in [0,None,'NULL']:
                        useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                                self.job.prodUserName)
                        _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                    if useNotifier:
                        _logger.debug('%s start Notifier' % self.pandaID)
                        nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                        nThr.run()
                        _logger.debug('%s end Notifier' % self.pandaID)
            _logger.debug('%s End' % self.pandaID)
        except:
            # NOTE(review): catch-all protects the caller from any failure, but
            # only the exception type/value are logged -- no traceback
            errType,errValue = sys.exc_info()[:2]
            _logger.error("%s %s" % (errType,errValue))

    # check if top dataset
    def isTopLevelDS(self,datasetName):
        """Return True when datasetName carries no _subNNNN suffix."""
        topDS = re.sub('_sub\d+$','',datasetName)
        if topDS == datasetName:
            return True
        return False
# exec status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000) if res == None: _logger.debug("total %s " % res) else: # release high prio jobs sql = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newStatus " sql += "WHERE jobStatus=:oldStatus AND prodSourceLabel IN (:p1) AND lockedBy=:lockedBy " sql += "AND currentPriority>=:prioCutoff AND computingSite=:computingSite " # loop over computing sites for computingSite, in res: # get site spec if not siteMapper.checkSite(computingSite): continue siteSpec = siteMapper.getSite(computingSite) # check if resource fair share is used if siteSpec.useResourceFairShare(): varMap = {} varMap[':newStatus'] = 'activated' varMap[':oldStatus'] = 'throttled' varMap[':p1'] = 'managed' varMap[':lockedBy'] = 'jedi' varMap[':prioCutoff'] = prioCutoff varMap[':computingSite'] = computingSite status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000) # get statistics sql = "SELECT COUNT(*),jobStatus,computingSite,cloud FROM ATLAS_PANDA.jobsActive4 " sql += "WHERE jobStatus IN (:s1,:s2,:s3) AND prodSourceLabel IN (:p1) AND lockedBy=:lockedBy "
# kill only old jobs if startTime < timeLimit: siteJobsMap[siteName]['running'].append(pandaID) # sql to get number of high priority jobs sqlHiJobs = "SELECT count(*) FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA) sqlHiJobs += "WHERE prodSourceLabel=:label AND jobStatus IN (:jobStat1,:jobStat2) " sqlHiJobs += "AND currentPriority>=:prio AND computingSite=:site AND eventService IS NULL " sqlHiJobs += "AND startTime<:timeLimit " # sql to kill job sqlKill = "UPDATE {0}.jobsActive4 ".format(panda_config.schemaPANDA) sqlKill += "SET commandToPilot=:com,supErrorCode=:code,supErrorDiag=:diag " sqlKill += "WHERE PandaID=:pandaID AND jobStatus=:jobStatus " # check all sites for siteName,jobsMap in siteJobsMap.iteritems(): # check jobseed siteSpec = siteMapper.getSite(siteName) # skip ES-only sites if siteSpec.getJobSeed() == 'es': continue # get number of high priority jobs varMap = {} varMap[':label'] = 'managed' varMap[':jobStat1'] = 'activated' varMap[':jobStat2'] = 'starting' varMap[':prio'] = 800 varMap[':timeLimit'] = timeLimit status,res = taskBuffer.querySQLS(sqlHiJobs,varMap) if res != None: nJobs = res[0][0] nJobsToKill = nJobs-len(siteJobsMap[siteName]['killing']) tmpLog.debug("site={0} nHighPrioJobs={1} nRunnigES={2} nKillingES={3} nESToKill={4}".format(siteName,nJobs,