def setupInterface(self):
    """Build one CommandSendInterface per VO/group from jedi_config.ddm.modConfig.

    Each comma-separated entry has the form
    vo:maxSize:moduleName:className[:group[:off]]; malformed entries are skipped.
    """
    for entry in jedi_config.ddm.modConfig.split(','):
        fields = entry.strip().split(':')
        # parse mandatory and optional fields; skip malformed entries
        try:
            vo = fields[0]
            maxSize = int(fields[1])
            moduleName = fields[2]
            className = fields[3]
            # optional group (empty string counts as absent)
            group = fields[4] if len(fields) >= 5 and fields[4] else None
            # a sixth field of 'off' disables the interface
            active = not (len(fields) >= 6 and fields[5] == 'off')
        except Exception:
            # TODO add config error message
            continue
        # add VO interface; disabled entries map to None
        if active:
            voIF = Interaction.CommandSendInterface(vo, maxSize, moduleName,
                                                    className)
            voIF.initialize()
        else:
            voIF = None
        self.interfaceMap[self.get_dict_key(vo, group)] = voIF
def setupInterface(self):
    """Set up a single command-send interface to the JEDI task buffer."""
    iface = Interaction.CommandSendInterface(
        'any',                                # vo
        jedi_config.db.nWorkers,              # max size
        'pandajedi.jedicore.JediTaskBuffer',  # module name
        'JediTaskBuffer')                     # class name
    iface.initialize()
    self.interface = iface
def setupInterface(self):
    """Parse jedi_config.ddm.modConfig and build one interface per VO.

    Each comma-separated entry has the form vo:maxSize:moduleName:className;
    malformed entries are skipped.
    """
    # parse config
    for configStr in jedi_config.ddm.modConfig.split(','):
        configStr = configStr.strip()
        items = configStr.split(':')
        # check format
        try:
            vo = items[0]
            maxSize = int(items[1])
            moduleName = items[2]
            className = items[3]
        except Exception:
            # FIX: was a bare "except:", which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed so only real parse errors skip the entry.
            # TODO add config error message
            continue
        # add VO interface
        voIF = Interaction.CommandSendInterface(vo, maxSize, moduleName, className)
        voIF.initialize()
        self.interfaceMap[vo] = voIF
self.taskSpec.splitRule = tmpStr else: tmpMatch = re.search(valName + '=(-*\d+)', self.taskSpec.splitRule) if tmpMatch == None: # append self.taskSpec.splitRule += ',{0}'.format(tmpStr) else: # replace self.taskSpec.splitRule = re.sub(valName + '=(-*\d+)', tmpStr, self.taskSpec.splitRule) return # get parameters for event service merging def getParamsForEventServiceMerging(self, taskParamMap): # no event service if not self.taskSpec.useEventService(): return None # extract parameters transPath = 'UnDefined' jobParameters = 'UnDefined' if taskParamMap.has_key('esmergeSpec'): if taskParamMap['esmergeSpec'].has_key('transPath'): transPath = taskParamMap['esmergeSpec']['transPath'] if taskParamMap['esmergeSpec'].has_key('jobParameters'): jobParameters = taskParamMap['esmergeSpec']['jobParameters'] # return return '<PANDA_ESMERGE_TRF>' + transPath + '</PANDA_ESMERGE_TRF>' + '<PANDA_ESMERGE_JOBP>' + jobParameters + '</PANDA_ESMERGE_JOBP>' Interaction.installSC(TaskRefinerBase)
from pandajedi.jedicore import Interaction


# base class for task setup
class TaskSetupperBase(object):
    """Common state shared by task-setup implementations."""

    def __init__(self, taskBufferIF, ddmIF):
        # keep references to the task buffer and DDM interfaces
        self.taskBufferIF = taskBufferIF
        self.ddmIF = ddmIF
        self.refresh()

    def refresh(self):
        # re-read the site mapper from the task buffer
        self.siteMapper = self.taskBufferIF.getSiteMapper()


Interaction.installSC(TaskSetupperBase)
# start communication channel import threading thr = threading.Thread(target=self.startImpl) thr.start() # implementation of start() def startImpl(self): try: Interaction.CommandReceiveInterface.start(self) except: errtype, errvalue = sys.exc_info()[:2] self.logger.error('crashed in JediKnight.startImpl() with %s %s' % (errtype.__name__, errvalue)) # parse init params def parseInit(self, par): if isinstance(par, list): return par try: return par.split('|') except: return [par] # sleep to avoid synchronization of loop def randomSleep(self, minVal=0, maxVal=30): time.sleep(random.randint(minVal, maxVal)) # install SCs Interaction.installSC(JediKnight)
# finish tasks when goal is reached tmpLog.info('finish achieved tasks for vo={0} label={1}'.format( vo, prodSourceLabel)) tmpRet = self.taskBufferIF.getAchievedTasks_JEDI( vo, prodSourceLabel, jedi_config.watchdog.waitForAchieved) if tmpRet is None: # failed tmpLog.error('failed to finish') else: for jediTaskID in tmpRet: self.taskBufferIF.sendCommandTaskPanda(jediTaskID, 'JEDI. Goal reached', True, 'finish', comQualifier='soft') tmpLog.info('finished {0} tasks'.format(tmpRet)) # rescue unlocked tasks with picked files tmpLog.info( 'rescue unlocked tasks with picked files for vo={0} label={1}'. format(vo, prodSourceLabel)) tmpRet = self.taskBufferIF.rescueUnLockedTasksWithPicked_JEDI( vo, prodSourceLabel, 60, pid) if tmpRet is None: # failed tmpLog.error('failed to rescue unlocked tasks') else: tmpLog.info('rescue unlocked {0} tasks'.format(tmpRet)) Interaction.installSC(TypicalWatchDogBase)
from pandajedi.jedicore import Interaction


# base class for task brokerge
class TaskBrokerBase(object):
    """Common state shared by task-brokerage implementations."""

    def __init__(self, taskBufferIF, ddmIF):
        # keep references to the task buffer and DDM interfaces
        self.taskBufferIF = taskBufferIF
        self.ddmIF = ddmIF
        self.refresh()

    def refresh(self):
        # re-read the site mapper from the task buffer
        self.siteMapper = self.taskBufferIF.getSiteMapper()


Interaction.installSC(TaskBrokerBase)
def get_unified_sites(self, scan_site_list):
    """Return the de-duplicated unified names of the given (pseudo) sites."""
    return tuple({self.siteMapper.getSite(site).get_unified_name()
                  for site in scan_site_list})

# get list of pseudo sites
def get_pseudo_sites(self, unified_list, scan_site_list):
    """Return the pseudo sites whose unified name appears in unified_list."""
    wanted = set(unified_list)
    return tuple({site for site in scan_site_list
                  if self.siteMapper.getSite(site).get_unified_name() in wanted})

# add pseudo sites to skip
def add_pseudo_sites_to_skip(self, unified_dict, scan_site_list, skipped_dict):
    """Copy skip reasons keyed by unified name onto the matching pseudo sites."""
    for site in scan_site_list:
        unified = self.siteMapper.getSite(site).get_unified_name()
        if unified in unified_dict:
            skipped_dict[site] = unified_dict[unified]
    return skipped_dict

Interaction.installSC(JobBrokerBase)
unified_list = set() for tmpSiteName in scan_site_list: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) unifiedName = tmpSiteSpec.get_unified_name() unified_list.add(unifiedName) return tuple(unified_list) # get list of pseudo sites def get_pseudo_sites(self, unified_list, scan_site_list): unified_list = set(unified_list) pseudo_list = set() for tmpSiteName in scan_site_list: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) if tmpSiteSpec.get_unified_name() in unified_list: pseudo_list.add(tmpSiteName) return tuple(pseudo_list) # add pseudo sites to skip def add_pseudo_sites_to_skip(self, unified_dict, scan_site_list, skipped_dict): for tmpSiteName in scan_site_list: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) if tmpSiteSpec.get_unified_name() in unified_dict: skipped_dict[tmpSiteName] = unified_dict[tmpSiteSpec.get_unified_name()] return skipped_dict Interaction.installSC(JobBrokerBase)
self.maxNumJobs = None self.minPriority = None self.underNqLimit = False self.siteMapper = self.taskBufferIF.getSiteMapper() # set maximum number of jobs to be submitted def setMaxNumJobs(self, maxNumJobs): self.maxNumJobs = maxNumJobs # set min priority of jobs to be submitted def setMinPriority(self, minPriority): self.minPriority = minPriority # check throttle level def mergeThrottled(self, thrLevel): # un-leveled flag if thrLevel in [True, False]: return thrLevel return thrLevel > THR_LEVEL5 # check if lack of jobs def lackOfJobs(self): return self.underNqLimit # not enough jobs are queued def notEnoughJobsQueued(self): self.underNqLimit = True Interaction.installSC(JobThrottlerBase)
from pandajedi.jedicore import Interaction


# base class for task brokerge
class TaskBrokerBase(object):
    """Holds the interfaces and site mapper used by task brokers."""

    def __init__(self, taskBufferIF, ddmIF):
        self.ddmIF = ddmIF
        self.taskBufferIF = taskBufferIF
        # populate the site mapper
        self.refresh()

    def refresh(self):
        """Reload the site mapper from the task buffer."""
        self.siteMapper = self.taskBufferIF.getSiteMapper()


Interaction.installSC(TaskBrokerBase)
thr.start() # implementation of start() def startImpl(self): try: Interaction.CommandReceiveInterface.start(self) except: errtype,errvalue = sys.exc_info()[:2] self.logger.error('crashed in JediKnight.startImpl() with %s %s' % (errtype.__name__,errvalue)) # parse init params def parseInit(self,par): if isinstance(par,list): return par try: return par.split('|') except: return [par] # sleep to avoid synchronization of loop def randomSleep(self,minVal=0,maxVal=30): time.sleep(random.randint(minVal,maxVal)) # install SCs Interaction.installSC(JediKnight)
from pandajedi.jedicore import Interaction


# base class for watchdog
class WatchDogBase(object):
    """Common state shared by watchdog implementations."""

    # constructor
    def __init__(self, taskBufferIF, ddmIF):
        self.ddmIF = ddmIF
        self.taskBufferIF = taskBufferIF
        # populate the site mapper
        self.refresh()

    def refresh(self):
        """Reload the site mapper from the task buffer."""
        self.siteMapper = self.taskBufferIF.getSiteMapper()


Interaction.installSC(WatchDogBase)
from pandajedi.jedicore import Interaction


# base class for task generator
class TaskGeneratorBase(object):
    """Common state shared by task-generator implementations."""

    def __init__(self, taskBufferIF, ddmIF):
        # keep references to the task buffer and DDM interfaces
        self.taskBufferIF = taskBufferIF
        self.ddmIF = ddmIF
        self.refresh()

    def refresh(self):
        # re-read the site mapper from the task buffer
        self.siteMapper = self.taskBufferIF.getSiteMapper()


Interaction.installSC(TaskGeneratorBase)
from pandajedi.jedicore import Interaction


# base class for watchdog
class WatchDogBase(object):
    """Base class providing shared state and hooks for watchdogs."""

    # constructor
    def __init__(self, taskBufferIF, ddmIF):
        self.ddmIF = ddmIF
        self.taskBufferIF = taskBufferIF
        self.refresh()

    # refresh
    def refresh(self):
        """Reload the site mapper from the task buffer."""
        self.siteMapper = self.taskBufferIF.getSiteMapper()

    # pre-action
    def pre_action(self, tmpLog, vo, prodSourceLabel, pid, *args, **kwargs):
        """Hook executed before the main action; no-op by default."""
        pass


Interaction.installSC(WatchDogBase)
maxNumEventRanges=maxNumEventRanges, multiplicity=multiplicity, splitByFields=splitByFields, tmpLog=tmpLog, useDirectIO=useDirectIO, maxDiskSize=maxDiskSize) if subChunk == None: break if subChunk != []: # append subChunks.append(subChunk) iSubChunks += 1 # append to return map if remain if subChunks != []: # get site names for parallel execution if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging: siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(), nSubChunks,[siteName]) returnList.append({'siteName':siteName, 'subChunks':subChunks, 'siteCandidate':siteCandidate, }) tmpLog.debug('split to %s subchunks' % len(subChunks)) # return tmpLog.debug('done') return self.SC_SUCCEEDED,returnList Interaction.installSC(JobSplitter)
if allow_chunk_size_limit and strict_chunkSize and len( subChunks) < nSubChunks: tmpLog.debug( 'skip splitting since chunk size {} is less than chunk size limit {} at {}' .format(len(subChunks), nSubChunks, siteName)) inputChunk.rollback_file_usage() isSkipped = True else: # get site names for parallel execution if taskSpec.getNumSitesPerJob( ) > 1 and not inputChunk.isMerging: siteName = inputChunk.getParallelSites( taskSpec.getNumSitesPerJob(), nSubChunks, [siteName]) returnList.append({ 'siteName': siteName, 'subChunks': subChunks, 'siteCandidate': siteCandidate, }) try: gshare = taskSpec.gshare.replace(' ', '_') except Exception: gshare = None tmpLog.info('split to nJobs=%s at site=%s gshare=%s' % (len(subChunks), siteName, gshare)) # return tmpLog.debug('done') return self.SC_SUCCEEDED, returnList, isSkipped Interaction.installSC(JobSplitter)
# check goal only if checkGoal: # no goal if taskSpec.goal != None and taskCompleteness >= taskGoal: return True return False # return status return status # pre-check def doPreCheck(self, taskSpec, tmpLog): # send task to exhausted if taskSpec.useExhausted() and not taskSpec.status in ['passed'] \ and self.getFinalTaskStatus(taskSpec) in ['finished'] \ and not self.getFinalTaskStatus(taskSpec,checkParent=False) in ['done'] \ and not self.getFinalTaskStatus(taskSpec,checkGoal=True): taskSpec.status = 'exhausted' taskSpec.lockedBy = None taskSpec.lockedTime = None # update task tmpLog.info('set task.status={0}'.format(taskSpec.status)) self.taskBufferIF.updateTask_JEDI( taskSpec, {'jediTaskID': taskSpec.jediTaskID}, updateDEFT=True) # kick child tasks self.taskBufferIF.kickChildTasks_JEDI(taskSpec.jediTaskID) return True return False Interaction.installSC(PostProcessorBase)
from pandajedi.jedicore import Interaction


# base class for job throttle
class JobThrottlerBase(object):
    """Base class holding throttle decisions and submission limits."""

    def __init__(self, taskBufferIF):
        self.taskBufferIF = taskBufferIF
        # canonical return tuples (status code, throttled flag);
        # SC_* attributes are provided by Interaction.installSC below
        self.retTmpError = self.SC_FAILED, True
        self.retThrottled = self.SC_SUCCEEDED, True
        self.retUnThrottled = self.SC_SUCCEEDED, False
        # submission limits (unset by default)
        self.maxNumJobs = None
        self.minPriority = None
        self.refresh()

    # refresh
    def refresh(self):
        """Reload the site mapper from the task buffer."""
        self.siteMapper = self.taskBufferIF.getSiteMapper()

    # set maximum number of jobs to be submitted
    def setMaxNumJobs(self, maxNumJobs):
        self.maxNumJobs = maxNumJobs

    # set min priority of jobs to be submitted
    def setMinPriority(self, minPriority):
        self.minPriority = minPriority


Interaction.installSC(JobThrottlerBase)
# append self.taskSpec.splitRule += ',{0}'.format(tmpStr) else: # replace self.taskSpec.splitRule = re.sub(valName+'=(-*\d+)', tmpStr, self.taskSpec.splitRule) return # get parameters for event service merging def getParamsForEventServiceMerging(self,taskParamMap): # no event service if not self.taskSpec.useEventService(): return None # extract parameters transPath = 'UnDefined' jobParameters = 'UnDefined' if taskParamMap.has_key('esmergeSpec'): if taskParamMap['esmergeSpec'].has_key('transPath'): transPath = taskParamMap['esmergeSpec']['transPath'] if taskParamMap['esmergeSpec'].has_key('jobParameters'): jobParameters = taskParamMap['esmergeSpec']['jobParameters'] # return return '<PANDA_ESMERGE_TRF>'+transPath+'</PANDA_ESMERGE_TRF>'+'<PANDA_ESMERGE_JOBP>'+jobParameters+'</PANDA_ESMERGE_JOBP>' Interaction.installSC(TaskRefinerBase)
from pandajedi.jedicore import Interaction


# base class for task setup
class TaskSetupperBase(object):
    """Holds the interfaces and site mapper used during task setup."""

    def __init__(self, taskBufferIF, ddmIF):
        self.ddmIF = ddmIF
        self.taskBufferIF = taskBufferIF
        # populate the site mapper
        self.refresh()

    def refresh(self):
        """Reload the site mapper from the task buffer."""
        self.siteMapper = self.taskBufferIF.getSiteMapper()


Interaction.installSC(TaskSetupperBase)
except: errType,errValue = sys.exc_info()[:2] if iTry+1 < nTry: # sleep for retry tmpLog.debug("sleep {0} due to {1}:{2}".format(iTry,errType,errValue)) time.sleep(30) else: tmpLog.error("failed to send notification with {0}:{1}".format(errType,errValue)) if fileBackUp: # write to file which is processed in add.py mailFile = '{0}/jmail_{1}_{2}' % (panda_config.logdir,jediTaskID,commands.getoutput('uuidgen')) oMail = open(mailFile,"w") oMail.write(str(jediTaskID)+'\n'+toAdd+'\n'+msgBody) oMail.close() break try: smtplib.stderr = org_smtpstderr except: pass # return email sender def senderAddress(self): return panda_config.emailSender Interaction.installSC(PostProcessorBase)
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Select candidate sites for the task's jobs.

    Filters the cloud's site list through a pipeline of checks (status,
    scratch disk, SE space, walltime, memory, pilot activity), weights the
    survivors by job statistics, attaches up to 5 site candidates to
    inputChunk, and returns (SC_SUCCEEDED, inputChunk) on success or
    (SC_FAILED, inputChunk) when any step leaves no candidates or a lookup
    fails.  NOTE(review): cloudName is unused here; the cloud is taken from
    taskSpec/taskParamMap instead — confirm whether that is intentional.
    """
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return for failure
    retFatal = self.SC_FATAL, inputChunk  # NOTE(review): retFatal is never returned in this method
    retTmpError = self.SC_FAILED, inputChunk
    # set cloud from the task parameters when not already set on the task;
    # best-effort: any lookup/decode failure is ignored
    try:
        if not taskParamMap:
            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                taskSpec.jediTaskID)
            taskParamMap = RefinerUtils.decodeJSON(taskParam)
        if not taskSpec.cloud and 'cloud' in taskParamMap:
            taskSpec.cloud = taskParamMap['cloud']
    except Exception:
        pass
    # get sites in the cloud
    site_preassigned = True
    if taskSpec.site not in ['', None]:
        # site explicitly pre-assigned on the task; taskSpec.site may also be
        # a regexp matched against the cloud's sites
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
        if self.siteMapper.checkSite(taskSpec.site):
            scanSiteList = [taskSpec.site]
        else:
            scanSiteList = []
            for tmpSite in self.siteMapper.getCloud(
                    taskSpec.cloud)['sites']:
                if re.search(taskSpec.site, tmpSite):
                    scanSiteList.append(tmpSite)
            if not scanSiteList:
                tmpLog.error('unknown site={}'.format(taskSpec.site))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                return retTmpError
    elif inputChunk.getPreassignedSite() is not None:
        # site pre-assigned via the master dataset
        scanSiteList = [inputChunk.getPreassignedSite()]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
            inputChunk.getPreassignedSite()))
    else:
        # no pre-assignment: start from every site in the cloud
        site_preassigned = False
        scanSiteList = self.siteMapper.getCloud(taskSpec.cloud)['sites']
    # remove NA
    if 'NA' in scanSiteList:
        scanSiteList.remove('NA')
    tmpLog.debug('cloud=%s has %s candidates' %
                 (taskSpec.cloud, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    ######################################
    # selection for status and PandaSite
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check site status (pre-assigned sites bypass the online requirement)
        if tmpSiteSpec.status != 'online' and not site_preassigned:
            tmpLog.debug(' skip %s due to status=%s' %
                         (tmpSiteName, tmpSiteSpec.status))
            continue
        # check PandaSite when the task parameters pin one
        if 'PandaSite' in taskParamMap and taskParamMap['PandaSite']:
            if tmpSiteSpec.pandasite != taskParamMap['PandaSite']:
                tmpLog.debug(' skip %s due to wrong PandaSite=%s <> %s' %
                             (tmpSiteName, tmpSiteSpec.pandasite,
                              taskParamMap['PandaSite']))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(
        len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for scratch disk
    # required size with input staged locally (converted to MB below)
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize(
    ) + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS // 1024 // 1024
    # size for direct IO sites (input not copied to scratch)
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = taskSpec.getOutDiskSize(
        ) + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR // 1024 // 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; sites without a maxwdir limit always pass
        if tmpSiteSpec.maxwdir:
            if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec,
                                                   inputChunk):
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(
                    ' skip {0} due to small scratch disk={1} < {2}'.format(
                        tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(
        len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for available space in SE
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # free space must be >= 200GB (sites reporting no space value pass)
        diskThreshold = 200
        tmpSpaceSize = tmpSiteSpec.space
        if tmpSiteSpec.space and tmpSpaceSize < diskThreshold:
            tmpLog.debug(
                ' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(
                    tmpSiteName, tmpSiteSpec.space, diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(
        len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # selection for walltime (skipped when the task declares none)
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site: required walltime must fit between the
            # site's min and max limits (0 means no limit)
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(
                    ' skip {0} due to short site walltime={1}(site upper limit) < {2}'
                    .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(
                    ' skip {0} due to short job walltime={1}(site lower limit) > {2}'
                    .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
            len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for memory (skipped for pre-assigned sites or when the
    # chunk declares no RAM requirement)
    origMinRamCount = inputChunk.getMaxRamCount()
    if not site_preassigned and origMinRamCount:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # job memory requirement: per-core values are scaled by the
            # site core count and offset by the task base RAM
            if taskSpec.ramPerCore():
                minRamCount = origMinRamCount * (
                    tmpSiteSpec.coreCount
                    if tmpSiteSpec.coreCount else 1)
                minRamCount += (taskSpec.baseRamCount
                                if taskSpec.baseRamCount else 0)
            else:
                minRamCount = origMinRamCount
            # site max memory requirement
            site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0
            # check at the site
            if site_maxmemory and minRamCount and minRamCount > site_maxmemory:
                tmpMsg = ' skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(
                    tmpSiteName, site_maxmemory, minRamCount)
                tmpLog.debug(tmpMsg)
                continue
            # site min memory requirement
            site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0
            if site_minmemory and minRamCount and minRamCount < site_minmemory:
                tmpMsg = ' skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(
                    tmpSiteName, site_minmemory, minRamCount)
                tmpLog.info(tmpMsg)
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                'updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # NOTE(review): the skip is disabled — the site is only logged,
            # not removed (the "continue" below is commented out)
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
            #continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(
        len(scanSiteList)))
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # sites already used by task (these bypass the candidate-count cap below)
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
        taskSpec.jediTaskID)
    if not tmpSt:
        tmpLog.error('failed to get sites which already used by task')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # get list of available files per input dataset
    availableFileMap = {}
    for datasetSpec in inputChunk.getDatasets():
        try:
            # get list of site to be scanned, de-duplicated by unified name
            tmpLog.debug(
                'getting the list of available files for {0}'.format(
                    datasetSpec.datasetName))
            fileScanSiteList = []
            for tmpPseudoSiteName in scanSiteList:
                tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                tmpSiteName = tmpSiteSpec.get_unified_name()
                if tmpSiteName in fileScanSiteList:
                    continue
                fileScanSiteList.append(tmpSiteName)
            # mapping between sites and input storage endpoints
            siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                fileScanSiteList, self.siteMapper,
                taskSpec.prodSourceLabel, None)
            # disable file lookup for merge jobs
            if inputChunk.isMerging:
                checkCompleteness = False
            else:
                checkCompleteness = True
            # non-master datasets use only complete replicas
            if not datasetSpec.isMaster():
                useCompleteOnly = True
            else:
                useCompleteOnly = False
            # get available files per site/endpoint
            tmpAvFileMap = self.ddmIF.getAvailableFiles(
                datasetSpec,
                siteStorageEP,
                self.siteMapper,
                check_completeness=checkCompleteness,
                file_scan_in_container=False,
                complete_only=useCompleteOnly)
            if tmpAvFileMap is None:
                raise Interaction.JEDITemporaryError(
                    'ddmIF.getAvailableFiles failed')
            availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
        except Exception as e:
            tmpLog.error('failed to get available files with {}'.format(e))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retTmpError
    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
        taskSpec.vo)
    if not tmpSt:
        tmpLog.error('failed to get job statistics with priority')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    preSiteCandidateSpec = None  # NOTE(review): unused in this method
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                               'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                 'activated', None, None)
        # favor sites with many running and few queued jobs
        weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                             1) / float(nAssigned + 1)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        # set weight
        siteCandidateSpec.weight = weight
        # record files already on the site's local disk
        for tmpDatasetName, availableFiles in six.iteritems(
                availableFileMap):
            if tmpSiteName in availableFiles:
                siteCandidateSpec.add_local_disk_files(
                    availableFiles[tmpSiteName]['localdisk'])
        # sites already used by the task are kept unconditionally; others
        # compete via the weight map below
        if tmpSiteName in sitesUsedByTask:
            candidateSpecList.append(siteCandidateSpec)
        else:
            if weight not in weightMap:
                weightMap[weight] = []
            weightMap[weight].append(siteCandidateSpec)
    # limit the number of sites: take highest weights first, shuffling
    # within equal-weight groups
    maxNumSites = 5
    weightList = list(weightMap.keys())
    weightList.sort()
    weightList.reverse()
    for weightVal in weightList:
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites -
                                               len(candidateSpecList))]
    # collect site names
    scanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        scanSiteList.append(siteCandidateSpec.siteName)
    # append candidates to the input chunk
    newScanSiteList = []
    for siteCandidateSpec in candidateSpecList:
        # append
        inputChunk.addSiteCandidate(siteCandidateSpec)
        newScanSiteList.append(siteCandidateSpec.siteName)
        tmpLog.debug(' use {} with weight={} nFiles={}'.format(
            siteCandidateSpec.siteName, siteCandidateSpec.weight,
            len(siteCandidateSpec.localDiskFiles)))
    scanSiteList = newScanSiteList
    if scanSiteList == []:
        tmpLog.error('no candidates')
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk