Example #1
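JEDI task setup (doSetup): registers a task's output/log datasets and containers in DDM, sets replication rules (including an optional second copy of user output on SCRATCHDISK), attaches datasets to their containers, registers a hidden Event Service dataset when requested, and opens the output datasets of production tasks. The outcome is reported through the component's status codes rather than through exceptions.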
    def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        # returns
        retFatal    = self.SC_FATAL
        retTmpError = self.SC_FAILED
        retOK       = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo)
            # register datasets
            if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                    # collect datasetID to register datasets/containers just in case
                    for tmpPandaJob in pandaJobs:
                        if not tmpPandaJob.produceUnMerge():
                            for tmpFileSpec in tmpPandaJob.Files:
                                if tmpFileSpec.type in ['output','log']:
                                    if tmpFileSpec.datasetID not in datasetToRegister:
                                        datasetToRegister.append(tmpFileSpec.datasetID)
                tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap  = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue
                    # DDM backend
                    ddmBackEnd = taskSpec.getDdmBackEnd()
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName is None:
                            continue
                        if targetName not in avDatasetList:
                            # set lifetime
                            if targetName.startswith('panda'):
                                if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                    lifetime = 365
                                else:
                                    lifetime = 14
                            else:
                                lifetime = None
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            if tmpList == []:
                                # get location
                                location = None
                                locForRule = None
                                if targetName == datasetSpec.datasetName:
                                    # dataset
                                    if datasetSpec.site in ['',None]:
                                        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                            locForRule = datasetSpec.destination
                                        elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                            location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                        elif taskSpec.cloud is not None:
                                            # use T1 SE
                                            tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                            location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                    else:
                                        tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken,
                                                                             taskSpec.prodSourceLabel,
                                                                             JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                if locForRule is None:
                                    locForRule = location
                                # set metadata
                                if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                    metaData = {}
                                    metaData['task_id'] = taskSpec.jediTaskID
                                    if taskSpec.campaign not in [None,'']:
                                        metaData['campaign'] = taskSpec.campaign
                                    if datasetSpec.getTransient() is not None:
                                        metaData['transient'] = datasetSpec.getTransient()
                                else:
                                    metaData = None
                                # register dataset/container
                                tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                         location,
                                                                                                                         ddmBackEnd,
                                                                                                                         lifetime,
                                                                                                                         str(metaData)))
                                tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                   lifetime=lifetime,metaData=metaData)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    # register location
                                    tmpToRegister = False
                                    if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                                        if taskSpec.workingGroup:
                                            userName = taskSpec.workingGroup
                                        else:
                                            userName = taskSpec.userName
                                        grouping = None
                                        tmpToRegister = True
                                    elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        userName = None
                                        grouping = 'NONE'
                                        tmpToRegister = True
                                    if tmpToRegister:
                                        activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                        tmpLog.info('registering location={} lifetime={} days activity={} grouping={} '
                                                    'owner={}'.format(locForRule, lifetime, activity, grouping,
                                                                      userName))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                                lifetime=lifetime,backEnd=ddmBackEnd,
                                                                                activity=activity,grouping=grouping)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                                          targetName))
                                            return retFatal
                                        # double copy
                                        if userSetup and datasetSpec.type == 'output':
                                            if datasetSpec.destination != datasetSpec.site:
                                                tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                                datasetSpec.site))
                                            else:

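                                                # make a second replica unless the site's catchall disables it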
                                                second_copy = True
                                                try:
                                                    if taskSpec.site:
                                                        panda_site = siteMapper.getSite(taskSpec.site)
                                                        if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall:
                                                            tmpLog.info('skip making double copy as specified in {0} catchall'.format(taskSpec.site))
                                                            second_copy = False
                                                except Exception:
                                                    second_copy = True

                                                if second_copy:
                                                    locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True'
                                                    tmpMsg  = 'registering double copy '
                                                    tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                                    activity,targetName)
                                                    tmpLog.info(tmpMsg)
                                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                                            lifetime=lifetime,activity=activity,
                                                                                            grouping='NONE',weight='freespace',
                                                                                            ignore_availability=False)
                                                    if not tmpStat:
                                                        tmpLog.error('failed to register double copy location {0} for {1}'.format(
                                                            locForDouble, targetName))
                                                        return retFatal
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                   backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # register ES datasets
            if taskSpec.registerEsFiles():
                targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
                location = None
                metaData = {}
                metaData['task_id'] = taskSpec.jediTaskID
                metaData['hidden']  = True
                tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                           location,
                                                                                           str(metaData)))
                tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                   resurrect=True)
                if not tmpStat:
                    tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                    return retFatal
                # register rule
                location = 'type=DATADISK'
                activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                grouping = 'NONE'
                tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                        activity,
                                                                                        grouping))
                tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                        grouping=grouping)
                if not tmpStat:
                    tmpLog.error('failed to register location {0} with {2} for {1}'.format(location,
                                                                                           targetName,
                                                                                           activity))
                    return retFatal
            # open datasets
            if taskSpec.prodSourceLabel in ['managed','test']:
                # get the list of output/log datasets
                outDatasetList = []
                for tmpPandaJob in pandaJobs:
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if tmpFileSpec.destinationDBlock not in outDatasetList:
                                outDatasetList.append(tmpFileSpec.destinationDBlock)
                # open datasets
                for outDataset in outDatasetList:
                    tmpLog.info('open {0}'.format(outDataset))
                    ddmIF.openDataset(outDataset)
                    # unset lifetime
                    ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
            # return
            tmpLog.info('done')
            return retOK
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
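
All three examples report outcomes through status codes defined on the component (SC_SUCCEEDED, plus SC_FAILED for temporary and SC_FATAL for permanent errors) instead of letting exceptions escape, so the caller decides between retrying and giving up. A minimal standalone sketch of that convention, with hypothetical class, helper, and code values (the real codes come from JEDI itself):

class ComponentBase(object):
    # hypothetical stand-ins for the JEDI status codes
    SC_SUCCEEDED = 0
    SC_FAILED = 1    # temporary error: the caller may retry
    SC_FATAL = 2     # permanent error: the caller gives up

    def doStep(self):
        try:
            if not self.register():       # hypothetical helper doing the real work
                return self.SC_FATAL      # unrecoverable, as with failed registrations above
            return self.SC_SUCCEEDED
        except Exception:
            return self.SC_FAILED

    def register(self):
        return True

Note that doSetup above defines retTmpError but returns retFatal even from its exception handler, so registration problems are always treated as permanent.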
Example #2
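World-cloud task brokerage (runImpl): each worker thread pulls (taskSpec, inputChunk) pairs off a queue and, unless the task already has its nucleus assigned, narrows the nucleus candidates through successive checks (nucleus status, storage-endpoint free space, input data locality, ability to run jobs), weights the survivors by remaining work (RW) and available input data, and draws one nucleus at random in proportion to the weights.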
 def runImpl(self):
     # cutoff for disk: 5 TB (endpoint space values are in GB)
     diskThreshold = 5 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     thrInputSize = 1024*1024*1024
     thrInputNum = 100
     thrInputSizeFrac = 0.1
     thrInputNumFrac = 0.1
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check endpoint
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.debug('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             if tmpSpaceSize < diskThreshold:
                                 tmpLog.debug('  skip nucleus={0} due to disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(
                                     tmpNucleus, tmpSpaceSize, diskThreshold, tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                         availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                         availableData[tmpNucleus]['tot_size'],
                                                                                                                                         thrInputSizeFrac))
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]['ava_num_any'], availableData[tmpNucleus]['tot_num'], thrInputNumFrac))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                         # use any nuclei where input is available if no sites can run jobs
                         tmpRet = tmpSiteList
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # RW
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                     ###################################### 
                     # weight
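                     # prioRW is shared across worker threads, hence acquire/release around every access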
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusWeights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             weight *= float(availableData[tmpNucleus]['ava_size_any'])
                             weight /= float(availableData[tmpNucleus]['tot_size'])
                             wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                             wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                             # negative weight for tape
                             if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                 weight *= negWeightTape
                                 wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                         tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusWeights.append((tmpNucleus,weight))
                     tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusWeights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusWeights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
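
The final selection in this example is a roulette-wheel (weight-proportional) pick: draw a uniform value in [0, totalWeight] and walk the weight list until the draw is used up. A minimal standalone version of the same idea (hypothetical helper, not part of the JEDI code):

import random

def weightedChoice(weightedItems):
    # weightedItems: non-empty list of (item, weight) pairs with non-negative weights
    total = sum(weight for _, weight in weightedItems)
    target = random.uniform(0, total)
    for item, weight in weightedItems:
        target -= weight
        if target <= 0:
            return item
    # guard against floating-point leftovers, as the broker does
    return weightedItems[-1][0]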
Example #3
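Analysis job brokerage (doBrokerage): starting from all analysis sites (plus a pre-assigned site, if any), candidates are dropped stage by stage, by site status, multicore capability, software release/cache, memory, scratch disk, storage-element free space, walltime limits, pilot activity, include/exclude lists and cloud, before the survivors are weighted by where the input data is available. Every rejection is logged with a criteria=-... tag.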
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(
                             taskSpec.jediTaskID,
                             datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get primary site candidates
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(
         None, taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName, tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites
     for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['', None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff', 'test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug(
                 '  skip site=%s due to status=%s criteria=-status' %
                 (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP == 'unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(
             len(scanSiteList), useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search(
                 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList,
                 cmtConfig=taskSpec.architecture,
                 onlyCmtConfig=True)
         else:
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
             transHome = re.sub('_', '-', transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     caches=transHome,
                     cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     releases=transUses,
                     cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList, releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(
             len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
             taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0, None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if site_maxmemory not in [0, None] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug(
                     '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                     .format(tmpSiteName, site_maxmemory, minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0, None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if site_minmemory not in [0, None] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug(
                     '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                     .format(tmpSiteName, site_minmemory, minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize = inputChunk.getMaxAtomSize()
     tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug(
         'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
         .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                 tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
         minDiskCountS, minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                     .format(tmpSiteName, tmpSiteSpec.maxwdir,
                             minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(
             tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug(
                     '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                     .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug(
                     '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                     .format(tmpSiteName, tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug(
                 '  skip site=%s due to no pilot criteria=-nopilot' %
                 tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
             tmpLog.debug(
                 '  skip site={0} excluded criteria=-excluded'.format(
                     tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(
                 tmpSiteName, includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug(
                     '  skip site={0} not included criteria=-notincluded'.
                     format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug(
                     '  skip site={0} limited access criteria=-limitedaccess'
                     .format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
             tmpLog.debug(
                 '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                 format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug(
                 '  skip site={0} not included criteria=-notincluded'.
                 format(tmpSiteName))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug(
                     'getting the list of sites where {0} is available'.
                     format(datasetName))
                 tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                     scanSiteList, self.siteMapper, self.ddmIF, datasetName)
                 if tmpSt in [
                         Interaction.JEDITemporaryError,
                         Interaction.JEDITimeoutError
                 ]:
                     tmpLog.error(
                         'temporary failed to get the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error(
                         'fatal error when getting the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(
                         len(tmpRet), str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(
                                 datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(
                                     datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error(
                     '{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (
                     sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                     tmpDiskSiteList,
                     self.taskBufferIF,
                     self.siteMapper,
                     nSites=50,
                     protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
              for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0, None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(
                         tmpWeightSrcMap['weight']) / wRemote
                 else:
                     dataWeight[tmpSiteName] += float(
                         tmpWeightSrcMap['weight']) / wRemote
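                  # worked example (hypothetical numbers): with a source weight
                  # of 10 and wansinklimit=5, wRemote = 50/5 = 10 and the site
                  # gains 10/10 = 1.0, i.e. remote access counts roughly like
                  # one local disk replica only for well-connected sites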
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][
                     datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                  for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(
                 datasetName, len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                 datasetName, len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug(
                 'data is unavailable locally or remotely at preassigned site {0}'
                 .format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
          tmpLog.error('failed to get the sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(
             taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
         taskSpec.jediTaskID, timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'starting',
                                                 None, None)
         nFailed = 0
         nClosed = 0
         nFinished = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed + nClosed > 2 * nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(
                 jobStatPrioMap, tmpSiteName, 'throttled', None, None)
             weight /= float(nThrottled + 1)
          # normalize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
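          # worked example (hypothetical numbers): with nRunning=90,
          # nActivated+nAssigned+nStarting=9 and dataWeight=2, the weight is
          # (90+1)/(9+1)*2 = 18.2, so sites that are draining their queues
          # and hold the data locally float to the top of the ranking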
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
             tmpSiteName, nRunning, nAssigned, nActivated, nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
             nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
             candidateSpecList, problematicSites, sitesUsedByTask,
             preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                  if remoteSourceList.has_key(tmpSiteName) and \
                          remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][
                             datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
                 fileScanSiteList, self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 ngGroup=[2],
                 checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                  raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
          except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' %
                          (errtype.__name__, errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug(
                 '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                 .format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []:
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName, availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
              if remoteSourceList.has_key(tmpSiteName) and \
                      remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][
                         tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[
                             tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles += availableFiles[
                         tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[
                         tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(
                         availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                      tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(
                         tmpMsg.format(
                             tmpDatasetName,
                             tmpSiteName,
                             len(tmpDatasetSpec.Files),
                             len(availableFiles[tmpSiteName]['localdisk']),
                             len(availableFiles[tmpSiteName]['cache']),
                             len(availableFiles[tmpSiteName]['localtape']),
                         ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug(
                 '  skip site={0} file unavailable criteria=-fileunavailable'
                 .format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug(
              '  use site={0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5} criteria=+use'
             .format(
                 siteCandidateSpec.siteName,
                 siteCandidateSpec.weight,
                 len(siteCandidateSpec.localDiskFiles),
                 len(siteCandidateSpec.localTapeFiles),
                 len(siteCandidateSpec.cacheFiles),
                 len(siteCandidateSpec.remoteFiles),
             ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
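A minimal, self-contained sketch of the weight-map ranking used in the final procedure above (hypothetical site names; rank_candidates is an illustration, not part of the JEDI code): weights are visited in descending order and ties are shuffled so equally weighted sites are tried in random order.

import random

def rank_candidates(weight_map):
    """weight_map: {weight: [siteName, ...]} -> flat list, best first."""
    ranked = []
    for weight in sorted(weight_map.keys(), reverse=True):
        sites = weight_map[weight][:]
        random.shuffle(sites)  # break ties randomly, as the broker does
        ranked += sites
    return ranked

# e.g. rank_candidates({18.2: ['SITE_A'], 9.1: ['SITE_B', 'SITE_C']})
# always starts with 'SITE_A'; the order of the two 9.1 sites is random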
Example #4
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get primary site candidates 
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName,tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
     # loop over all sites        
     for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['',None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     ######################################
     # selection for data availability
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:    
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                  tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF,datasetName)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
              # check if the data is available somewhere
             if self.dataSiteMap[datasetName] == {}:
                  tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
             # check if the data is available on disk
             if AtlasBrokerUtils.getAnalSitesWithDataDisk(self.dataSiteMap[datasetName]) == []:
                  tmpLog.error('{0} is available only on tape'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
         # get the list of sites where data is available    
         scanSiteList = None     
         normFactor = 0
         for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where disk replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite)
             # get sites which can remotely access source sites
             if (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpSiteList,self.taskBufferIF,
                                                                        self.siteMapper,nSites=50,
                                                                        protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 1
                 else:
                     dataWeight[tmpSiteName] += 1
             # make weight map for remote
             for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = tmpWeightSrcMap['weight']
                 else:
                     dataWeight[tmpSiteName] += tmpWeightSrcMap['weight']
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
         tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retFatal
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff','test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for release
     if not taskSpec.transHome in [None,'AnalysisTransforms']:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = taskBuffer.checkSitesWithRelease(scanSiteList,
                                                               cmtConfig=tmpCmtConfig,
                                                               onlyCmtConfig=True)
         else:    
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
             transHome = re.sub('_','-',transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
                 # cache is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          caches=transHome,
                                                                          cmtConfig=taskSpec.architecture)
             elif transHome == '':
                 # release is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases=taskSpec.transUses,
                                                                          cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases='CVMFS')
                 #                                                         releases='nightlies',
                 #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for SW {1}:{2}'.format(len(scanSiteList),
                                                                    taskSpec.transHome,
                                                                    taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
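      # worked example (hypothetical numbers): out=2 GB, work=1 GB and a
      # 3 GB max input atom give minDiskCountS = 6*1024**3/1024/1024 = 6144,
      # i.e. the comparison against maxwdir below is done in MB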
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug('  skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                       diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
              # continue  # the skip is intentionally disabled; sites without pilots are still kept
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
             tmpLog.debug('  skip {0} excluded'.format(tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug('  skip {0} not included'.format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug('  skip {0} limited access'.format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]: 
              tmpLog.debug('  skip {0} cloud mismatch'.format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug('  skip {0} not included'.format(tmpSiteName))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
          tmpLog.error('failed to get the sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} didn't pass all tests".format(taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
              weight /= float(nThrottled + 1)
          # normalize weights by taking data availability into account
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
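          # worked example (hypothetical numbers): nRunning=20, nActivated=4,
          # nAssigned=1 give (20+1)/(4+1+1)/(1+1) = 1.75; a dataWeight of 3
          # (data found at the site for 3 datasets) scales it to 5.25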
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites-len(candidateSpecList))]
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: 
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}     
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[2])
             if tmpAvFileMap == None:
                  raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
          except Exception:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug('  skip {0} non pre-assigned site'.format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []: 
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                      if availableFiles.has_key(tmpRemoteSite) and \
                              len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource   = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']):
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                     isAvailable = True
                 else:
                      tmpLog.debug('{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4}'.format(tmpDatasetName,
                                                                                                       tmpSiteName,
                                                                                                       len(tmpDatasetSpec.Files),
                                                                                                       len(availableFiles[tmpSiteName]['localdisk']),
                                                                                                       len(availableFiles[tmpSiteName]['cache'])))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug('  skip {0} file unavailable'.format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
          tmpLog.debug('  use {0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5}'.format(siteCandidateSpec.siteName,
                                                                                                              siteCandidateSpec.weight,
                                                                                                              len(siteCandidateSpec.localDiskFiles),
                                                                                                              len(siteCandidateSpec.localTapeFiles),
                                                                                                              len(siteCandidateSpec.cacheFiles),
                                                                                                              len(siteCandidateSpec.remoteFiles),
                                                                                                              ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
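A minimal sketch of the per-dataset completeness test both brokers apply when accepting a site (hypothetical helper; the category names mirror the 'localdisk'/'cache'/'localtape' maps above): a site qualifies only when a single storage category by itself holds at least as many files as the dataset; categories are never summed.

def dataset_complete_at_site(n_dataset_files, available_files):
    # available_files: {'localdisk': [...], 'cache': [...], 'localtape': [...]}
    for category in ('localdisk', 'cache', 'localtape'):
        if n_dataset_files <= len(available_files.get(category, [])):
            return True
    return False

# e.g. 100 files fully cached passes, but 60 on disk plus 40 in cache fails,
# which is why a partially cached dataset is logged as incomplete above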
Example #5
 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
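      # worked defaults (assuming no overrides in the config table): the
      # fallbacks above resolve to diskThreshold=100*1024 GB (100 TB),
      # thrInputSize=1 GB (stored in bytes), thrInputNum=100,
      # thrInputSizeFrac=0.1, thrInputNumFrac=0.1, minIoIntensityWithLD=200,
      # minInputSizeWithLD=10000, maxTaskPrioWithLD=800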
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                      'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                      for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                          backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                          for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                      for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                              if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                              tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                  tmpSpaceToUse = int(
                                      self.fullRW[tmpNucleus] / 10 / 24 /
                                      3600 * 0.25)
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
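                              # the smallest free/total ratio across this nucleus's endpoints is kept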
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check {1} TB'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                          if DataServiceUtils.getDatasetType(
                                  datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                          if taskParamMap.get('taskBrokerOnMaster') is True and \
                                  not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                          if datasetSpec.isMaster() and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                              if taskSpec.inputPreStaging() and \
                                      availableData[tmpNucleus]['ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                  tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                      tmpNucleus,
                                      availableData[tmpNucleus]['ava_size_any'],
                                      availableData[tmpNucleus]['tot_size'],
                                      thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                  tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                      tmpNucleus,
                                      availableData[tmpNucleus]['ava_num_any'],
                                      availableData[tmpNucleus]['tot_num'],
                                      thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                          totInputSize = list(availableData.values())[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
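                      # nucleusRW tracks how much work was recently routed to each
                      # nucleus at this priority; above cutOffRW the weight decays as 1/RW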
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[
                                 tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(
                                 nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(
                                 nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                  weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                  weight /= float(availableData[tmpNucleus]['tot_size'])
                                  wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                      availableData[tmpNucleus]['ava_size_any'])
                                  wStr += '/ ( total_input_size={0} )'.format(
                                      availableData[tmpNucleus]['tot_size'])
                                  # negative weight for tape
                                  if availableData[tmpNucleus]['ava_size_any'] > \
                                          availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(
                                         negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
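                      # roulette-wheel draw: walk the candidates until the random target is spent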
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
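                  # charge this task's RW to the chosen nucleus for its own and
                  # all lower priorities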
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
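
The final-selection step above is a textbook roulette-wheel draw: a random target in [0, totalWeight) is walked down the weight list until it is exhausted. A minimal standalone sketch of the same technique (the nucleus names and weights below are made up):

import random

def pick_weighted(candidates):
    # roulette-wheel draw: each (name, weight) pair wins with
    # probability proportional to its weight
    total = sum(weight for _, weight in candidates)
    target = random.uniform(0, total)
    for name, weight in candidates:
        target -= weight
        if target <= 0:
            return name
    # guard against floating-point leftovers by falling back to the
    # last candidate, as the broker does
    return candidates[-1][0]

# hypothetical nuclei and weights
print(pick_weighted([('NUCLEUS_A', 0.5), ('NUCLEUS_B', 0.3), ('NUCLEUS_C', 0.2)]))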
Example #6
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get sites in the cloud
      if taskSpec.site not in ['', None]:
          scanSiteList = [taskSpec.site]
          tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
      elif inputChunk.getPreassignedSite() is not None:
          scanSiteList = [inputChunk.getPreassignedSite()]
          tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
              inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' %
                      (cloudName, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for memory
     minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
      if minRamCount not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug(
                     '  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxmemory,
                             minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug(
                     '  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.minmemory,
                             minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
      minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + \
          inputChunk.getMaxAtomSize()
      minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
          minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
          minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
      if minWalltime not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
          if tmpSiteName in nWNmap:
              nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
          if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
          tmpLog.error('failed to get sites already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel, taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
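          # favor sites running many jobs with short queues (nAssigned is penalized twice)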
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
              if weight not in weightMap:
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
      weightList = sorted(weightMap, reverse=True)
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
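
The per-site weight above, (nRunning + 1) / (nActivated + nAssigned + 1) / (nAssigned + 1), rewards sites that run many jobs while keeping short queues, and the candidate list is then capped at maxNumSites drawn from the heaviest weight buckets. A small self-contained sketch of that capping logic, using invented job counts:

import random

def site_weight(n_running, n_assigned, n_activated):
    # same heuristic as above: busy sites with short queues score highest
    return float(n_running + 1) / float(n_activated + n_assigned + 1) / float(n_assigned + 1)

# hypothetical per-site job counts: (site, running, assigned, activated)
stats = [('SITE_A', 100, 5, 10), ('SITE_B', 20, 0, 0), ('SITE_C', 50, 40, 60)]

# group candidates by weight, as the broker's weightMap does
weight_map = {}
for site, n_run, n_ass, n_act in stats:
    weight_map.setdefault(site_weight(n_run, n_ass, n_act), []).append(site)

# take at most max_num_sites, heaviest buckets first, shuffling within ties
max_num_sites = 2
chosen = []
for weight in sorted(weight_map, reverse=True):
    if len(chosen) >= max_num_sites:
        break
    bucket = weight_map[weight]
    random.shuffle(bucket)
    chosen += bucket[:max_num_sites - len(chosen)]
print(chosen)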
Example #8
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get primary site candidates 
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName,tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                  if not isinstance(excludeList, list):
                      excludeList = excludeList.split(',')
              except Exception:
                  pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                  if not isinstance(includeList, list):
                      includeList = includeList.split(',')
              except Exception:
                  pass
     # loop over all sites        
      for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.items():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
      if taskSpec.site not in ['', None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
          if taskSpec.site not in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP    
      if taskSpec.coreCount is not None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff','test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
              if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                      (useMP == 'unuse' and tmpSiteSpec.coreCount in [0, 1, None]):
                  newScanSiteList.append(tmpSiteName)
              else:
                  tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                               (tmpSiteName, tmpSiteSpec.coreCount, taskSpec.coreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
      if taskSpec.transHome is not None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=tmpCmtConfig,
                                                                      onlyCmtConfig=True)
          elif 'AthAnalysis' in taskSpec.transHome or re.search(r'Ath[a-zA-Z]+Base', taskSpec.transHome) is not None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=taskSpec.architecture,
                                                                      onlyCmtConfig=True)
         else:    
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
             transHome = re.sub('_','-',transHome)
              if re.search(r'rel_\d+(\n|$)', taskSpec.transHome) is None and taskSpec.transHome != 'AnalysisTransforms' and \
                      re.search(r'\d{4}-\d{2}-\d{2}T\d{4}$', taskSpec.transHome) is None:
                 # cache is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          caches=transHome,
                                                                          cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases=transUses,
                                                                          cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(len(scanSiteList),
                                                                        taskSpec.transUses,
                                                                        taskSpec.transHome,
                                                                        taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
      if minRamCount not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
              if tmpSiteSpec.maxrss not in [0, None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
              if site_maxmemory not in [0, None] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug('  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'.format(tmpSiteName,
                                                                                                       site_maxmemory,
                                                                                                       minRamCount))
                 continue
             # site min memory requirement
              if tmpSiteSpec.minrss not in [0, None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
              if site_minmemory not in [0, None] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug('  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'.format(tmpSiteName,
                                                                                                      site_minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize  = inputChunk.getMaxAtomSize()
     tmpEffAtomSize  = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize  = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
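      # direct IO sites stream input remotely, so their scratch estimate skips the input size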
     tmpLog.debug('maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'.format(tmpMaxAtomSize,
                                                                                                   tmpEffAtomSize,
                                                                                                   tmpOutDiskSize,
                                                                                                   tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(minDiskCountS,
                                                                          minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'.format(tmpSiteName,tmpSpaceSize,
                                                                                         diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug('  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
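          # recent getJob/updateJob requests are used as a proxy for pilot activity at the site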
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
             tmpLog.debug('  skip site={0} excluded criteria=-excluded'.format(tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug('  skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug('  skip site={0} limited access criteria=-limitedaccess'.format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]: 
             tmpLog.debug('  skip site={0} cloud mismatch criteria=-cloudmismatch'.format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug('  skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF,datasetName)
                 if tmpSt in [Interaction.JEDITemporaryError,Interaction.JEDITimeoutError]: 
                      tmpLog.error('temporarily failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(datasetName))
              # check if the data is available somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available    
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpDiskSiteList,self.taskBufferIF,
                                                                        self.siteMapper,nSites=50,
                                                                        protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
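                      # tape-only replicas get a near-zero weight so that disk copies are strongly preferred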
             # make weight map for remote
             for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0,None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(tmpWeightSrcMap['weight'])/wRemote
                 else:
                     dataWeight[tmpSiteName] += float(tmpWeightSrcMap['weight'])/wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(datasetName,len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
          tmpLog.error('failed to get the sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(taskSpec.jediTaskID,timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting', None,None)
         nFailed    = 0
         nClosed    = 0
         nFinished  = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed+nClosed > 2*nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
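          # more running jobs relative to queued (activated+assigned+starting) jobs gives a larger weight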
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
             weight /= float(nThrottled + 1)
          # normalize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr  = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(tmpSiteName,
                                                                              nRunning,        
                                                                              nAssigned,       
                                                                              nActivated,      
                                                                              nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(nFailed,
                                                                                          nClosed,
                                                                                          nFinished,
                                                                                          nThrottled,
                                                                                          tmpDataWeight,
                                                                                          weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(candidateSpecList,
                                                                   problematicSites,
                                                                   sitesUsedByTask,
                                                                   preSiteCandidateSpec,
                                                                   maxNumSites,
                                                                   timeWindowForFC,
                                                                   tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: 
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}     
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[2],
                                                         checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug('  skip site={0} non pre-assigned site criteria=-nonpreassigned'.format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []: 
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource   = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                      tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(tmpMsg.format(tmpDatasetName,
                                                tmpSiteName,
                                                len(tmpDatasetSpec.Files),
                                                len(availableFiles[tmpSiteName]['localdisk']),
                                                len(availableFiles[tmpSiteName]['cache']),
                                                len(availableFiles[tmpSiteName]['localtape']),
                                                ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug('  skip site={0} file unavailable criteria=-fileunavailable'.format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
          tmpLog.debug('  use site={0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5} criteria=+use'.format(siteCandidateSpec.siteName,
                                                                                                              siteCandidateSpec.weight,
                                                                                                              len(siteCandidateSpec.localDiskFiles),
                                                                                                              len(siteCandidateSpec.localTapeFiles),
                                                                                                              len(siteCandidateSpec.cacheFiles),
                                                                                                              len(siteCandidateSpec.remoteFiles),
                                                                                                              ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
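A note on the ranking step above: sites are weighted by running versus queued jobs, grouped by weight, shuffled within each group to break ties, and appended in descending weight order. Below is a minimal, self-contained sketch of that idea; the job_stats layout is illustrative, not the actual JEDI structure.

import random

def rank_candidates(job_stats):
    # job_stats: {site: {'running': int, 'activated': int,
    #                    'assigned': int, 'starting': int}}
    weight_map = {}
    for site, stats in job_stats.items():
        queued = stats.get('activated', 0) + stats.get('assigned', 0) + stats.get('starting', 0)
        weight = float(stats.get('running', 0) + 1) / float(queued + 1)
        weight_map.setdefault(weight, []).append(site)
    ranked = []
    for weight in sorted(weight_map, reverse=True):
        sites = weight_map[weight]
        random.shuffle(sites)  # random order among equally weighted sites
        ranked += sites
    return ranked

# rank_candidates({'SITE_A': {'running': 50, 'activated': 10},
#                  'SITE_B': {'running': 5, 'activated': 100}})
# -> ['SITE_A', 'SITE_B']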
Example #9
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['',None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # T1 
     t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
     # hospital sites
     if self.hospitalQueueMap.has_key(cloudName):
         t1Sites += self.hospitalQueueMap[cloudName]
     # MP    
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         useMP = True
     else:
         useMP = False
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to validatedreleases != True' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for high priorities
     if taskSpec.currentPriority >= 950 and not useMP:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to high prio which needs to run at T1' % tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for high prio'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for data availability
     for datasetSpec in inputChunk.getDatasets():
         datasetName = datasetSpec.datasetName
         if not self.dataSiteMap.has_key(datasetName):
             # get the list of sites where data is available
              tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
             tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                              self.ddmIF,datasetName)
             if tmpSt == self.SC_FAILED:
                 tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             if tmpSt == self.SC_FATAL:
                 tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
             # append
             self.dataSiteMap[datasetName] = tmpRet
             tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
         # check if T1 has the data
         if self.dataSiteMap[datasetName].has_key(cloudName):
             cloudHasData = True
         else:
             cloudHasData = False
         t1hasData = False
         if cloudHasData:
             for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                 if tmpSeVal['state'] == 'complete':
                     t1hasData = True
                     break
             # T1 has incomplete data while no data at T2
             if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                 # use incomplete data at T1 anyway
                 t1hasData = True
         # data is missing at T1         
         if not t1hasData:
             tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
             # make subscription to T1
             # FIXME
             pass
             # use T2 until data is complete at T1
             newScanSiteList = []
             for tmpSiteName in scanSiteList:                    
                 if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                     newScanSiteList.append(tmpSiteName)
                 else:
                     tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                     if tmpSiteSpec.cloud != cloudName:
                         tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                     else:
                         tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     ######################################
     # selection for fairshare
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog):
             tmpLog.debug('  skip {0} due to zero share'.format(tmpSiteName))
             continue
         newScanSiteList.append(tmpSiteName)                
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if (useMP and tmpSiteSpec.coreCount > 1) or \
            (not useMP and tmpSiteSpec.coreCount in [0,1,None]):
              newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to core mismatch site:%s != task:%s' % \
                          (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
          if re.search(r'rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='nightlies',
                                                                      cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCount = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
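      # convert the byte sum to MB so it can be compared with maxwdir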
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                  tmpSiteSpec.maxwdir,
                                                                                  minDiskCount))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob
             if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                                     movingInputSize,outSizePerJob,
                                                                                                     nRemJobs,diskThreshold))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
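          # a site may queue transferring jobs up to its own limit or twice its running jobs, whichever is larger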
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
          tmpLog.debug('  skip %s due to too many transferring %s > max(%s,2x%s)' % \
                       (tmpSiteName,nTraJobs,maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get available files
     totalSize = 0
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper)
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1])
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
         # get total size
         totalSize += datasetSpec.getSize()
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
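          # the extra (nAssigned+1) factor further penalizes sites with a long assigned backlog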
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # append        
         inputChunk.addSiteCandidate(siteCandidateSpec)
         tmpLog.debug('  use {0} with weight={1}'.format(tmpSiteName,weight))
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
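The weight normalization above boosts sites that already hold input data locally: the base weight is multiplied by (localSize + totalSize) / totalSize, a factor between 1 (no local data) and 2 (a full local copy). A small sketch under those assumptions; the function and argument names are illustrative.

def locality_boost(base_weight, local_bytes, total_bytes):
    # factor ranges from 1.0 (nothing local) to 2.0 (everything local)
    if total_bytes == 0:
        return base_weight
    return base_weight * float(local_bytes + total_bytes) / float(total_bytes)

# locality_boost(0.5, local_bytes=8e9, total_bytes=16e9) -> 0.75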
Example #10
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     sitePreAssigned = False
     siteListPreAssigned = False
     if not taskSpec.site in ['',None]:
         if ',' in taskSpec.site:
             # site list
             siteListPreAssigned = True
             scanSiteList = taskSpec.site.split(',')
         else:
             # site
             sitePreAssigned = True
             scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         siteListPreAssigned = True
         scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite())
         scanSiteList.append(inputChunk.getPreassignedSite())
          tmpMsg = 'use site={0} since they share DDM endpoints with original_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList),
                                                                                                                               inputChunk.getPreassignedSite())
         tmpMsg += 'criteria=+premerge'
         tmpLog.debug(tmpMsg)
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # T1 
     if not taskSpec.useWorldCloud():
         t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
         # hospital sites
         if self.hospitalQueueMap.has_key(cloudName):
             t1Sites += self.hospitalQueueMap[cloudName]
     else:
         # get destination for WORLD cloud
         t1Sites = []
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log'])
         for datasetSpec in datasetSpecList:
             if not datasetSpec.destination in t1Sites:
                 t1Sites.append(datasetSpec.destination)
     # sites sharing SE with T1
     sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0])
     # all T1
     allT1Sites = self.getAllT1Sites()
     # core count
     if inputChunk.isMerging and taskSpec.mergeCoreCount != None:
         taskCoreCount = taskSpec.mergeCoreCount
     else:
         taskCoreCount = taskSpec.coreCount
     # MP
     if taskCoreCount != None and taskCoreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskCoreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     # get workQueue
     workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID)
     ######################################
     # selection for status
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check site status
             skipFlag = False
             if tmpSiteSpec.status != 'online':
                 skipFlag = True
             if not skipFlag:    
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for high priorities
     t1WeightForHighPrio = 1
     if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \
             and not sitePreAssigned and not siteListPreAssigned:
         t1WeightForHighPrio = 100
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                  tmpMsg = '  skip site={0} since highPrio/scout jobs need to run at T1 or at sites associated with the {1} T1 SE '.format(tmpSiteName,
                                                                                                                                           cloudName)
                 tmpMsg += 'criteria=-scoutprio'
                 tmpLog.debug(tmpMsg)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection to avoid slow or inactive sites
     if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \
             inputChunk.isMerging or taskSpec.mergeOutput()) \
             and not sitePreAssigned:
         # get inactive sites
         inactiveTimeLimit = 2
         inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit)
         newScanSiteList = []
         tmpMsgList = []
         for tmpSiteName in scanSiteList:
             nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                 AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting')
             if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']:
                  tmpMsg = '  skip site={0} since high prio/scout/merge jobs need to avoid slow sites '.format(tmpSiteName)
                 tmpMsg += 'criteria=-slow'
                 tmpMsgList.append(tmpMsg)
             elif tmpSiteName in inactiveSites and nToGetAll > 0:
                  tmpMsg = '  skip site={0} since high prio/scout/merge jobs need to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName,
                                                                                                                                                  inactiveTimeLimit)
                 tmpMsg += 'criteria=-inactive'
                 tmpMsgList.append(tmpMsg)
             else:
                 newScanSiteList.append(tmpSiteName)
         if newScanSiteList != []:
             scanSiteList = newScanSiteList
             for tmpMsg in tmpMsgList:
                 tmpLog.debug(tmpMsg)
         tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for data availability
     if not sitePreAssigned and not siteListPreAssigned:
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             # ignore DBR
             if DataServiceUtils.isDBR(datasetName):
                 continue
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                  tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                                  self.ddmIF,datasetName,
                                                                  datasetSpec.storageToken)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
             """
             # check if T1 has the data
             if self.dataSiteMap[datasetName].has_key(cloudName):
                 cloudHasData = True
             else:
                 cloudHasData = False
             t1hasData = False
             if cloudHasData:
                 for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                     if tmpSeVal['state'] == 'complete':
                         t1hasData = True
                         break
                 # T1 has incomplete data while no data at T2
                 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                     # use incomplete data at T1 anyway
                     t1hasData = True
             # data is missing at T1         
             if not t1hasData:
                 tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
                 # make subscription to T1
                 # FIXME
                 pass
                 # use T2 until data is complete at T1
                 newScanSiteList = []
                 for tmpSiteName in scanSiteList:                    
                     if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                         newScanSiteList.append(tmpSiteName)
                     else:
                         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                         if tmpSiteSpec.cloud != cloudName:
                             tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                         else:
                             tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
                 scanSiteList = newScanSiteList
                 tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
                 if scanSiteList == []:
                     tmpLog.error('no candidates')
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
             """        
     ######################################
     # selection for fairshare
     if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']):
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog):
                 tmpLog.debug('  skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)                
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
              if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                      (useMP == 'unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                  newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='CVMFS')
             #                                                         releases='nightlies',
             #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing cache=%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.maxmemory,
                                                                                                                 minRamCount)
                 tmpMsg += 'criteria=-lowmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                   tmpSiteSpec.minmemory,
                                                                                                                   minRamCount)
                 tmpMsg += 'criteria=-highmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList),
                                                                           minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
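      # required scratch = output size (scaled by events or by effective input size)
      # + work directory + largest input chunk; converted from bytes to MB below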
     if taskSpec.outputScaleWithEvents():
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True)
     else:
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True)
     minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpMsg = '  skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName,
                                                                                            tmpSiteSpec.maxwdir,
                                                                                            minDiskCount)
             tmpMsg += 'criteria=-disk'
             tmpLog.debug(tmpMsg)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList),
                                                                                       minDiskCount))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm)
             if tmpSiteSpaceMap != {}:
                 tmpSiteFreeSpace = tmpSiteSpaceMap['free']
                 tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob
                 if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                     tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace,
                                                                                                         movingInputSize,outSizePerJob,
                                                                                                         nRemJobs,diskThreshold))
                     continue
             # check if blacklisted
             if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm):
                 tmpLog.debug('  skip site={0} since endpoint={1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     if not taskSpec.useHS06():
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
         minWalltime = taskSpec.walltime * tmpMaxAtomSize
         strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize)
     else:
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True)
         minWalltime = taskSpec.cpuTime * tmpMaxAtomSize
         strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize)
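      # minWalltime estimates the work in one job (walltime x input size, or
      # cpuTime x nEvents for HS06 tasks); each site's maxtime/mintime is scaled
      # by coreCount (and, for HS06 tasks, corepower and cpuEfficiency) below
      # before being compared against it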
     if minWalltime != None or inputChunk.useScout():
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             siteMaxTime = tmpSiteSpec.maxtime
             origSiteMaxTime = siteMaxTime
              # send scout, merge, or walltime-undefined jobs only to sites where the walltime limit is at least 1 day
             if inputChunk.useScout() or inputChunk.isMerging or \
                     (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]):
                 minTimeForZeroWalltime = 24*60*60
                 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime:
                     tmpMsg = '  skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName,
                                                                                                                 siteMaxTime)
                     if inputChunk.useScout():
                         tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-scoutwalltime'
                     else:
                         tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-zerowalltime'
                     tmpLog.debug(tmpMsg)
                     continue
             # check max walltime at the site
             tmpSiteStr = '{0}'.format(siteMaxTime)
             if taskSpec.useHS06():
                 oldSiteMaxTime = siteMaxTime
                 siteMaxTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime)
             if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMaxTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMaxTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMaxTime = long(siteMaxTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMaxTime != 0 and minWalltime > siteMaxTime:
                 tmpMsg = '  skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                    tmpSiteStr,
                                                                                                                    strMinWalltime)
                 tmpMsg += 'criteria=-shortwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             # check min walltime at the site
             siteMinTime = tmpSiteSpec.mintime
             origSiteMinTime = siteMinTime
             tmpSiteStr = '{0}'.format(siteMinTime)
             if taskSpec.useHS06():
                 oldSiteMinTime = siteMinTime
                 siteMinTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime)
             if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMinTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMinTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMinTime = long(siteMinTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMinTime != 0 and minWalltime < siteMinTime:
                 tmpMsg = '  skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                      tmpSiteStr,
                                                                                                                      strMinWalltime)
                 tmpMsg += 'criteria=-longwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         if not taskSpec.useHS06():
             tmpLog.debug('{0} candidates passed walltime check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         else:
             tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for network connectivity
     if not sitePreAssigned:
         ipConnectivity = taskSpec.getIpConnectivity()
         if ipConnectivity != None:
             newScanSiteList = []
             for tmpSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # check at the site
                 if tmpSiteSpec.wnconnectivity == 'full':
                     pass
                 elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http':
                     pass
                 else:
                     tmpMsg = '  skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.wnconnectivity,
                                                                                                                 ipConnectivity)
                     tmpMsg += 'criteria=-network'
                     tmpLog.debug(tmpMsg)
                     continue
                 newScanSiteList.append(tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList),
                                                                             ipConnectivity))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
     ######################################
     # selection for event service
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # event service
             if taskSpec.useEventService():
                 if tmpSiteSpec.getJobSeed() == 'std':
                     tmpMsg = '  skip site={0} since EventService is not allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-es'
                     tmpLog.debug(tmpMsg)
                     continue
             else:
                 if tmpSiteSpec.getJobSeed() == 'es':
                     tmpMsg = '  skip site={0} since only EventService is allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-nones'
                     tmpLog.debug(tmpMsg)
                     continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
              tmpLog.debug('  skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \
                               (tmpSiteName,nTraJobs,maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for T1 weight
     t1Weight = taskSpec.getT1Weight()
     if t1Weight == 0:
         # use T1 weight in cloudconfig
         t1Weight = self.siteMapper.getCloud(cloudName)['weight']
     if t1Weight < 0:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             if not tmpSiteName in t1Sites:
                 tmpLog.debug('  skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         t1Weight = 1
     t1Weight = max(t1Weight,t1WeightForHighPrio)
     tmpLog.debug('T1 weight {0}'.format(t1Weight))
     tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for nPilot
     nPilotMap = {}
     if not sitePreAssigned:
         nWNmap = self.taskBufferIF.getCurrentSiteData()
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             # check at the site
             nPilot = 0
             if nWNmap.has_key(tmpSiteName):
                 nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
             if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel:
                 tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
                 continue
             newScanSiteList.append(tmpSiteName)
             nPilotMap[tmpSiteName] = nPilot
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # get available files
     normalizeFactors = {}        
     availableFileMap = {}
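      # normalizeFactors[site] accumulates the bytes of input data already
      # available at each site; used below to scale the brokerage weight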
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper,
                                                                        ignoreCC=True)
             # disable file lookup for merge jobs or secondary datasets
             checkCompleteness = True
             useCompleteOnly = False
             if inputChunk.isMerging:
                 checkCompleteness = False
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1],
                                                         checkCompleteness=checkCompleteness,
                                                         storageToken=datasetSpec.storageToken,
                                                         useCompleteOnly=useCompleteOnly)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     # get max total size
     tmpTotalSizes = normalizeFactors.values()
     tmpTotalSizes.sort()
     if tmpTotalSizes != []:
         totalSize = tmpTotalSizes.pop()
     else:
         totalSize = 0
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList)))
     weightMapPrimary = {}
     weightMapSecondary = {}
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID)
          nDefined   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID)
         if tmpSiteName in nPilotMap:
             nPilot = nPilotMap[tmpSiteName]
         else:
             nPilot = 0
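          # penalize sites where assigned jobs pile up faster than they get
          # activated; the penalty factor is clamped to [1.0, 2.0]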
         manyAssigned = float(nAssigned + 1) / float(nActivated + 1)
         manyAssigned = min(2.0,manyAssigned)
         manyAssigned = max(1.0,manyAssigned)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned
         weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned,
                                                                                                                   nStarting,nDefined,
                                                                                                                   totalSize,manyAssigned,
                                                                                                                   nPilot)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
             weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName])
         # T1 weight
         if tmpSiteName in t1Sites+sitesShareSeT1:
             weight *= t1Weight
             weightStr += 't1W={0} '.format(t1Weight)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight and params
         siteCandidateSpec.weight = weight
         siteCandidateSpec.nRunningJobs = nRunning
         siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting
         siteCandidateSpec.nAssignedJobs = nAssigned
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # check if site is locked for WORLD
         lockedByBrokerage = False
         if taskSpec.useWorldCloud():
             lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel,
                                                    tmpSiteName,taskSpec.workQueue_ID)
         # check cap with nRunning
         cutOffValue = 20
         cutOffFactor = 2 
         nRunningCap = max(cutOffValue,cutOffFactor*nRunning)
         nRunningCap = max(nRunningCap,nPilot)
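          # a site may hold at most max(20, 2*nRunning, nPilot) queued jobs;
          # beyond that it is demoted to a secondary candidate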
         okMsg = '  use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr)
          okAsPrimary = False
         if lockedByBrokerage:
             ngMsg = '  skip site={0} due to locked by another brokerage '.format(tmpSiteName)
             ngMsg += 'criteria=-lock'
         elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap:
             ngMsg = '  skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName,
                                                                                                  nDefined+nActivated+nAssigned+nStarting)
             ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue,
                                                                                      cutOffFactor,                                  
                                                                                      nRunning,                                      
                                                                                      nPilot)
             ngMsg += 'criteria=-cap'
         else:
             ngMsg = '  skip site={0} due to low weight '.format(tmpSiteName)
             ngMsg += 'criteria=-loweigh'
              okAsPrimary = True
          # use primary if cap/lock check is passed
          if okAsPrimary:
             weightMap = weightMapPrimary
         else:
             weightMap = weightMapSecondary
         # add weight
         if not weight in weightMap:
             weightMap[weight] = []
         weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg))
     # use second candidates if no primary candidates passed cap/lock check
     if weightMapPrimary == {}:
         tmpLog.debug('use second candidates since no sites pass cap/lock check')
         weightMap = weightMapSecondary
          # use highest 3 weights
         weightRank = 3
     else:
         weightMap = weightMapPrimary
         # use all weights
         weightRank = None
         # dump NG message
         for tmpWeight in weightMapSecondary.keys():
             for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]:
                 tmpLog.debug(tmpNgMsg)
     # max candidates for WORLD
     if taskSpec.useWorldCloud():
         maxSiteCandidates = 10
     else:
         maxSiteCandidates = None
     newScanSiteList = []
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightIdx,tmpWeight in enumerate(weightList):
         for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]:
             if (weightRank == None or weightIdx < weightRank) and \
                     (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates):
                 # use site
                 tmpLog.debug(tmpOkMsg)
                 newScanSiteList.append(siteCandidateSpec.siteName)
                 inputChunk.addSiteCandidate(siteCandidateSpec)
             else:
                 # dump NG message
                 tmpLog.debug(tmpNgMsg)
     scanSiteList = newScanSiteList
     # final check
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # lock sites for WORLD
     if taskSpec.useWorldCloud():
         for tmpSiteName in scanSiteList:
             self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID)
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     # return
     self.sendLogMessage(tmpLog)
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
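The ranking at the end of this broker boils down to a throughput ratio per site: roughly (nRunning+1)/(nQueued+1), damped when assigned jobs pile up faster than they activate, then scaled by local data availability and the T1 weight, with a cap that demotes over-queued sites to secondary candidates. Below is a minimal, self-contained sketch of that logic; the function names and numbers are illustrative only and are not part of the PanDA/JEDI API.

def toy_site_weight(nRunning, nActivated, nAssigned, nStarting, nDefined,
                    availableSize=0, totalSize=0, t1Weight=1):
    # penalty for sites where 'assigned' outpaces 'activated', clamped to [1,2]
    manyAssigned = float(nAssigned + 1) / float(nActivated + 1)
    manyAssigned = min(2.0, max(1.0, manyAssigned))
    # running-vs-queued throughput ratio, as in the broker above
    weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned
    # boost sites that already host a large fraction of the input data
    if totalSize != 0:
        weight *= float(availableSize + totalSize) / float(totalSize)
    return weight * t1Weight

def toy_passes_cap(nQueued, nRunning, nPilot, cutOffValue=20, cutOffFactor=2):
    # a site stays a primary candidate only while its queued jobs fit
    # under max(20, 2*nRunning, nPilot)
    return nQueued <= max(cutOffValue, cutOffFactor * nRunning, nPilot)

# e.g. a busy site with good data locality outranks an idle site with no data:
# toy_site_weight(50, 10, 10, 5, 5, availableSize=10**12, totalSize=10**12)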
Example #11
0
 def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
     # make logger
     tmpLog = MsgWrapper(logger,
                         "<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(
         taskSpec.prodSourceLabel, taskSpec.taskType))
     # returns
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output', 'log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(
                                         tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(
                 str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info(
                     'getting datasetSpec with datasetID={0}'.format(
                         datasetID))
                 tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                     taskSpec.jediTaskID, datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                 # check if dataset and container are available in DDM
                 for targetName in [
                         datasetSpec.datasetName, datasetSpec.containerName
                 ]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         if targetName.startswith('panda'):
                             if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                 lifetime = 365
                             else:
                                 lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['', None]:
                                     if DataServiceUtils.getDistributedDestination(
                                             datasetSpec.storageToken
                                     ) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken
                                     ) != None:
                                         location = DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(
                                             taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(
                                             tmpT1Name,
                                             datasetSpec.storageToken)
                                 else:
                                      tmpLog.info('site={0} token={1}'.format(
                                          datasetSpec.site,
                                          datasetSpec.storageToken))
                                     location = siteMapper.getDdmEndpoint(
                                         datasetSpec.site,
                                         datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             if taskSpec.prodSourceLabel in [
                                     'managed', 'test'
                             ] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None, '']:
                                     metaData[
                                         'campaign'] = taskSpec.campaign
                                 if datasetSpec.getTransient() != None:
                                     metaData[
                                         'transient'] = datasetSpec.getTransient(
                                         )
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info(
                                 'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                 .format(targetName, location, ddmBackEnd,
                                         lifetime, str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(
                                 targetName,
                                 backEnd=ddmBackEnd,
                                 location=location,
                                 lifetime=lifetime,
                                 metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error(
                                     'failed to register {0}'.format(
                                         targetName))
                                 return retFatal
                             # procedures for user
                             if userSetup or DataServiceUtils.getDistributedDestination(
                                     datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in [
                                         '', None
                                 ]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(
                                         datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(
                                         taskSpec.prodSourceLabel)
                                     tmpLog.info(
                                         'registering location={0} lifetime={1}days activity={2} grouping={3}'
                                         .format(locForRule, lifetime,
                                                 activity, grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(
                                         targetName,
                                         locForRule,
                                         owner=userName,
                                         lifetime=lifetime,
                                         backEnd=ddmBackEnd,
                                         activity=activity,
                                         grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error(
                                             'failed to register location {0} with {2} for {1}'
                                             .format(
                                                 locForRule, targetName,
                                                 ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(
                                 targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[
                             datasetSpec.
                             containerName] = ddmIF.listDatasetsInContainer(
                                 datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[
                             datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                         tmpStat = ddmIF.addDatasetsToContainer(
                             datasetSpec.containerName,
                             [datasetSpec.datasetName],
                             backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(
                                 datasetSpec.datasetName,
                                 datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(
                             datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(
                     datasetSpec, {
                         'jediTaskID': taskSpec.jediTaskID,
                         'datasetID': datasetID
                     })
                 # register ES datasets
                 if False:  # FIXME taskSpec.useEventService() and not taskSpec.useJobCloning() and datasetSpec.type == 'output':
                     targetName = datasetSpec.datasetName + EventServiceUtils.esSuffixDDM
                     location = None
                     metaData = {}
                     metaData['task_id'] = taskSpec.jediTaskID
                     metaData['hidden'] = True
                     tmpLog.info(
                         'registering ES dataset {0} with location={1} meta={2}'
                         .format(targetName, location, str(metaData)))
                     tmpStat = ddmIF.registerNewDataset(targetName,
                                                        location=location,
                                                        metaData=metaData)
                     if not tmpStat:
                         tmpLog.error(
                             'failed to register ES dataset {0}'.format(
                                 targetName))
                         return retFatal
                     # register rule
                     location = 'type=ES'
                     activity = DataServiceUtils.getActivityForOut(
                         taskSpec.prodSourceLabel)
                     grouping = 'NONE'
                     tmpLog.info(
                         'registering location={0} activity={1} grouping={2}'
                         .format(location, activity, grouping))
                     tmpStat = ddmIF.registerDatasetLocation(
                         targetName,
                         location,
                         activity=activity,
                         grouping=grouping)
                     if not tmpStat:
                         tmpLog.error(
                             'failed to register location {0} with {2} for {1}'
                             .format(location, targetName, activity))
                         return retFatal
         # open datasets
         if taskSpec.prodSourceLabel in ['managed', 'test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output', 'log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(
                                 tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset, 'lifetime', None)
         # return
         tmpLog.info('done')
         return retOK
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(
             errtype.__name__, errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
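
For reference, the doSetup examples in this section all follow the same return-code convention: SC_SUCCEEDED on success, SC_FATAL for unrecoverable registration failures, and SC_FAILED for errors worth retrying. A minimal, self-contained sketch of that pattern (the class and names below are illustrative stand-ins, not the PanDA/JEDI API):

import sys
import logging

logger = logging.getLogger('setup_sketch')

class SetupSketch(object):
    # hypothetical stand-ins for the JEDI status codes
    SC_SUCCEEDED, SC_FAILED, SC_FATAL = 0, 1, 2

    def do_setup(self, dataset_names, register):
        """register is a callable returning True/False, analogous to ddmIF.registerNewDataset."""
        try:
            for name in dataset_names:
                if not register(name):
                    # registration rejected: treat as unrecoverable for this task
                    logger.error('failed to register %s', name)
                    return self.SC_FATAL
            return self.SC_SUCCEEDED
        except Exception:
            # unexpected error: report it and return a retryable status
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('do_setup failed with %s:%s', errtype.__name__, errvalue)
            return self.SC_FAILED
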
Example #12
 def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output','log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         if targetName.startswith('panda'):
                             lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['',None]:
                                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None:
                                         location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken)
                                 else:
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None,'']:
                                     metaData['campaign'] = taskSpec.campaign 
                                 if datasetSpec.getTransient() != None:
                                     metaData['transient'] = datasetSpec.getTransient()
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                      location,
                                                                                                                      ddmBackEnd,
                                                                                                                      lifetime,
                                                                                                                      str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                lifetime=lifetime,metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                     tmpLog.info('registering location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime,
                                                                                                                             activity,grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                             lifetime=lifetime,backEnd=ddmBackEnd,
                                                                             activity=activity,grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} with {2} for {1}'.format(locForRule,
                                                                                                                targetName,
                                                                                                                ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if datasetSpec.containerName not in cnDatasetMap:
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # open datasets
         if taskSpec.prodSourceLabel in ['managed','test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output','log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
         # return
         tmpLog.info('done')        
         return retOK
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
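
The core of Example #12 is an idempotent register-then-attach loop: a dataset is registered only if DDM does not list it yet, and container contents are cached in cnDatasetMap so each container is listed once per run. A condensed sketch of that pattern with an in-memory stand-in for the DDM interface (FakeDDM and all names here are illustrative, not the real ddmIF API):

class FakeDDM(object):
    """In-memory stand-in for the DDM interface used above."""
    def __init__(self):
        self.datasets = set()
        self.containers = {}
    def listDatasets(self, name):
        return [name] if name in self.datasets else []
    def registerNewDataset(self, name):
        self.datasets.add(name)
        return True
    def listDatasetsInContainer(self, container):
        return list(self.containers.get(container, []))
    def addDatasetsToContainer(self, container, names):
        self.containers.setdefault(container, []).extend(names)
        return True

def register_and_attach(ddm, dataset, container, cn_cache):
    # register the dataset only if DDM does not know it yet
    if not ddm.listDatasets(dataset):
        ddm.registerNewDataset(dataset)
    # list the container once and cache its content
    if container not in cn_cache:
        cn_cache[container] = ddm.listDatasetsInContainer(container)
    # attach the dataset unless it is already a constituent
    if dataset not in cn_cache[container]:
        ddm.addDatasetsToContainer(container, [dataset])
        cn_cache[container].append(dataset)

ddm, cache = FakeDDM(), {}
register_and_attach(ddm, 'user.x.task1_out', 'user.x.task1/', cache)
register_and_attach(ddm, 'user.x.task1_out', 'user.x.task1/', cache)  # second call is a no-op
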
Example #13
 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                      'jedi', 'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024*1024*1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'.format(thrInputSize,
                                                                                                                 thrInputNum,
                                                                                                                 thrInputSizeFrac,
                                                                                                                 thrInputNumFrac))
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.items():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.info('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info('skip transfer backlog check due to negative T1Weight')
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                         for tmpNucleus, tmpNucleusSpec in nucleusList.items():
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info('  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                              format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.items():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.info('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = int(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                             if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                 tmpLog.info('  skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                                  tmpSpaceSize,
                                                                                                                                                                  tmpSpaceToUse,
                                                                                                                                                                  diskThreshold,
                                                                                                                                                                  tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if not tmpNucleus in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew != None and (tmpOld == None or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                  'free':tmpSpaceSize-tmpSpaceToUse}
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.items():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.items():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # use deep scan for primary dataset
                         if datasetSpec.isMaster():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys(),
                                                                           deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.items():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].items())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus,tmpNucleusSpec in nucleusList.items():
                             if len(nucleusList) == 1:
                                 tmpLog.info('  disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                     availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                     availableData[tmpNucleus]['tot_size'],
                                                                                                                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                       availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                       availableData[tmpNucleus]['tot_num'],
                                                                                                                                       thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         else:
                             tmpLog.info('  disable data locality check since no nucleus has input data')
                         tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.items():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                 weight /= float(availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                             # fraction of free space
                             if tmpNucleus in fractionFreeSpace:
                                 try:
                                     tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                         float(fractionFreeSpace[tmpNucleus]['total'])
                                     weight *= tmpFrac
                                     wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                          fractionFreeSpace[tmpNucleus]['total'])
                                 except Exception:
                                     pass
                         tmpLog.info('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task.status=ready'
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.items():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
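
The final selection in Example #13 (the tgtWeight loop) is a standard roulette-wheel draw: each nucleus is chosen with probability proportional to its weight, and the last candidate absorbs any floating-point remainder. Isolated as a small function (a sketch, not JEDI code):

import random

def pick_weighted(candidates):
    """candidates: list of (name, weight) pairs with non-negative weights,
    like the nucleusweights list built above."""
    total = sum(weight for _, weight in candidates)
    tgt = random.uniform(0, total)
    for name, weight in candidates:
        tgt -= weight
        if tgt <= 0:
            return name
    # rounding leftovers: fall back to the last candidate, as the original loop does
    return candidates[-1][0]

print(pick_weighted([('CERN', 0.5), ('BNL', 0.3), ('FZK', 0.2)]))
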
Example #14
 def doSetup(self,taskSpec,datasetToRegister):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         if datasetToRegister != []:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
             # get DDM I/F
             ddmIF = self.ddmIF.getInterface(taskSpec.vo)
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # register dataset/container
                             tmpLog.info('registering {0}'.format(targetName))
                             tmpStat = ddmIF.registerNewDataset(targetName)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             if userSetup:
                                 # set owner
                                 tmpLog.info('setting owner={0}'.format(taskSpec.userName))
                                 tmpStat = ddmIF.setDatasetOwner(targetName,taskSpec.userName)
                                 if not tmpStat:
                                     tmpLog.error('failed to set ownership {0} with {1}'.format(targetName,
                                                                                                taskSpec.userName))
                                     return retFatal
                                 # register location
                                 if targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]: 
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                                     tmpLog.info('registering location={0}'.format(location))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,location,owner=taskSpec.userName)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} for {1}'.format(location,
                                                                                                       targetName))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if datasetSpec.containerName not in cnDatasetMap:
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName])
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # return
         tmpLog.info('done')        
         return retOK
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
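
Example #14 is a stripped-down variant of Example #12: no lifetime, metadata, or backend handling, just registration plus user ownership, with the same avDatasetList bookkeeping so each name is touched at most once. The dedup idea in isolation, reusing the FakeDDM stand-in from the sketch after Example #12 (illustrative names only):

already_registered = set()

def ensure_registered(ddm, names):
    """Register each dataset/container name at most once per run."""
    for name in names:
        if name is None or name in already_registered:
            continue
        if not ddm.listDatasets(name):
            ddm.registerNewDataset(name)
        already_registered.add(name)

ensure_registered(FakeDDM(), ['user.x.out1', None, 'user.x.out1', 'user.x.cont/'])
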
Example #15
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # set cloud
     try:
         if not taskParamMap:
             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                 taskSpec.jediTaskID)
             taskParamMap = RefinerUtils.decodeJSON(taskParam)
         if not taskSpec.cloud and 'cloud' in taskParamMap:
             taskSpec.cloud = taskParamMap['cloud']
     except Exception:
         pass
     # get sites in the cloud
     site_preassigned = True
     if taskSpec.site not in ['', None]:
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         if self.siteMapper.checkSite(taskSpec.site):
             scanSiteList = [taskSpec.site]
         else:
             scanSiteList = []
             for tmpSite in self.siteMapper.getCloud(
                     taskSpec.cloud)['sites']:
                 if re.search(taskSpec.site, tmpSite):
                     scanSiteList.append(tmpSite)
             if not scanSiteList:
                 tmpLog.error('unknown site={}'.format(taskSpec.site))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     elif inputChunk.getPreassignedSite() is not None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         site_preassigned = False
         scanSiteList = self.siteMapper.getCloud(taskSpec.cloud)['sites']
         # remove NA
         if 'NA' in scanSiteList:
             scanSiteList.remove('NA')
         tmpLog.debug('cloud=%s has %s candidates' %
                      (taskSpec.cloud, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status and PandaSite
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         if tmpSiteSpec.status != 'online' and not site_preassigned:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
             continue
         # check PandaSite
         if 'PandaSite' in taskParamMap and taskParamMap['PandaSite']:
             if tmpSiteSpec.pandasite != taskParamMap['PandaSite']:
                 tmpLog.debug('  skip %s due to wrong PandaSite=%s <> %s' %
                              (tmpSiteName, tmpSiteSpec.pandasite,
                               taskParamMap['PandaSite']))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
                      + inputChunk.getMaxAtomSize())
     minDiskCountS = minDiskCountS // 1024 // 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR // 1024 // 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir:
             if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec,
                                                    inputChunk):
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if minWalltime not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     origMinRamCount = inputChunk.getMaxRamCount()
     if not site_preassigned and origMinRamCount:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # job memory requirement
             if taskSpec.ramPerCore():
                 minRamCount = origMinRamCount * (
                     tmpSiteSpec.coreCount if tmpSiteSpec.coreCount else 1)
                 minRamCount += (taskSpec.baseRamCount
                                 if taskSpec.baseRamCount else 0)
             else:
                 minRamCount = origMinRamCount
             # site max memory requirement
             site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0
             # check at the site
             if site_maxmemory and minRamCount and minRamCount > site_maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(
                     tmpSiteName, site_maxmemory, minRamCount)
                 tmpLog.debug(tmpMsg)
                 continue
             # site min memory requirement
             site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0
             if site_minmemory and minRamCount and minRamCount < site_minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(
                     tmpSiteName, site_minmemory, minRamCount)
                 tmpLog.info(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if tmpSiteName in nWNmap:
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             # continue  (filter disabled: sites without pilots are only logged, not skipped)
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             tmpLog.debug(
                 'getting the list of available files for {0}'.format(
                     datasetSpec.datasetName))
             fileScanSiteList = []
             for tmpPseudoSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                 tmpSiteName = tmpSiteSpec.get_unified_name()
                 if tmpSiteName in fileScanSiteList:
                     continue
                 fileScanSiteList.append(tmpSiteName)
             # mapping between sites and input storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                 fileScanSiteList, self.siteMapper,
                 taskSpec.prodSourceLabel, None)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             else:
                 useCompleteOnly = False
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 check_completeness=checkCompleteness,
                 file_scan_in_container=False,
                 complete_only=useCompleteOnly)
             if tmpAvFileMap is None:
                 raise Interaction.JEDITemporaryError(
                     'ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except Exception as e:
             tmpLog.error('failed to get available files with {}'.format(e))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
         taskSpec.vo)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # files
         for tmpDatasetName, availableFiles in six.iteritems(
                 availableFileMap):
             if tmpSiteName in availableFiles:
                 siteCandidateSpec.add_local_disk_files(
                     availableFiles[tmpSiteName]['localdisk'])
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if weight not in weightMap:
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
     weightList = list(weightMap.keys())
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {} with weight={} nFiles={}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight,
             len(siteCandidateSpec.localDiskFiles)))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
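
The brokerage weight in Example #15, (nRunning + 1) / (nActivated + nAssigned + 1) / (nAssigned + 1), favours sites that are running jobs and penalises those with a backlog of assigned or activated ones. Sites already used by the task are always kept; the remaining candidates are grouped by weight and at most maxNumSites survive in total, with ties broken randomly. A condensed, self-contained version of that selection (a sketch with illustrative names, not the broker's API):

import random

def rank_sites(stats, used_by_task, max_sites=5):
    """stats maps site -> (nRunning, nAssigned, nActivated)."""
    kept, buckets = [], {}
    for site, (running, assigned, activated) in stats.items():
        weight = float(running + 1) / (activated + assigned + 1) / (assigned + 1)
        if site in used_by_task:
            kept.append((site, weight))  # sites already used by the task always stay
        else:
            buckets.setdefault(weight, []).append(site)
    # fill up with the best-weighted new sites, shuffling within equal weights
    for weight in sorted(buckets, reverse=True):
        if len(kept) >= max_sites:
            break
        group = buckets[weight]
        random.shuffle(group)
        kept += [(site, weight) for site in group[:max_sites - len(kept)]]
    return kept

print(rank_sites({'SITE_A': (100, 5, 10), 'SITE_B': (10, 50, 60)}, set()))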