def doRefine(self,jediTaskID,taskParamMap):
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         self.doBasicRefine(taskParamMap)
         # set nosplit+repeat for DBR
         for datasetSpec in self.inSecDatasetSpecList:
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = 'repeat,nosplit'
         # append attempt number
         for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
             for tmpOutTemplateMap in tmpOutTemplateMapList:
                 outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                 if re.search('\.\d+$',outFileTemplate) == None and not outFileTemplate.endswith('.panda.um'):
                     tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
         # set destination if nessesary
         for datasetSpec in self.outDatasetSpecList:
             storageToken = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
             if storageToken != None:
                 tmpSiteList = self.ddmIF.getInterface(self.taskSpec.vo).getSitesWithEndPoint(storageToken,self.siteMapper,'production')
                 if tmpSiteList == []:
                     raise RuntimeError,'cannot find online siteID associated to {0}'.format(storageToken)
                 datasetSpec.destination = tmpSiteList[0]
         # set to register datasets
         #self.taskSpec.setToRegisterDatasets()
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
         raise errtype,errvalue
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
 def doRefine(self,jediTaskID,taskParamMap):
     """Refine task parameters with an optional preprocessing step.

     If preprocessing handles the task the method returns immediately.
     Otherwise it performs the normal refinement: resolves DBR_LATEST to a
     concrete DB release, sets DBR attributes, applies destination/build
     settings, instantiates template output datasets, allocates a user
     jobsetID, and records site inclusion/exclusion limits.

     :param jediTaskID: JEDI task ID (not used directly; kept for interface)
     :param taskParamMap: dict of task parameters
     :return: self.SC_SUCCEEDED or self.SC_FAILED
     :raises: re-raises any exception after logging and setting errDiag
     """
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         # preprocessing
         tmpStat,taskParamMap = self.doPreProRefine(taskParamMap)
         if tmpStat == True:
             tmpLog.debug('done for preprocessing')
             return self.SC_SUCCEEDED
         if tmpStat == False:
             # failed
             tmpLog.error('doPreProRefine failed')
             return self.SC_FAILED
         # normal refine
         self.doBasicRefine(taskParamMap)
         # set nosplit+repeat for DBR
         for datasetSpec in self.inSecDatasetSpecList:
             # get the latest version of DBR
             if datasetSpec.datasetName == 'DBR_LATEST':
                 tmpLog.debug('resolving real name for {0}'.format(datasetSpec.datasetName))
                 datasetSpec.datasetName = self.ddmIF.getInterface(self.taskSpec.vo).getLatestDBRelease(useResultCache=3600)
                 datasetSpec.containerName = datasetSpec.datasetName
             # set attributes to DBR
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = 'repeat,nosplit'
         # destination (use the in operator consistently with the site limits below)
         if 'destination' in taskParamMap:
             for datasetSpec in self.outDatasetSpecList:
                 datasetSpec.destination = taskParamMap['destination']
         # use build
         if 'buildSpec' in taskParamMap:
             self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['useBuild'])
         # use template dataset
         self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['instantiateTmpl'])
         self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['instantiateTmplSite'])
         for datasetSpec in self.outDatasetSpecList:
             datasetSpec.type = "tmpl_{0}".format(datasetSpec.type)
         # get jobsetID
         tmpStat,tmpJobID = self.taskBufferIF.getUserJobsetID_JEDI(self.taskSpec.userName)
         if not tmpStat:
             # message fixed: the original read "failed to get jobsetID failed"
             tmpLog.error('failed to get jobsetID')
             return self.SC_FAILED
         self.taskSpec.reqID = tmpJobID
         # site limitation
         if 'excludedSite' in taskParamMap and 'includedSite' in taskParamMap:
             self.taskSpec.setLimitedSites('incexc')
         elif 'excludedSite' in taskParamMap:
             self.taskSpec.setLimitedSites('exc')
         elif 'includedSite' in taskParamMap:
             self.taskSpec.setLimitedSites('inc')
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errStr = 'doRefine failed with {0}:{1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errStr)
         self.taskSpec.setErrDiag(errStr,None)
         # bare raise preserves the original traceback
         raise
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Exemplo n.º 3
0
 def doCheck(self,taskSpecList):
     """Check with the PanDA server which cloud/nucleus each task was assigned to.

     For classic-cloud tasks the raw assignment string is returned per task.
     For WORLD-cloud tasks the assigned site is resolved to a nucleus and a
     per-dataset destination/token mapping is built for output/log datasets.

     :param taskSpecList: list of task specs to check
     :return: (status, {jediTaskID: assignment}) — assignment is either the
              raw cloud name or, for WORLD cloud, a dict with 'datasets'
              and 'nucleus'; (SC_FAILED, {}) when the PanDA query fails
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doCheck')
     # return for failure
     retFatal    = self.SC_FATAL,{}
     retTmpError = self.SC_FAILED,{}
     # get list of jediTaskIDs
     taskIdList = []
     taskSpecMap = {}
     for taskSpec in taskSpecList:
         taskIdList.append(taskSpec.jediTaskID)
         taskSpecMap[taskSpec.jediTaskID] = taskSpec
     # check with panda
     tmpLog.debug('check with panda')
     tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error('failed to see clouds')
         return retTmpError
     # make return map
     retMap = {}
     for tmpTaskID,tmpCoreName in cloudsInPanda.iteritems():
         tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
         # skip tasks with no assignment yet
         if not tmpCoreName in ['NULL','',None]:
             taskSpec = taskSpecMap[tmpTaskID]
             if taskSpec.useWorldCloud():
                 # get destinations for WORLD cloud
                 ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                 # get site
                 siteSpec = self.siteMapper.getSite(tmpCoreName)
                 # get nucleus
                 nucleus = siteSpec.pandasite
                 # get output/log datasets
                 # NOTE(review): tmpStat is not checked here — a query failure
                 # would surface as an empty/invalid tmpDatasetSpecs; confirm intended
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                 # get destinations
                 retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                 for datasetSpec in tmpDatasetSpecs:
                     # skip distributed datasets
                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                         continue
                     # get token
                     token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                     # use default endpoint
                     if token == None:
                         token = siteSpec.ddm
                     # add origianl token
                     if not datasetSpec.storageToken in ['',None]:
                         token += '/{0}'.format(datasetSpec.storageToken)
                     retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                           'token':'dst:{0}'.format(token),
                                                           'destination':tmpCoreName})
             else:
                 retMap[tmpTaskID] = tmpCoreName
     tmpLog.debug('ret {0}'.format(str(retMap)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED,retMap
Exemplo n.º 4
0
def getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs):
    """Build the mapping used to pin a task's datasets to a nucleus.

    Each non-distributed dataset gets an entry with its datasetID, the
    destination endpoint token (prefixed with 'dst:'), and the nucleus
    destination string.
    """
    datasetList = []
    for dsSpec in tmpDatasetSpecs:
        # distributed datasets keep their own destinations
        if DataServiceUtils.getDistributedDestination(dsSpec.storageToken) is not None:
            continue
        # endpoint associated with the storage token at this nucleus
        endpointToken = nucleusSpec.getAssoicatedEndpoint(dsSpec.storageToken)['ddm_endpoint_name']
        # carry over the original token suffix, if any
        if dsSpec.storageToken not in ['',None]:
            endpointToken += '/{0}'.format(dsSpec.storageToken.split('/')[-1])
        datasetList.append({'datasetID':dsSpec.datasetID,
                            'token':'dst:{0}'.format(endpointToken),
                            'destination':'nucleus:{0}'.format(nucleusSpec.name)})
    return {'datasets':datasetList,'nucleus':nucleusSpec.name}
Exemplo n.º 5
0
 def runImpl(self):
     """Worker loop that brokers WORLD-cloud tasks to a nucleus.

     Pulls bundles of (taskSpec, inputChunk) from self.inputList and, for
     each task without a pre-assigned nucleus, successively filters the
     candidate nuclei by status, output-endpoint disk space, input-data
     locality and the ability to run jobs, then picks one candidate by
     RW-based weighted random selection and stores the decision (plus
     per-dataset destinations) via the task buffer. Terminates when the
     input queue yields an empty bundle.
     """
     # cutoff for disk in TB
     diskThreshold = 5 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     thrInputSize = 1024*1024*1024
     thrInputNum = 100
     thrInputSizeFrac = 0.1
     thrInputNumFrac = 0.1
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     # nucleus pre-assigned on the task: skip all filtering
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check endpoint
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.debug('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             # NOTE(review): units presumably TB to match diskThreshold — confirm
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             if tmpSpaceSize < diskThreshold:
                                 tmpLog.debug('  skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                        tmpSpaceSize,
                                                                                                                                        diskThreshold,
                                                                                                                                        tmpEP['state']))
                                 toSkip = True
                                 break
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum availability over all input datasets, per nucleus
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         # NOTE(review): assumes every surviving nucleus has an entry in
                         # availableData; a missing key would only be caught by the broad
                         # except at the bottom of the loop — confirm
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                         availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                         availableData[tmpNucleus]['tot_size'],
                                                                                                                                         thrInputSizeFrac))
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpLog.debug('  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                           availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                           availableData[tmpNucleus]['tot_num'],
                                                                                                                                           thrInputNumFrac))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                         # use any nuclei where input is available if no sites can run jobs
                         tmpRet = tmpSiteList
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # RW
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                     ###################################### 
                     # weight
                     # snapshot the per-nucleus RW map for this priority
                     # (prioRW presumably a lock-protected shared map — confirm)
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW: busier nuclei (higher RW) get lower weight
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data: scale by fraction of input already available
                         if availableData != {}:
                             weight *= float(availableData[tmpNucleus]['ava_size_any'])
                             weight /= float(availableData[tmpNucleus]['tot_size'])
                             wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                             wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                             # negative weight for tape
                             if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                 weight *= negWeightTape
                                 wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                         tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection: weighted random choice over the candidates
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 # update RW table for priorities at or below the current one
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             # NOTE(review): broad except keeps the worker alive across task
             # failures; errors are only logged, not re-raised
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
Exemplo n.º 6
0
def getSitesWithData(siteMapper,ddmIF,datasetName,storageToken=None):
    """Return, per cloud, the T1 endpoints and T2 sites holding replicas of a dataset.

    :param siteMapper: site mapper providing cloud/site configuration
    :param ddmIF: DDM interface used for replica and site-property queries
    :param datasetName: dataset name; a trailing '/' marks a container
    :param storageToken: optional space token that T1 endpoints must match
    :return: (Interaction.SC_SUCCEEDED,
              {cloud: {'t1': {endpoint: {'tape':..., 'state':...}}, 't2': [sites]}})
             on success, or (exception type, error string) when DDM queries fail
    """
    # get num of files
    try:
        if not datasetName.endswith('/'):
            totalNumDatasets = 1
        else:
            # container: count constituent datasets for the completeness check
            tmpDsMap = ddmIF.listDatasetsInContainer(datasetName)
            totalNumDatasets = len(tmpDsMap)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.ddmIF.getFilesInDataset failed with %s' % errvalue
    # get replicas
    try:
        replicaMap= {}
        replicaMap[datasetName] = ddmIF.listDatasetReplicas(datasetName)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    # loop over all clouds
    retMap = {}
    for tmpCloudName in siteMapper.cloudSpec.keys():
        retMap[tmpCloudName] = {'t1':{},'t2':[]}
        # get T1 DDM endpoints
        tmpCloudSpec = siteMapper.getCloud(tmpCloudName)
        # FIXME until CERN-PROD_TZERO is added to cloudconfig.tier1SE
        if tmpCloudName == 'CERN':
            if not 'CERN-PROD_TZERO' in tmpCloudSpec['tier1SE']:
                tmpCloudSpec['tier1SE'].append('CERN-PROD_TZERO')
        # tier1SE entries may contain '*' wildcards; convert to regexp
        for tmpSePat in tmpCloudSpec['tier1SE']:
            if '*' in tmpSePat:
                tmpSePat = tmpSePat.replace('*','.*')
            tmpSePat = '^' + tmpSePat +'$'
            for tmpSE in replicaMap[datasetName].keys():
                # check name with regexp pattern
                if re.search(tmpSePat,tmpSE) == None:
                    continue
                # check space token
                if not storageToken in ['',None,'NULL']:
                    seStr = ddmIF.getSiteProperty(tmpSE,'se')
                    try:
                        # best-effort: ignore endpoints whose 'se' string
                        # cannot be parsed into host:token
                        if seStr.split(':')[1] != storageToken:
                            continue
                    except:
                        pass
                # check archived metadata
                # FIXME 
                pass
                # check tape attribute
                try:
                    tmpOnTape = ddmIF.getSiteProperty(tmpSE,'is_tape')
                except:
                    # best-effort: skip endpoints where the property lookup fails
                    continue
                    # errtype,errvalue = sys.exc_info()[:2]
                    # return errtype,'ddmIF.getSiteProperty for %s:tape failed with %s' % (tmpSE,errvalue)
                # check completeness
                # last element presumably holds aggregated file counts — TODO confirm
                tmpStatistics = replicaMap[datasetName][tmpSE][-1] 
                if tmpStatistics['found'] == None:
                    tmpDatasetStatus = 'unknown'
                    pass
                elif tmpStatistics['total'] == tmpStatistics['found'] and tmpStatistics['total'] >= totalNumDatasets:
                    tmpDatasetStatus = 'complete'
                else:
                    tmpDatasetStatus = 'incomplete'
                # append
                retMap[tmpCloudName]['t1'][tmpSE] = {'tape':tmpOnTape,'state':tmpDatasetStatus}
        # get T2 list
        tmpSiteList = DataServiceUtils.getSitesWithDataset(datasetName,siteMapper,replicaMap,
                                                           tmpCloudName,useHomeCloud=True,
                                                           useOnlineSite=True,includeT1=False)
        # append
        retMap[tmpCloudName]['t2'] = tmpSiteList
        # remove if empty
        if len(retMap[tmpCloudName]['t1']) == 0 and len(retMap[tmpCloudName]['t2']) == 0:
            del retMap[tmpCloudName]
    # return
    return Interaction.SC_SUCCEEDED,retMap
Exemplo n.º 7
0
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     sitePreAssigned = False
     siteListPreAssigned = False
     if not taskSpec.site in ['',None]:
         if ',' in taskSpec.site:
             # site list
             siteListPreAssigned = True
             scanSiteList = taskSpec.site.split(',')
         else:
             # site
             sitePreAssigned = True
             scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         siteListPreAssigned = True
         scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite())
         scanSiteList.append(inputChunk.getPreassignedSite())
         tmpMsg = 'use site={0} since they share DDM endpoints with orinal_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList),
                                                                                                                               inputChunk.getPreassignedSite())
         tmpMsg += 'criteria=+premerge'
         tmpLog.debug(tmpMsg)
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # T1 
     if not taskSpec.useWorldCloud():
         t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
         # hospital sites
         if self.hospitalQueueMap.has_key(cloudName):
             t1Sites += self.hospitalQueueMap[cloudName]
     else:
         # get destination for WORLD cloud
         t1Sites = []
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log'])
         for datasetSpec in datasetSpecList:
             if not datasetSpec.destination in t1Sites:
                 t1Sites.append(datasetSpec.destination)
     # sites sharing SE with T1
     sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0])
     # all T1
     allT1Sites = self.getAllT1Sites()
     # core count
     if inputChunk.isMerging and taskSpec.mergeCoreCount != None:
         taskCoreCount = taskSpec.mergeCoreCount
     else:
         taskCoreCount = taskSpec.coreCount
     # MP
     if taskCoreCount != None and taskCoreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskCoreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     # get workQueue
     workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID)
     ######################################
     # selection for status
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check site status
             skipFlag = False
             if tmpSiteSpec.status != 'online':
                 skipFlag = True
             if not skipFlag:    
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for high priorities
     t1WeightForHighPrio = 1
     if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \
             and not sitePreAssigned and not siteListPreAssigned:
         t1WeightForHighPrio = 100
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpMsg = '  skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName,
                                                                                                                                       cloudName)
                 tmpMsg += 'criteria=-scoutprio'
                 tmpLog.debug(tmpMsg)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection to avoid slow or inactive sites
     if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \
             inputChunk.isMerging or taskSpec.mergeOutput()) \
             and not sitePreAssigned:
         # get inactive sites
         inactiveTimeLimit = 2
         inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit)
         newScanSiteList = []
         tmpMsgList = []
         for tmpSiteName in scanSiteList:
             nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                 AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting')
             if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName)
                 tmpMsg += 'criteria=-slow'
                 tmpMsgList.append(tmpMsg)
             elif tmpSiteName in inactiveSites and nToGetAll > 0:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName,
                                                                                                                                              inactiveTimeLimit)
                 tmpMsg += 'criteria=-inactive'
                 tmpMsgList.append(tmpMsg)
             else:
                 newScanSiteList.append(tmpSiteName)
         if newScanSiteList != []:
             scanSiteList = newScanSiteList
             for tmpMsg in tmpMsgList:
                 tmpLog.debug(tmpMsg)
         tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for data availability
     if not sitePreAssigned and not siteListPreAssigned:
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             # ignore DBR
             if DataServiceUtils.isDBR(datasetName):
                 continue
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                                  self.ddmIF,datasetName,
                                                                  datasetSpec.storageToken)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
             """
             # check if T1 has the data
             if self.dataSiteMap[datasetName].has_key(cloudName):
                 cloudHasData = True
             else:
                 cloudHasData = False
             t1hasData = False
             if cloudHasData:
                 for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                     if tmpSeVal['state'] == 'complete':
                         t1hasData = True
                         break
                 # T1 has incomplete data while no data at T2
                 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                     # use incomplete data at T1 anyway
                     t1hasData = True
             # data is missing at T1         
             if not t1hasData:
                 tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
                 # make subscription to T1
                 # FIXME
                 pass
                 # use T2 until data is complete at T1
                 newScanSiteList = []
                 for tmpSiteName in scanSiteList:                    
                     if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                         newScanSiteList.append(tmpSiteName)
                     else:
                         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                         if tmpSiteSpec.cloud != cloudName:
                             tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                         else:
                             tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
                 scanSiteList = newScanSiteList
                 tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
                 if scanSiteList == []:
                     tmpLog.error('no candidates')
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
             """        
     ######################################
     # selection for fairshare
     if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']):
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog):
                 tmpLog.debug('  skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)                
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                     newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='CVMFS')
             #                                                         releases='nightlies',
             #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing cache=%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.maxmemory,
                                                                                                                 minRamCount)
                 tmpMsg += 'criteria=-lowmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                   tmpSiteSpec.minmemory,
                                                                                                                   minRamCount)
                 tmpMsg += 'criteria=-highmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList),
                                                                           minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     if taskSpec.outputScaleWithEvents():
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True)
     else:
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True)
     minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpMsg = '  skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName,
                                                                                            tmpSiteSpec.maxwdir,
                                                                                            minDiskCount)
             tmpMsg += 'criteria=-disk'
             tmpLog.debug(tmpMsg)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList),
                                                                                       minDiskCount))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm)
             if tmpSiteSpaceMap != {}:
                 tmpSiteFreeSpace = tmpSiteSpaceMap['free']
                 tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob
                 if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                     tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace,
                                                                                                         movingInputSize,outSizePerJob,
                                                                                                         nRemJobs,diskThreshold))
                     continue
             # check if blacklisted
             if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm):
                 tmpLog.debug('  skip site={0} since endpoint={1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     if not taskSpec.useHS06():
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
         minWalltime = taskSpec.walltime * tmpMaxAtomSize
         strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize)
     else:
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True)
         minWalltime = taskSpec.cpuTime * tmpMaxAtomSize
         strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize)
     if minWalltime != None or inputChunk.useScout():
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             siteMaxTime = tmpSiteSpec.maxtime
             origSiteMaxTime = siteMaxTime
              # send scout, merge, or walltime-undefined jobs only to sites whose walltime limit is at least 1 day
             if inputChunk.useScout() or inputChunk.isMerging or \
                     (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]):
                 minTimeForZeroWalltime = 24*60*60
                 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime:
                     tmpMsg = '  skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName,
                                                                                                                 siteMaxTime)
                     if inputChunk.useScout():
                         tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-scoutwalltime'
                     else:
                         tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-zerowalltime'
                     tmpLog.debug(tmpMsg)
                     continue
             # check max walltime at the site
             tmpSiteStr = '{0}'.format(siteMaxTime)
             if taskSpec.useHS06():
                 oldSiteMaxTime = siteMaxTime
                 siteMaxTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime)
             if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMaxTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMaxTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMaxTime = long(siteMaxTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMaxTime != 0 and minWalltime > siteMaxTime:
                 tmpMsg = '  skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                    tmpSiteStr,
                                                                                                                    strMinWalltime)
                 tmpMsg += 'criteria=-shortwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             # check min walltime at the site
             siteMinTime = tmpSiteSpec.mintime
             origSiteMinTime = siteMinTime
             tmpSiteStr = '{0}'.format(siteMinTime)
             if taskSpec.useHS06():
                 oldSiteMinTime = siteMinTime
                 siteMinTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime)
             if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMinTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMinTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMinTime = long(siteMinTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMinTime != 0 and minWalltime < siteMinTime:
                 tmpMsg = '  skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                      tmpSiteStr,
                                                                                                                      strMinWalltime)
                 tmpMsg += 'criteria=-longwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         if not taskSpec.useHS06():
             tmpLog.debug('{0} candidates passed walltime check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         else:
             tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for network connectivity
     if not sitePreAssigned:
         ipConnectivity = taskSpec.getIpConnectivity()
         if ipConnectivity != None:
             newScanSiteList = []
             for tmpSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # check at the site
                 if tmpSiteSpec.wnconnectivity == 'full':
                     pass
                 elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http':
                     pass
                 else:
                     tmpMsg = '  skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.wnconnectivity,
                                                                                                                 ipConnectivity)
                     tmpMsg += 'criteria=-network'
                     tmpLog.debug(tmpMsg)
                     continue
                 newScanSiteList.append(tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList),
                                                                             ipConnectivity))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
     ######################################
     # selection for event service
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # event service
             if taskSpec.useEventService():
                 if tmpSiteSpec.getJobSeed() == 'std':
                     tmpMsg = '  skip site={0} since EventService is not allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-es'
                     tmpLog.debug(tmpMsg)
                     continue
             else:
                 if tmpSiteSpec.getJobSeed() == 'es':
                     tmpMsg = '  skip site={0} since only EventService is allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-nones'
                     tmpLog.debug(tmpMsg)
                     continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \
                              (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for T1 weight
     t1Weight = taskSpec.getT1Weight()
     if t1Weight == 0:
         # use T1 weight in cloudconfig
         t1Weight = self.siteMapper.getCloud(cloudName)['weight']
     if t1Weight < 0:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             if not tmpSiteName in t1Sites:
                 tmpLog.debug('  skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         t1Weight = 1
     t1Weight = max(t1Weight,t1WeightForHighPrio)
     tmpLog.debug('T1 weight {0}'.format(t1Weight))
     tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for nPilot
     nPilotMap = {}
     if not sitePreAssigned:
         nWNmap = self.taskBufferIF.getCurrentSiteData()
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             # check at the site
             nPilot = 0
             if nWNmap.has_key(tmpSiteName):
                 nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
             if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel:
                 tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
                 continue
             newScanSiteList.append(tmpSiteName)
             nPilotMap[tmpSiteName] = nPilot
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # get available files
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper,
                                                                        ignoreCC=True)
             # disable file lookup for merge jobs or secondary datasets
             checkCompleteness = True
             useCompleteOnly = False
             if inputChunk.isMerging:
                 checkCompleteness = False
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1],
                                                         checkCompleteness=checkCompleteness,
                                                         storageToken=datasetSpec.storageToken,
                                                         useCompleteOnly=useCompleteOnly)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     # get max total size
     tmpTotalSizes = normalizeFactors.values()
     tmpTotalSizes.sort()
     if tmpTotalSizes != []:
         totalSize = tmpTotalSizes.pop()
     else:
         totalSize = 0
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList)))
     weightMapPrimary = {}
     weightMapSecondary = {}
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID)
         nDefined   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'definied',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID)
         if tmpSiteName in nPilotMap:
             nPilot = nPilotMap[tmpSiteName]
         else:
             nPilot = 0
         manyAssigned = float(nAssigned + 1) / float(nActivated + 1)
         manyAssigned = min(2.0,manyAssigned)
         manyAssigned = max(1.0,manyAssigned)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned
         weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned,
                                                                                                                   nStarting,nDefined,
                                                                                                                   totalSize,manyAssigned,
                                                                                                                   nPilot)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
             weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName])
         # T1 weight
         if tmpSiteName in t1Sites+sitesShareSeT1:
             weight *= t1Weight
             weightStr += 't1W={0} '.format(t1Weight)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight and params
         siteCandidateSpec.weight = weight
         siteCandidateSpec.nRunningJobs = nRunning
         siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting
         siteCandidateSpec.nAssignedJobs = nAssigned
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # check if site is locked for WORLD
         lockedByBrokerage = False
         if taskSpec.useWorldCloud():
             lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel,
                                                    tmpSiteName,taskSpec.workQueue_ID)
         # check cap with nRunning
         cutOffValue = 20
         cutOffFactor = 2 
         nRunningCap = max(cutOffValue,cutOffFactor*nRunning)
         nRunningCap = max(nRunningCap,nPilot)
         okMsg = '  use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr)
         okAsPrimay = False
         if lockedByBrokerage:
             ngMsg = '  skip site={0} due to locked by another brokerage '.format(tmpSiteName)
             ngMsg += 'criteria=-lock'
         elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap:
             ngMsg = '  skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName,
                                                                                                  nDefined+nActivated+nAssigned+nStarting)
             ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue,
                                                                                      cutOffFactor,                                  
                                                                                      nRunning,                                      
                                                                                      nPilot)
             ngMsg += 'criteria=-cap'
         else:
             ngMsg = '  skip site={0} due to low weight '.format(tmpSiteName)
             ngMsg += 'criteria=-loweigh'
             okAsPrimay = True
         # use primay if cap/lock check is passed
         if okAsPrimay:
             weightMap = weightMapPrimary
         else:
             weightMap = weightMapSecondary
         # add weight
         if not weight in weightMap:
             weightMap[weight] = []
         weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg))
     # use second candidates if no primary candidates passed cap/lock check
     if weightMapPrimary == {}:
         tmpLog.debug('use second candidates since no sites pass cap/lock check')
         weightMap = weightMapSecondary
         # use hightest 3 weights                                                                                                                                                  
         weightRank = 3
     else:
         weightMap = weightMapPrimary
         # use all weights
         weightRank = None
         # dump NG message
         for tmpWeight in weightMapSecondary.keys():
             for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]:
                 tmpLog.debug(tmpNgMsg)
     # max candidates for WORLD
     if taskSpec.useWorldCloud():
         maxSiteCandidates = 10
     else:
         maxSiteCandidates = None
     newScanSiteList = []
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightIdx,tmpWeight in enumerate(weightList):
         for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]:
             if (weightRank == None or weightIdx < weightRank) and \
                     (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates):
                 # use site
                 tmpLog.debug(tmpOkMsg)
                 newScanSiteList.append(siteCandidateSpec.siteName)
                 inputChunk.addSiteCandidate(siteCandidateSpec)
             else:
                 # dump NG message
                 tmpLog.debug(tmpNgMsg)
     scanSiteList = newScanSiteList
     # final check
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # lock sites for WORLD
     if taskSpec.useWorldCloud():
         for tmpSiteName in scanSiteList:
             self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID)
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     # return
     self.sendLogMessage(tmpLog)
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
Exemplo n.º 8
0
 def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
     """Register the task's output/log datasets and containers in DDM.

     For each datasetID in datasetToRegister (extended in place with the
     datasetIDs found in pandaJobs for user tasks), the dataset and its
     container are created in DDM if not yet known, a location/replication
     rule is attached where one can be derived, and the dataset is added to
     its container. For managed/test tasks the generated output datasets are
     additionally (re)opened and their lifetime metadata is unset.

     :param taskSpec: task specification of the task being set up
     :param datasetToRegister: list of datasetIDs to register; may be extended in place
     :param pandaJobs: generated panda jobs, scanned for extra output/log datasetIDs
     :return: self.SC_SUCCEEDED on success, self.SC_FATAL on registration
              failure or on an unexpected exception (retTmpError is defined
              but not returned anywhere in this method)
     """
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 # (skip un-merged intermediate outputs of merge-enabled jobs)
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output','log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             # avDatasetList caches names already checked/registered in this call;
             # cnDatasetMap caches container name -> constituent dataset names
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         # panda* names are temporary datasets; unit is days
                         # per the 'lifetime={1}days' log message below
                         if targetName.startswith('panda'):
                             lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # not yet known to DDM -> create it; first resolve
                             # where it should live (location) and where a
                             # replication rule should point (locForRule)
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['',None]:
                                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None:
                                         location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken)
                                 else:
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                             # rule location falls back to the resolved location
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             # only production (managed/test) datasets carry
                             # task_id/campaign/transient metadata
                             if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None,'']:
                                     metaData['campaign'] = taskSpec.campaign 
                                 if datasetSpec.getTransient() != None:
                                     metaData['transient'] = datasetSpec.getTransient()
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                     location,
                                                                                                                     ddmBackEnd,
                                                                                                                     lifetime,
                                                                                                                     str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                lifetime=lifetime,metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             # also applies to distributed destinations of
                             # production datasets (grouping='NONE')
                             if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                     tmpLog.info('registring location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime,
                                                                                                                            activity,grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                             lifetime=lifetime,backEnd=ddmBackEnd,
                                                                             activity=activity,grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} with {2} for {1}'.format(locForRule,
                                                                                                               targetName,
                                                                                                               ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         # keep the cache in sync so later datasetIDs see it
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # open datasets
         if taskSpec.prodSourceLabel in ['managed','test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output','log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
         # return
         tmpLog.info('done')        
         return retOK
     except:
         # NOTE(review): bare except also catches SystemExit/KeyboardInterrupt;
         # kept as-is for behavioral compatibility (Python 2 code base)
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
Exemplo n.º 9
0
 def doFinalProcedure(self,taskSpec,tmpLog):
     """Apply end-of-task dataset housekeeping in DDM.

     Depending on the task status (or oldStatus when paused):
     * done/finished: set a 14-day lifetime on transient output/log datasets;
       for merge tasks, also on matching transient output datasets of the
       parent task (28 days for a finished AOD merge with low completeness).
     * done: delete output datasets that finished zero files.
     * failed/broken/aborted: set a 30-day lifetime on log datasets.

     :param taskSpec: task specification with datasetSpecList attached
     :param tmpLog: logger wrapper supplied by the caller
     :return: self.SC_SUCCEEDED (DDM errors during empty-dataset deletion are
              only logged; DDM failures elsewhere would propagate as exceptions)
     """
     tmpLog.info('final procedure for status={0} processingType={1}'.format(taskSpec.status,
                                                                            taskSpec.processingType))
     if taskSpec.status in ['done','finished'] or \
             (taskSpec.status == 'paused' and taskSpec.oldStatus in ['done','finished']):
         # lifetimes in seconds: 14 and 28 days
         trnLifeTime = 14*24*60*60
         trnLifeTimeLong = 28*24*60*60
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # set lifetime to transient datasets
         metaData = {'lifetime':trnLifeTime}
         # dataset types seen among this task's input (I) and output (O) datasets
         datasetTypeListI = set()
         datasetTypeListO = set()
         for datasetSpec in taskSpec.datasetSpecList:
             if datasetSpec.type in ['log','output']:
                 if datasetSpec.getTransient() == True:
                     tmpLog.debug('set metadata={0} to datasetID={1}:Name={2}'.format(str(metaData),
                                                                                     datasetSpec.datasetID,
                                                                                     datasetSpec.datasetName))
                     for metadataName,metadaValue in metaData.iteritems():
                         ddmIF.setDatasetMetadata(datasetSpec.datasetName,metadataName,metadaValue)
             # collect dataset types
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             if not datasetType in ['',None]:
                 if datasetSpec.type == 'input':
                     datasetTypeListI.add(datasetType)
                 elif datasetSpec.type == 'output':
                     datasetTypeListO.add(datasetType)
         # set lifetime to parent transient datasets
         if taskSpec.processingType in ['merge']:
             # get parent task
             if not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                 # get parent
                 tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(taskSpec.parent_tid,None,False)
                 if tmpStat and parentTaskSpec != None:
                     # set lifetime to parent datasets if they are transient
                     # NOTE: datasetSpec is rebound here to the PARENT's datasets
                     for datasetSpec in parentTaskSpec.datasetSpecList:
                         if datasetSpec.type in ['output']:
                             # check dataset type
                             # only types present in both this task's input and
                             # output type sets are touched
                             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                             if not datasetType in datasetTypeListI or not datasetType in datasetTypeListO:
                                 continue
                             # use longer lifetime for finished AOD merge with success rate < 90%
                             # (completeness presumably in permille, hence 900 — TODO confirm)
                             if taskSpec.status == 'finished' and datasetType == 'AOD' \
                                     and self.getTaskCompleteness(taskSpec)[-1] < 900:
                                 metaData = {'lifetime':trnLifeTimeLong}
                             else:
                                 metaData = {'lifetime':trnLifeTime}
                             tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             if tmpMetadata['transient'] == True:
                                 tmpLog.debug('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                       taskSpec.parent_tid,
                                                                                                                       datasetSpec.datasetID,
                                                                                                                       datasetSpec.datasetName))
                                 for metadataName,metadaValue in metaData.iteritems():
                                     ddmIF.setDatasetMetadata(datasetSpec.datasetName,metadataName,metadaValue)
     # delete empty datasets
     if taskSpec.status == 'done' or (taskSpec.status == 'paused' and taskSpec.oldStatus == 'done'):
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # loop over all datasets
         for datasetSpec in taskSpec.datasetSpecList:
             try:
                 if datasetSpec.type == 'output' and datasetSpec.nFilesFinished == 0:
                     tmpStat = ddmIF.deleteDataset(datasetSpec.datasetName,True,True)
                     tmpLog.debug('delete empty prod dataset {0} with {1}'.format(datasetSpec.datasetName,tmpStat))
             except:
                 # best-effort: a deletion failure must not block the final procedure
                 errtype,errvalue = sys.exc_info()[:2]
                 tmpLog.warning('failed to delete empty dataset with {0}:{1}'.format(errtype.__name__,errvalue))
     # set lifetime to failed datasets
     if taskSpec.status in ['failed','broken','aborted']:
         # lifetime in seconds: 30 days
         trnLifeTime = 30*24*60*60
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # only log datasets
         metaData = {'lifetime':trnLifeTime}
         for datasetSpec in taskSpec.datasetSpecList:
             if datasetSpec.type in ['log']:
                 tmpLog.debug('set metadata={0} to failed datasetID={1}:Name={2}'.format(str(metaData),
                                                                                        datasetSpec.datasetID,
                                                                                        datasetSpec.datasetName))
                 for metadataName,metadaValue in metaData.iteritems():
                     ddmIF.setDatasetMetadata(datasetSpec.datasetName,metadataName,metadaValue)
     return self.SC_SUCCEEDED
Exemplo n.º 10
0
 def appendJob(self, job, siteMapperCache=None):
     # event service merge
     if EventServiceUtils.isEventServiceMerge(job):
         isEventServiceMerge = True
     else:
         isEventServiceMerge = False
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # files
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn = ''
     strScopeOut = ''
     strScopeLog = ''
     logFile = ''
     logGUID = ''
     ddmEndPointIn = []
     ddmEndPointOut = []
     noOutput = []
     siteSpec = None
     inDsLfnMap = {}
     inLFNset = set()
     if siteMapperCache is not None:
         siteMapper = siteMapperCache.getObj()
         siteSpec = siteMapper.getSite(job.computingSite)
         # resolve destSE
         try:
             job.destinationSE = siteMapper.resolveNucleus(
                 job.destinationSE)
             for tmpFile in job.Files:
                 tmpFile.destinationSE = siteMapper.resolveNucleus(
                     tmpFile.destinationSE)
         except Exception:
             pass
         siteMapperCache.releaseObj()
     for file in job.Files:
         if file.type == 'input':
             if EventServiceUtils.isJumboJob(job) and file.lfn in inLFNset:
                 pass
             else:
                 inLFNset.add(file.lfn)
                 if strIFiles != '':
                     strIFiles += ','
                 strIFiles += file.lfn
                 if strDispatch != '':
                     strDispatch += ','
                 strDispatch += file.dispatchDBlock
                 if strDisToken != '':
                     strDisToken += ','
                 strDisToken += file.dispatchDBlockToken
                 strProdDBlock += '%s,' % file.prodDBlock
                 if not isEventServiceMerge:
                     strProdToken += '%s,' % file.prodDBlockToken
                 else:
                     strProdToken += '%s,' % job.metadata[1][file.lfn]
                 if strGUID != '':
                     strGUID += ','
                 strGUID += file.GUID
                 strRealDatasetIn += '%s,' % file.dataset
                 strFSize += '%s,' % file.fsize
                 if file.checksum not in ['', 'NULL', None]:
                     strCheckSum += '%s,' % file.checksum
                 else:
                     strCheckSum += '%s,' % file.md5sum
                 strScopeIn += '%s,' % file.scope
                 ddmEndPointIn.append(
                     self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken,
                                         'input', job.prodSourceLabel,
                                         job.job_label))
                 if file.dataset not in inDsLfnMap:
                     inDsLfnMap[file.dataset] = []
                 inDsLfnMap[file.dataset].append(file.lfn)
         if file.type == 'output' or file.type == 'log':
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += file.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += file.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += file.dataset
             strFileDestinationSE += '%s,' % file.destinationSE
             if file.type == 'log':
                 logFile = file.lfn
                 logGUID = file.GUID
                 strScopeLog = file.scope
             else:
                 strScopeOut += '%s,' % file.scope
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += re.sub(
                 '^ddd:', 'dst:',
                 file.destinationDBlockToken.split(',')[0])
             strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % file.prodDBlockToken
             ddmEndPointOut.append(
                 self.getDdmEndpoint(
                     siteSpec,
                     file.destinationDBlockToken.split(',')[0], 'output',
                     job.prodSourceLabel, job.job_label))
             if file.isAllowedNoOutput():
                 noOutput.append(file.lfn)
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken[:-1]
     # real output datasets
     self.data['realDatasets'] = strRealDataset
     # real output datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'], ppSteps = job.extractMultiStepExec()
     if ppSteps is not None:
         self.data.update(ppSteps)
     if job.to_encode_job_params():
         self.data['jobPars'] = base64.b64encode(
             self.data['jobPars'].encode()).decode()
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn'] = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # DDM endpoints
     try:
         self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
     except TypeError:
         self.data['ddmEndPointIn'] = ''
     try:
         self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
     except TypeError:
         self.data['ddmEndPointOut'] = ''
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig
     if ppSteps is None:
         self.data['cmtConfig'] = job.cmtConfig
     else:
         self.data['cmtConfig'] = ''
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count
     if job.coreCount in ['NULL', None]:
         self.data['coreCount'] = 1
     else:
         self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # nucleus
     self.data['nucleus'] = job.nucleus
     # walltime
     self.data['maxWalltime'] = job.maxWalltime
     # looping check
     if job.is_no_looping_check():
         self.data['loopingCheck'] = False
     # debug mode
     if job.specialHandling is not None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service or job cloning
     if EventServiceUtils.isJobCloningJob(job):
         self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
     elif EventServiceUtils.isEventServiceJob(
             job) or EventServiceUtils.isJumboJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if isEventServiceMerge:
         self.data['eventServiceMerge'] = 'True'
         # write to file for ES merge
         writeToFileStr = ''
         try:
             for outputName in job.metadata[0]:
                 inputList = job.metadata[0][outputName]
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except Exception:
             pass
         self.data['writeToFile'] = writeToFileStr
     elif job.writeInputToFile():
         try:
             # write input to file
             writeToFileStr = ''
             for inDS in inDsLfnMap:
                 inputList = inDsLfnMap[inDS]
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 writeToFileStr += 'tmpin_{0}:'.format(inDS)
                 writeToFileStr += ','.join(inputList)
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
             self.data['writeToFile'] = writeToFileStr
         except Exception:
             pass
     # replace placeholder
     if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob(
             job):
         try:
             for inDS in inDsLfnMap:
                 inputList = inDsLfnMap[inDS]
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 srcStr = 'tmpin__cnt_{0}'.format(inDS)
                 dstStr = ','.join(inputList)
                 self.data['jobPars'] = self.data['jobPars'].replace(
                     srcStr, dstStr)
         except Exception:
             pass
     # no output
     if noOutput != []:
         self.data['allowNoOutput'] = ','.join(noOutput)
     # alternative stage-out
     if job.getAltStgOut() is not None:
         self.data['altStageOut'] = job.getAltStgOut()
     # log to OS
     if job.putLogToOS():
         self.data['putLogToOS'] = 'True'
     # suppress execute string conversion
     if job.noExecStrCnv():
         self.data['noExecStrCnv'] = 'True'
     # in-file positional event number
     if job.inFilePosEvtNum():
         self.data['inFilePosEvtNum'] = 'True'
     # use prefetcher
     if job.usePrefetcher():
         self.data['usePrefetcher'] = 'True'
     # image name
     if job.container_name not in ['NULL', None]:
         self.data['container_name'] = job.container_name
     # IO
     self.data['ioIntensity'] = job.get_task_attribute('ioIntensity')
     self.data['ioIntensityUnit'] = job.get_task_attribute(
         'ioIntensityUnit')
     # HPO
     if job.is_hpo_workflow():
         self.data['isHPO'] = 'True'
     # VP
     if siteSpec is not None:
         scope_input, scope_output = DataServiceUtils.select_scope(
             siteSpec, job.prodSourceLabel, job.job_label)
         if siteSpec.use_vp(scope_input):
             self.data['useVP'] = 'True'
Exemplo n.º 11
0
 def getAvailableFiles(self,datasetSpec,siteEndPointMap,siteMapper,ngGroup=[],checkLFC=False):
     """Classify each file of a dataset by local availability at each site.

     Resolves the endpoint patterns of siteEndPointMap against ToA,
     drops no-good (NG) endpoints selected via ngGroup, collects complete
     replicas and SURL lookups via LFC, and classifies the files of
     datasetSpec per site as localdisk / localtape / cache / remote.

     :param datasetSpec: dataset spec holding .Files, .datasetName, .datasetID
     :param siteEndPointMap: {siteName: [endpoint name or '*' wildcard pattern]}
     :param siteMapper: mapper used to resolve site specs from site names
     :param ngGroup: list of NG-endpoint group IDs (1 and/or 2) to exclude;
                     the mutable default is safe here since it is only read
     :param checkLFC: force an LFC scan even when complete replicas exist
     :return: (SC_SUCCEEDED, {siteName: {'localdisk':[...],'localtape':[...],
              'cache':[...],'remote':[...]}}) or (SC_FAILED, error string)
     """
     # make logger
     methodName = 'getAvailableFiles'
     methodName += ' <datasetID={0}>'.format(datasetSpec.datasetID)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start datasetName={0}'.format(datasetSpec.datasetName))
     try:
         # list of NG endpoint name patterns (regex fragments) to exclude
         ngEndPoints = []
         if 1 in ngGroup:
             ngEndPoints += ['_SCRATCHDISK$','_LOCALGROUPDISK$','_LOCALGROUPTAPE$','_USERDISK$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
         if 2 in ngGroup:
             ngEndPoints += ['_LOCALGROUPTAPE$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
         # get all associated endpoints per site
         siteAllEndPointsMap = {}
         for siteName,endPointPattList in siteEndPointMap.iteritems():
             # get all endpoints matching with patterns 
             allEndPointList = []
             for endPointPatt in endPointPattList:
                 if '*' in endPointPatt:
                     # wildcard: translate glob '*' to regex and match all ToA sites
                     endPointPatt = endPointPatt.replace('*','.*')
                     for endPointToA in TiersOfATLAS.getAllDestinationSites():
                         if re.search('^'+endPointPatt+'$',endPointToA) != None:
                             if not endPointToA in allEndPointList:
                                 allEndPointList.append(endPointToA)
                 else:
                     # normal endpoint: only accept names known to ToA
                     if endPointPatt in TiersOfATLAS.getAllDestinationSites() and \
                            not endPointPatt in allEndPointList:
                         allEndPointList.append(endPointPatt)
             # get associated endpoints
             siteAllEndPointsMap[siteName] = []
             for endPoint in allEndPointList:
                 # append unless NG or already known for this site
                 if not self.checkNGEndPoint(endPoint,ngEndPoints) and \
                         not endPoint in siteAllEndPointsMap[siteName]:
                     siteAllEndPointsMap[siteName].append(endPoint)
                 else:
                     # already checked
                     continue
                 # get alternate name and pull in its GOC-associated endpoints
                 altName = TiersOfATLAS.getSiteProperty(endPoint,'alternateName')
                 if altName != None and altName != ['']:
                     for assEndPoint in TiersOfATLAS.resolveGOC({altName[0]:None})[altName[0]]:
                         if not assEndPoint in siteAllEndPointsMap[siteName] and \
                                not self.checkNGEndPoint(assEndPoint,ngEndPoints):
                             siteAllEndPointsMap[siteName].append(assEndPoint)
         # get replica map
         tmpStat,tmpOut = self.listDatasetReplicas(datasetSpec.datasetName)
         if tmpStat != self.SC_SUCCEEDED:
             tmpLog.error('faild to get dataset replicas with {0}'.format(tmpOut))
             # NOTE(review): Python 2 raise of the status constant itself;
             # it is converted to a failure return by the except block below
             raise tmpStat,tmpOut
         datasetReplicaMap = tmpOut
         # collect SE, LFC hosts, storage path, storage type
         lfcSeMap = {}
         storagePathMap = {}
         completeReplicaMap = {}
         siteHasCompleteReplica = False
         for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
             tmpLfcSeMap = {}
             tmpStoragePathMap = {}
             tmpSiteSpec = siteMapper.getSite(siteName)
             for tmpEndPoint in allEndPointList:
                 # storage type
                 if TiersOfATLAS.isTapeSite(tmpEndPoint):
                     storageType = 'localtape'
                 else:
                     storageType = 'localdisk'
                 # no scan when site has complete replicas
                 # (replica is complete when found count equals total count)
                 if datasetReplicaMap.has_key(tmpEndPoint) and datasetReplicaMap[tmpEndPoint][-1]['found'] != None \
                    and datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                     completeReplicaMap[tmpEndPoint] = storageType
                     siteHasCompleteReplica = True
                 # no LFC scan for many-time datasets
                 if datasetSpec.isManyTime():
                     continue
                 # get LFC
                 lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
                 # add map
                 if not tmpLfcSeMap.has_key(lfc):
                     tmpLfcSeMap[lfc] = []
                 # get SE hostname out of the srm URL
                 seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                 tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
                 if tmpMatch != None:
                     se = tmpMatch.group(1)
                     if not se in tmpLfcSeMap[lfc]:
                         tmpLfcSeMap[lfc].append(se)
                 else:
                     tmpLog.error('faild to extract SE from %s for %s:%s' % \
                                  (seStr,siteName,tmpEndPoint))
                 # get SE + path
                 seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                 tmpMatch = re.search('(srm://.+)$',seStr)
                 if tmpMatch == None:
                     tmpLog.error('faild to extract SE+PATH from %s for %s:%s' % \
                                  (seStr,siteName,tmpEndPoint))
                     continue
                 # add full path to storage map
                 tmpSePath = tmpMatch.group(1)
                 tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
                 # add compact path (strip the ':port/srm/...?SFN=' middle part)
                 tmpSePath = re.sub('(:\d+)*/srm/[^\?]+\?SFN=','',tmpSePath)
                 tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
             # add to map to trigger LFC scan if complete replica is missing at the site
             if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                 pass
             elif not siteHasCompleteReplica or checkLFC:
                 for tmpKey,tmpVal in tmpLfcSeMap.iteritems():
                     if not lfcSeMap.has_key(tmpKey):
                         lfcSeMap[tmpKey] = []
                     lfcSeMap[tmpKey] += tmpVal
                 for tmpKey,tmpVal in tmpStoragePathMap.iteritems():
                     storagePathMap[tmpKey] = tmpVal
         # collect GUIDs and LFNs
         fileMap        = {}
         lfnMap         = {}
         lfnFileSepcMap = {}
         scopeMap       = {}
         for tmpFile in datasetSpec.Files:
             fileMap[tmpFile.GUID] = tmpFile.lfn
             lfnMap[tmpFile.lfn] = tmpFile
             lfnFileSepcMap[tmpFile.lfn] = tmpFile
             scopeMap[tmpFile.lfn] = tmpFile.scope
         # get SURLs from each LFC host
         surlMap = {}
         for lfcHost,seList in lfcSeMap.iteritems():
             tmpLog.debug('lookup in LFC:{0} for {1}'.format(lfcHost,str(seList)))               
             tmpStat,tmpRetMap = self.getSURLsFromLFC(fileMap,lfcHost,seList,scopes=scopeMap)
             tmpLog.debug(str(tmpStat))
             if tmpStat != self.SC_SUCCEEDED:
                 raise RuntimeError,tmpRetMap
             for lfn,surls in tmpRetMap.iteritems():
                 if not surlMap.has_key(lfn):
                     surlMap[lfn] = surls
                 else:
                     surlMap[lfn] += surls
         # make return
         returnMap = {}
         for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
             # set default return values
             if not returnMap.has_key(siteName):
                 returnMap[siteName] = {'localdisk':[],'localtape':[],'cache':[],'remote':[]}
             # loop over all files    
             tmpSiteSpec = siteMapper.getSite(siteName)                
             # check if the file is cached
             if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                 for tmpFileSpec in datasetSpec.Files:
                     # add to cached file list
                     returnMap[siteName]['cache'].append(tmpFileSpec)
             # complete replicas: all files of the dataset count as available
             if not checkLFC:        
                 for tmpEndPoint in allEndPointList:
                     if completeReplicaMap.has_key(tmpEndPoint):
                         storageType = completeReplicaMap[tmpEndPoint]
                         returnMap[siteName][storageType] += datasetSpec.Files
         # loop over all available LFNs and map SURLs back to sites via path prefix
         avaLFNs = surlMap.keys()
         avaLFNs.sort()
         for tmpLFN in avaLFNs:
             tmpFileSpec = lfnFileSepcMap[tmpLFN]                
             # loop over all SURLs
             for tmpSURL in surlMap[tmpLFN]:
                 for tmpSePath in storagePathMap.keys():
                     # check SURL
                     if tmpSURL.startswith(tmpSePath):
                         # add
                         siteName = storagePathMap[tmpSePath]['siteName']
                         storageType = storagePathMap[tmpSePath]['storageType']
                         if not tmpFileSpec in returnMap[siteName][storageType]:
                             returnMap[siteName][storageType].append(tmpFileSpec)
                         break
         # dump per-site counts for debugging
         dumpStr = ''
         for siteName,storageTypeFile in returnMap.iteritems():
             dumpStr += '{0}:('.format(siteName)
             for storageType,fileList in storageTypeFile.iteritems():
                 dumpStr += '{0}:{1},'.format(storageType,len(fileList))
             dumpStr = dumpStr[:-1]
             dumpStr += ') '
         dumpStr= dumpStr[:-1]
         tmpLog.debug(dumpStr)
         # return
         tmpLog.info('done')            
         return self.SC_SUCCEEDED,returnMap
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return self.SC_FAILED,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
Exemplo n.º 12
0
     if False:  #outFileName is None:
         outFileName.append(tmpFile.lfn)
 if tmpFile.type in ['output', 'log']:
     fileList = []
     if False:  # tmpFile.type == 'output':# and iOut > 0:
         for i in range(8):
             newFile = copy.copy(tmpFile)
             newFile.lfn += '._00{0}'.format(i)
             fileList.append(newFile)
         #continue
     else:
         fileList.append(tmpFile)
     iOut += 1
     for file in fileList:
         file.GUID = str(uuid.uuid4())
         if DataServiceUtils.getDistributedDestination(
                 file.destinationDBlockToken) is not None:
             tmpSrcDDM = DataServiceUtils.getDistributedDestination(
                 file.destinationDBlockToken)
         elif job.computingSite == file.destinationSE and \
                 file.destinationDBlockToken in siteSpec.setokens_output[scope_output]:
             tmpSrcDDM = siteSpec.setokens_output[scope_output][
                 file.destinationDBlockToken]
         elif file.lfn in outFileName:
             tmpSrcDDM = DataServiceUtils.getDestinationSE(
                 file.destinationDBlockToken)
             if tmpSrcDDM is None:
                 tmpSrcSite = siteMapper.getSite(file.destinationSE)
                 tmp_scope_input, tmp_scope_output = select_scope(
                     siteSpec, job.prodSourceLabel)
                 tmpSrcDDM = tmpSrcSite.ddm_output[tmp_scope_output]
         else:
Exemplo n.º 13
0
    def execute(self):
        """Register the job's output/log files in their Rucio datasets.

        Groups all non-local output/log files of self.job by their
        destinationDBlock, then registers them at the computing site's
        default output RSE, retrying up to three times on temporary
        errors. On failure the job's DDM error fields are filled and
        self.result is marked fatal or temporary; on success self.result
        is marked succeeded. Always returns None (an early ``return 1``
        only aborts the flow after recording the failure).
        """
        try:
            # loop over all files
            fileMap = {}
            for fileSpec in self.job.Files:
                # ignore inputs
                if fileSpec.type not in ['output', 'log']:
                    continue
                # ignore local
                if fileSpec.destinationSE == 'local':
                    continue
                # collect file attributes
                try:
                    fsize = int(fileSpec.fsize)
                except Exception:
                    # size may be unset or non-numeric; register without it
                    fsize = None
                # set GUID if empty
                if not fileSpec.GUID:
                    fileSpec.GUID = str(uuid.uuid4())
                fileAttrs = {'guid': fileSpec.GUID,
                             'lfn': fileSpec.lfn,
                             'size': fsize,
                             'checksum': fileSpec.checksum,
                             'ds': fileSpec.destinationDBlock}
                # optional per-file info; presumably reported by the pilot
                # (TODO confirm against the caller that fills extraInfo)
                if self.extraInfo:
                    if 'surl' in self.extraInfo and fileSpec.lfn in self.extraInfo['surl']:
                        fileAttrs['surl'] = self.extraInfo['surl'][fileSpec.lfn]
                    if 'nevents' in self.extraInfo and fileSpec.lfn in self.extraInfo['nevents']:
                        fileAttrs['events'] = self.extraInfo['nevents'][fileSpec.lfn]
                fileMap.setdefault(fileSpec.destinationDBlock, [])
                fileMap[fileSpec.destinationDBlock].append(fileAttrs)
            # register files
            if fileMap:
                # all datasets are registered at the default output RSE of the site
                dstRSE = self.siteMapper.getSite(self.job.computingSite).ddm_output['default']
                destIdMap = {dstRSE: fileMap}
                nTry = 3
                for iTry in range(nTry):
                    isFatal = False
                    isFailed = False
                    regStart = datetime.datetime.utcnow()
                    try:
                        self.logger.debug('{} {}'.format('registerFilesInDatasets', str(destIdMap)))
                        out = rucioAPI.registerFilesInDataset(destIdMap, {})
                    except (DataIdentifierNotFound,
                            FileConsistencyMismatch,
                            UnsupportedOperation,
                            InvalidPath,
                            InvalidObject,
                            RSENotFound,
                            RSEProtocolNotSupported,
                            InvalidRSEExpression,
                            RSEFileNameNotSupported,
                            KeyError) as e:
                        # fatal errors: retrying cannot succeed
                        out = 'failed with {}\n {}'.format(str(e), traceback.format_exc())
                        isFatal = True
                        isFailed = True
                    except Exception as e:
                        # unknown errors
                        isFailed = True
                        out = 'failed with unknown error: {}\n {}'.format(str(e), traceback.format_exc())
                        # treat DB constraint violations / oversize values as fatal
                        # since they will recur on every retry
                        if 'value too large for column' in out or \
                                'unique constraint (ATLAS_RUCIO.DIDS_GUID_IDX) violate' in out or \
                                'unique constraint (ATLAS_RUCIO.DIDS_PK) violated' in out or \
                                'unique constraint (ATLAS_RUCIO.ARCH_CONTENTS_PK) violated' in out:
                            isFatal = True
                        else:
                            isFatal = False
                    regTime = datetime.datetime.utcnow() - regStart
                    self.logger.debug('took %s.%03d sec' % (regTime.seconds, regTime.microseconds / 1000))

                    # failed
                    if isFailed or isFatal:
                        self.logger.error('%s' % out)
                        # give up on a fatal error or once the retry budget is spent
                        if (iTry + 1) == nTry or isFatal:
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            # extract important error string
                            extractedErrStr = DataServiceUtils.extractImportantError(out)
                            errMsg = "Could not add files to DDM: "
                            if extractedErrStr == '':
                                self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
                            else:
                                self.job.ddmErrorDiag = errMsg + extractedErrStr
                            if isFatal:
                                self.result.setFatal()
                            else:
                                self.result.setTemporary()
                            return 1
                        self.logger.error("Try:%s" % iTry)
                        # sleep before the next attempt
                        time.sleep(10)
                    else:
                        self.logger.debug('%s' % str(out))
                        break
            # done
            self.result.setSucceeded()
            self.logger.debug("end plugin")
        except Exception as e:
            errStr = 'failed to execute with {}\n'.format(str(e))
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.result.setTemporary()
        # return
        return
Exemplo n.º 14
0
 def doRefine(self, jediTaskID, taskParamMap):
     """Refine a newly submitted task.

     Runs the common basic refinement, then applies task-type specific
     tweaks: DB-release datasets get repeat/nosplit attributes, child
     tasks enable input consistency checks, output filename templates get
     an attempt-number suffix, throttling is enabled, and for merge tasks
     the matching parent datasets are flagged transient.

     :param jediTaskID: task identifier (part of the refiner interface)
     :param taskParamMap: raw task parameter dictionary
     :return: self.SC_SUCCEEDED on success; exceptions are re-raised
     """
     log = self.tmpLog
     log.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         # common refinement shared by every task type
         self.doBasicRefine(taskParamMap)
         # DB releases are reused by every job and must never be split
         for secDS in self.inSecDatasetSpecList:
             if DataServiceUtils.isDBR(secDS.datasetName):
                 secDS.attributes = 'repeat,nosplit'
         # a task is a child when its parent differs from itself
         isChildTask = self.taskSpec.parent_tid not in (None, self.taskSpec.jediTaskID)
         if isChildTask:
             # child tasks verify consistency of their master input
             for masterDS in self.inMasterDatasetSpec:
                 if masterDS.isMaster() and masterDS.type == 'input':
                     masterDS.enableCheckConsistency()
         # give each output filename template a ".1" attempt suffix unless
         # it already ends in a number or is a pre-merged (.panda.um) file
         for templateList in self.outputTemplateMap.values():
             for templateMap in templateList:
                 fnTemplate = templateMap['filenameTemplate']
                 numbered = re.search(r'\.\d+$', fnTemplate) is not None
                 if not numbered and not fnTemplate.endswith('.panda.um'):
                     templateMap['filenameTemplate'] = fnTemplate + '.1'
         # collect the datatypes seen on the input side
         datasetTypeListIn = [
             dsType for dsType in
             (DataServiceUtils.getDatasetType(ds.datasetName)
              for ds in self.inMasterDatasetSpec + self.inSecDatasetSpecList)
             if dsType not in ('', None)]
         # ... and on the output side
         datasetTypeList = [
             dsType for dsType in
             (DataServiceUtils.getDatasetType(ds.datasetName)
              for ds in self.outDatasetSpecList)
             if dsType not in ('', None)]
         # opt into the task throttling mechanism unless disabled
         if 'noThrottle' not in taskParamMap:
             self.taskSpec.numThrottled = 0
         # output datasets are registered by JEDI
         self.taskSpec.setToRegisterDatasets()
         # merge child tasks mark matching parent datasets as transient
         if self.taskSpec.processingType in ['merge'] and isChildTask:
             tmpStat, parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(
                 self.taskSpec.parent_tid, None, False)
             if tmpStat and parentTaskSpec is not None:
                 metaData = {'transient': True}
                 for parentDS in parentTaskSpec.datasetSpecList:
                     if parentDS.type not in ['log', 'output']:
                         continue
                     dsType = DataServiceUtils.getDatasetType(parentDS.datasetName)
                     # only types this task both produces and consumes
                     if dsType in ('', None) or dsType not in datasetTypeList \
                             or dsType not in datasetTypeListIn:
                         continue
                     log.info(
                         'set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'
                         .format(str(metaData), self.taskSpec.parent_tid,
                                 parentDS.datasetID, parentDS.datasetName))
                     for metaKey, metaVal in metaData.items():
                         self.ddmIF.getInterface(
                             self.taskSpec.vo).setDatasetMetadata(
                                 parentDS.datasetName, metaKey, metaVal)
         # pre-staged input needs the first contents-feed cycle flagged
         if self.taskSpec.inputPreStaging():
             self.taskSpec.set_first_contents_feed(True)
     except JediException.UnknownDatasetError as e:
         log.debug('in doRefine. {0}'.format(str(e)))
         raise e
     except Exception as e:
         log.error('doRefine failed with {0} {1}'.format(
             str(e), traceback.format_exc()))
         raise e
     log.debug('done')
     return self.SC_SUCCEEDED
Exemplo n.º 15
0
 def doRefine(self,jediTaskID,taskParamMap):
     """Refine a newly submitted task (legacy Python 2 variant).

     Runs the common basic refinement, then applies task-type specific
     tweaks: DB-release datasets get repeat/nosplit attributes, child
     tasks enable input consistency checks, output filename templates get
     an attempt-number suffix, explicit storage tokens are resolved to a
     destination site, throttling is enabled, and for merge tasks the
     matching parent datasets are flagged transient.

     :param jediTaskID: task identifier (part of the refiner interface)
     :param taskParamMap: raw task parameter dictionary
     :return: self.SC_SUCCEEDED on success; exceptions are re-raised
     """
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         self.doBasicRefine(taskParamMap)
         # set nosplit+repeat for DBR
         for datasetSpec in self.inSecDatasetSpecList:
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = 'repeat,nosplit'
         # enable consistency check for child tasks
         if not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
             for datasetSpec in self.inMasterDatasetSpec:
                 if datasetSpec.isMaster() and datasetSpec.type == 'input':
                     datasetSpec.enableCheckConsistency()
         # append attempt number unless the template already ends in a
         # number or denotes a pre-merged (.panda.um) file
         for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
             for tmpOutTemplateMap in tmpOutTemplateMapList:
                 outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                 if re.search('\.\d+$',outFileTemplate) == None and not outFileTemplate.endswith('.panda.um'):
                     tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
         # extract datatype and set destination if necessary
         datasetTypeList = []
         for datasetSpec in self.outDatasetSpecList:
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             # getDatasetType can return None; exclude it as well as ''
             # so a None type cannot later match a parent dataset
             if datasetType not in ['',None]:
                 datasetTypeList.append(datasetType)
             storageToken = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
             if storageToken != None:
                 tmpSiteList = self.ddmIF.getInterface(self.taskSpec.vo).getSitesWithEndPoint(storageToken,self.siteMapper,'production')
                 if tmpSiteList == []:
                     raise RuntimeError,'cannot find online siteID associated to {0}'.format(storageToken)
                 datasetSpec.destination = tmpSiteList[0]
         # set numThrottled to use the task throttling mechanism
         if not 'noThrottle' in taskParamMap:
             self.taskSpec.numThrottled = 0
         # set to register datasets
         self.taskSpec.setToRegisterDatasets()
         # set transient to parent datasets for merge child tasks
         if self.taskSpec.processingType in ['merge'] and not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
             # get parent
             tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid,None,False)
             if tmpStat and parentTaskSpec != None:
                 # set transient to parent datasets
                 metaData = {'transient':True}
                 for datasetSpec in parentTaskSpec.datasetSpecList:
                     if datasetSpec.type in ['log','output']:
                         datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                         # guard against None in addition to '' (kept in
                         # sync with the type collection loop above)
                         if datasetType not in ['',None] and datasetType in datasetTypeList:
                             tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                   self.taskSpec.parent_tid,
                                                                                                                   datasetSpec.datasetID,
                                                                                                                   datasetSpec.datasetName))
                             for metadataName,metadaValue in metaData.iteritems():
                                 self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                              metadataName,metadaValue)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         # report the whole refine step, not just doBasicRefine: the try
         # block covers everything done above
         tmpLog.error('doRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
         raise errtype,errvalue
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
# Example 16 (scraped-page separator; original marker: "Exemplo n.º 16 / 0")
 def doCheck(self, taskSpecList):
     """Query the PanDA server for the cloud/site decision of each task.

     For legacy clouds the decision is simply the assigned core name.
     For WORLD-cloud tasks the decision is a dict with the nucleus and a
     per-dataset list of destination tokens for the output/log datasets.
     Returns (SC_SUCCEEDED, decisionMap) on success or (SC_FAILED, {})
     when the PanDA server query fails.
     """
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doCheck')
     # canned return values for failures
     retFatal = self.SC_FATAL, {}
     retTmpError = self.SC_FAILED, {}
     # index task specs by jediTaskID
     taskIdList = [spec.jediTaskID for spec in taskSpecList]
     taskSpecMap = dict([(spec.jediTaskID, spec) for spec in taskSpecList])
     # ask the PanDA server which cloud each task was assigned to
     tmpLog.debug('check with panda')
     tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error('failed to see clouds')
         return retTmpError
     # build the result map
     retMap = {}
     for taskID, coreName in cloudsInPanda.iteritems():
         tmpLog.debug('jediTaskID={0} -> {1}'.format(taskID, coreName))
         if coreName in ['NULL', '', None]:
             continue
         taskSpec = taskSpecMap[taskID]
         if not taskSpec.useWorldCloud():
             # legacy clouds: the decision is just the core name
             retMap[taskID] = coreName
             continue
         # WORLD cloud: work out per-dataset destinations
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         siteSpec = self.siteMapper.getSite(coreName)
         nucleus = siteSpec.pandasite
         # output/log datasets of the task
         tmpStat, datasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
             taskID, ['output', 'log'])
         retMap[taskID] = {'datasets': [], 'nucleus': nucleus}
         for datasetSpec in datasetSpecs:
             # distributed datasets keep their own replication rules
             if DataServiceUtils.getDistributedDestination(
                     datasetSpec.storageToken) != None:
                 continue
             # translate the storage token into a DDM endpoint
             endPoint = ddmIF.convertTokenToEndpoint(
                 siteSpec.ddm_output, datasetSpec.storageToken)
             # fall back to the default output endpoint
             if endPoint == None:
                 endPoint = siteSpec.ddm_output
             # keep the original token as a suffix
             if not datasetSpec.storageToken in ['', None]:
                 endPoint += '/{0}'.format(datasetSpec.storageToken)
             retMap[taskID]['datasets'].append({'datasetID': datasetSpec.datasetID,
                                                'token': 'dst:{0}'.format(endPoint),
                                                'destination': coreName})
     tmpLog.debug('ret {0}'.format(str(retMap)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, retMap
# Example 17 (scraped-page separator; original marker: "Exemplo n.º 17 / 0")
def main(backGround=False):
    """Entry point: fork a worker that consumes Rucio dataset-callback
    events from ActiveMQ brokers and hands them to DatasetCallbackListener.

    The parent process only waits for the child; the child sets up the
    task buffer, site mapper, and one STOMP connection per resolved broker,
    then keeps the subscriptions alive in an endless loop.

    :param backGround: unused in this body — NOTE(review): confirm whether
                       callers rely on it elsewhere
    """
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    # hard overall timeout: SIGALRM fires catch_sig after overallTimeout sec
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        # parent: block until the child exits, then fall through and return
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir
        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        # a bad certificate is only logged; execution continues regardless
        if not certOK:
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from pandaserver.taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.panda.rucio.events'
        ssl_opts = {
            'use_ssl': True,
            'ssl_version': ssl.PROTOCOL_TLSv1,
            'ssl_cert_file': certName,
            'ssl_key_file': keyName
        }
        # resolve multiple brokers
        # gethostbyname_ex returns (hostname, aliases, ip_list); take the IPs
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # set listener
        connList = []
        for tmpBroker in brokerList:
            try:
                clientid = 'PANDA-' + socket.getfqdn() + '-' + tmpBroker
                subscription_id = 'panda-server-consumer'
                _logger.debug('setting listener %s to broker %s' %
                              (clientid, tmpBroker))
                # port 61023 — presumably the SSL-enabled ActiveMQ port; confirm
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)],
                                        **ssl_opts)
                connList.append(conn)
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" %
                              (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        # NOTE(review): clientid/subscription_id below carry the value from the
        # last loop iteration; if brokerList were empty this loop would raise
        # NameError — confirm that at least one broker is always resolved
        while True:
            for conn in connList:
                try:
                    # (re)establish any dropped connection and re-subscribe
                    if not conn.is_connected():
                        conn.set_listener(
                            'DatasetCallbackListener',
                            DatasetCallbackListener(conn, taskBuffer,
                                                    siteMapper,
                                                    subscription_id))
                        conn.start()
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue,
                                       id=subscription_id,
                                       ack='auto')
                        _logger.debug('listener %s is up and running' %
                                      clientid)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    # NOTE(review): tmpBroker here is also the last loop value,
                    # not necessarily the broker of this conn — confirm
                    _logger.error("failed to set listener on %s : %s %s" %
                                  (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            time.sleep(5)
# Example 18 (scraped-page separator; original marker: "Exemplo n.º 18 / 0")
def getAnalSitesWithData(siteList,siteMapper,ddmIF,datasetName):
    """Find which analysis sites host replicas of a dataset.

    Returns (Interaction.SC_SUCCEEDED, {siteName: {endpoint: {'tape':...,
    'state':...}}}) on success, or (exceptionType, errorMessage) when a
    DDM lookup fails.
    """
    # resolve replicas first; bail out with the error type on failure
    try:
        dsReplicas = ddmIF.listDatasetReplicas(datasetName)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    result = {}
    for siteName in siteList:
        siteSpec = siteMapper.getSite(siteName)
        # remember endpoint prefixes already handled for this site
        seenPrefixes = []
        for endPoint in [siteSpec.ddm] + siteSpec.setokens.values():
            # skip empty endpoint names
            if endPoint == '':
                continue
            # collapse endpoints that differ only in the trailing token
            prefix = re.sub('_[^_]+$','_',endPoint)
            if prefix in seenPrefixes:
                continue
            # DBR cached at the site: treat as a complete disk replica
            # without any replica lookup
            if DataServiceUtils.isCachedFile(datasetName,siteSpec):
                result.setdefault(siteName,{})[endPoint] = {'tape':False,'state':'complete'}
                seenPrefixes.append(prefix)
                continue
            seenPrefixes.append(prefix)
            sePattern = '^' + prefix
            for seName in dsReplicas.keys():
                # match storage elements by the endpoint prefix
                if re.search(sePattern,seName) == None:
                    continue
                # check archived metadata
                # FIXME
                pass
                # tape attribute of the storage element
                try:
                    onTape = ddmIF.getSiteProperty(seName,'tape')
                except:
                    errtype,errvalue = sys.exc_info()[:2]
                    return errtype,'ddmIF.getSiteProperty for %s:tape failed with %s' % (seName,errvalue)
                # completeness from the latest replica statistics
                stats = dsReplicas[seName][-1]
                if stats['found'] == None:
                    state = 'unknown'
                    # trigger a consistency check; best-effort only
                    try:
                        ddmIF.checkDatasetConsistency(seName,datasetName)
                    except:
                        pass
                elif stats['total'] == stats['found']:
                    state = 'complete'
                else:
                    state = 'incomplete'
                result.setdefault(siteName,{})[seName] = {'tape':onTape,'state':state}
    # return
    return Interaction.SC_SUCCEEDED,result
# Example 19 (scraped-page separator; original marker: "Exemplo n.º 19 / 0")
 def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
     """Set up output/log datasets in DDM for a task.

     Registers datasets/containers unknown to DDM (with location,
     lifetime, and metadata), registers location rules for user or
     distributed datasets, attaches datasets to their containers, marks
     each dataset spec 'registered' in the task buffer, and re-opens
     datasets for managed/test tasks.

     :param taskSpec: task specification (VO, labels, cloud, user, ...)
     :param datasetToRegister: datasetIDs that need registration; extended
                               in place for user tasks
     :param pandaJobs: generated panda jobs, scanned for extra datasetIDs
                       (user tasks) and datasets to open (prod tasks)
     :return: SC_SUCCEEDED on success, SC_FATAL on any failure
     """
     # make logger
     tmpLog = MsgWrapper(logger,
                         "<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(
         taskSpec.prodSourceLabel, taskSpec.taskType))
     # returns
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output', 'log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(
                                         tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(
                 str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             # avDatasetList caches names already registered in this call;
             # cnDatasetMap caches container -> constituent-dataset lists
             avDatasetList = []
             cnDatasetMap = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info(
                     'getting datasetSpec with datasetID={0}'.format(
                         datasetID))
                 tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                     taskSpec.jediTaskID, datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                 # check if dataset and container are available in DDM
                 for targetName in [
                         datasetSpec.datasetName, datasetSpec.containerName
                 ]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         # short lifetime for temporary panda.* datasets;
                         # 14 presumably means days — confirm against DDM API
                         if targetName.startswith('panda'):
                             lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['', None]:
                                     if DataServiceUtils.getDistributedDestination(
                                             datasetSpec.storageToken
                                     ) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken
                                     ) != None:
                                         location = DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(
                                             taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(
                                             tmpT1Name,
                                             datasetSpec.storageToken)
                                 else:
                                     location = siteMapper.getDdmEndpoint(
                                         datasetSpec.site,
                                         datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             # only real datasets of managed/test tasks carry
                             # task_id/campaign/transient metadata
                             if taskSpec.prodSourceLabel in [
                                     'managed', 'test'
                             ] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None, '']:
                                     metaData[
                                         'campaign'] = taskSpec.campaign
                                 if datasetSpec.getTransient() != None:
                                     metaData[
                                         'transient'] = datasetSpec.getTransient(
                                         )
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info(
                                 'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                 .format(targetName, location, ddmBackEnd,
                                         lifetime, str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(
                                 targetName,
                                 backEnd=ddmBackEnd,
                                 location=location,
                                 lifetime=lifetime,
                                 metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error(
                                     'failed to register {0}'.format(
                                         targetName))
                                 return retFatal
                             # procedures for user
                             if userSetup or DataServiceUtils.getDistributedDestination(
                                     datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in [
                                         '', None
                                 ]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(
                                         datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(
                                         taskSpec.prodSourceLabel)
                                     tmpLog.info(
                                         'registring location={0} lifetime={1}days activity={2} grouping={3}'
                                         .format(locForRule, lifetime,
                                                 activity, grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(
                                         targetName,
                                         locForRule,
                                         owner=userName,
                                         lifetime=lifetime,
                                         backEnd=ddmBackEnd,
                                         activity=activity,
                                         grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error(
                                             'failed to register location {0} with {2} for {1}'
                                             .format(
                                                 locForRule, targetName,
                                                 ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(
                                 targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[
                             datasetSpec.
                             containerName] = ddmIF.listDatasetsInContainer(
                                 datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[
                             datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                         tmpStat = ddmIF.addDatasetsToContainer(
                             datasetSpec.containerName,
                             [datasetSpec.datasetName],
                             backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(
                                 datasetSpec.datasetName,
                                 datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(
                             datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(
                     datasetSpec, {
                         'jediTaskID': taskSpec.jediTaskID,
                         'datasetID': datasetID
                     })
         # open datasets
         if taskSpec.prodSourceLabel in ['managed', 'test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output', 'log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(
                                 tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset, 'lifetime', None)
         # return
         tmpLog.info('done')
         return retOK
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(
             errtype.__name__, errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
# Example 20 (scraped-page separator; original marker: "Exemplo n.º 20 / 0")
def getSitesWithData(siteMapper,ddmIF,datasetName,storageToken=None):
    """Find T1 endpoints and T2 sites per cloud that host the dataset.

    :param siteMapper: site mapper providing cloud configuration
    :param ddmIF: DDM interface for replica/metadata lookups
    :param datasetName: dataset or container name (container ends with '/')
    :param storageToken: optional space token to filter T1 endpoints
    :return: (Interaction.SC_SUCCEEDED, {cloud: {'t1': {se: {'tape':...,
             'state':...}}, 't2': [sites]}}) on success, or
             (exceptionType, errorMessage) when a DDM lookup fails
    """
    # number of constituent datasets: a container replica is complete only
    # when it holds at least this many datasets
    try:
        if not datasetName.endswith('/'):
            totalNumDatasets = 1
        else:
            tmpDsMap = ddmIF.listDatasetsInContainer(datasetName)
            totalNumDatasets = len(tmpDsMap)
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        # bug fix: the message previously blamed ddmIF.getFilesInDataset
        return errtype,'ddmIF.listDatasetsInContainer failed with %s' % errvalue
    # get replicas
    try:
        replicaMap = {}
        replicaMap[datasetName] = ddmIF.listDatasetReplicas(datasetName)
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    # loop over all clouds
    retMap = {}
    for tmpCloudName in siteMapper.cloudSpec.keys():
        retMap[tmpCloudName] = {'t1':{},'t2':[]}
        # get T1 DDM endpoints
        tmpCloudSpec = siteMapper.getCloud(tmpCloudName)
        # FIXME until CERN-PROD_TZERO is added to cloudconfig.tier1SE
        if tmpCloudName == 'CERN':
            if not 'CERN-PROD_TZERO' in tmpCloudSpec['tier1SE']:
                tmpCloudSpec['tier1SE'].append('CERN-PROD_TZERO')
        for tmpSePat in tmpCloudSpec['tier1SE']:
            # tier1SE entries may use '*' wildcards; convert to regexp
            if '*' in tmpSePat:
                tmpSePat = tmpSePat.replace('*','.*')
            tmpSePat = '^' + tmpSePat +'$'
            for tmpSE in replicaMap[datasetName].keys():
                # check name with regexp pattern
                if re.search(tmpSePat,tmpSE) == None:
                    continue
                # check space token
                if not storageToken in ['',None,'NULL']:
                    seStr = ddmIF.getSiteProperty(tmpSE,'srm')
                    try:
                        if seStr.split(':')[1] != storageToken:
                            continue
                    except Exception:
                        # malformed/missing srm string: don't filter on it
                        pass
                # check archived metadata
                # FIXME 
                pass
                # check tape attribute
                try:
                    tmpOnTape = ddmIF.getSiteProperty(tmpSE,'tape')
                except Exception:
                    errtype,errvalue = sys.exc_info()[:2]
                    return errtype,'ddmIF.getSiteProperty for %s:tape failed with %s' % (tmpSE,errvalue)
                # check completeness from the latest replica statistics
                tmpStatistics = replicaMap[datasetName][tmpSE][-1]
                if tmpStatistics['found'] == None:
                    tmpDatasetStatus = 'unknown'
                    # refresh request; best-effort only
                    try:
                        ddmIF.checkDatasetConsistency(tmpSE,datasetName)
                    except Exception:
                        pass
                elif tmpStatistics['total'] == tmpStatistics['found'] and tmpStatistics['total'] >= totalNumDatasets:
                    tmpDatasetStatus = 'complete'
                else:
                    tmpDatasetStatus = 'incomplete'
                # append
                retMap[tmpCloudName]['t1'][tmpSE] = {'tape':tmpOnTape,'state':tmpDatasetStatus}
        # get T2 list
        tmpSiteList = DataServiceUtils.getSitesWithDataset(datasetName,siteMapper,replicaMap,
                                                           tmpCloudName,useHomeCloud=True,
                                                           useOnlineSite=True,includeT1=False)
        # append
        retMap[tmpCloudName]['t2'] = tmpSiteList
        # remove if empty
        if len(retMap[tmpCloudName]['t1']) == 0 and len(retMap[tmpCloudName]['t2']) == 0:
            del retMap[tmpCloudName]
    # return
    return Interaction.SC_SUCCEEDED,retMap
 def doFinalProcedure(self, taskSpec, tmpLog):
     """Post-completion housekeeping on a task's DDM datasets.

     Depending on the final task status this: sets a lifetime on transient
     output/log datasets, extends the lifetime of transient parent datasets
     consumed by a merge task, deletes empty output datasets, and sets a
     lifetime on log datasets of failed/broken/aborted tasks.

     :param taskSpec: finished task specification with datasetSpecList
     :param tmpLog: logger wrapper supplied by the caller
     :return: SC_SUCCEEDED (always)
     """
     tmpLog.info('final procedure for status={0} processingType={1}'.format(
         taskSpec.status, taskSpec.processingType))
     if taskSpec.status in ['done','finished'] or \
             (taskSpec.status == 'paused' and taskSpec.oldStatus in ['done','finished']):
         # lifetimes in seconds
         trnLifeTime = 14 * 24 * 60 * 60
         # NOTE(review): trnLifeTimeLong is never used below — confirm
         trnLifeTimeLong = 28 * 24 * 60 * 60
         trnLifeTimeMerge = 60 * 24 * 60 * 60
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # set lifetime to transient datasets
         metaData = {'lifetime': trnLifeTime}
         datasetTypeListI = set()
         datasetTypeListO = set()
         for datasetSpec in taskSpec.datasetSpecList:
             if datasetSpec.type in ['log', 'output']:
                 if datasetSpec.getTransient() == True:
                     tmpLog.debug(
                         'set metadata={0} to datasetID={1}:Name={2}'.
                         format(str(metaData), datasetSpec.datasetID,
                                datasetSpec.datasetName))
                     for metadataName, metadaValue in metaData.iteritems():
                         ddmIF.setDatasetMetadata(datasetSpec.datasetName,
                                                  metadataName, metadaValue)
             # collect dataset types
             datasetType = DataServiceUtils.getDatasetType(
                 datasetSpec.datasetName)
             if not datasetType in ['', None]:
                 if datasetSpec.type == 'input':
                     datasetTypeListI.add(datasetType)
                 elif datasetSpec.type == 'output':
                     datasetTypeListO.add(datasetType)
         # set lifetime to parent transient datasets
         if taskSpec.processingType in ['merge'] and \
                 (taskSpec.status == 'done' or \
                      (taskSpec.status == 'paused' and taskSpec.oldStatus == 'done')):
             # get parent task
             if not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                 # get parent
                 tmpStat, parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(
                     taskSpec.parent_tid, None, False)
                 if tmpStat and parentTaskSpec != None:
                     # set lifetime to parent datasets if they are transient
                     for datasetSpec in parentTaskSpec.datasetSpecList:
                         if datasetSpec.type in ['output']:
                             # check dataset type
                             datasetType = DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName)
                             # NOTE(review): this 'or' skips a type unless it is
                             # in BOTH input and output lists — possibly 'and'
                             # was intended; confirm before changing
                             if not datasetType in datasetTypeListI or not datasetType in datasetTypeListO:
                                 continue
                             metaData = {'lifetime': trnLifeTimeMerge}
                             tmpMetadata = ddmIF.getDatasetMetaData(
                                 datasetSpec.datasetName)
                             if tmpMetadata['transient'] == True:
                                 tmpLog.debug(
                                     'set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'
                                     .format(str(metaData),
                                             taskSpec.parent_tid,
                                             datasetSpec.datasetID,
                                             datasetSpec.datasetName))
                                 for metadataName, metadaValue in metaData.iteritems(
                                 ):
                                     ddmIF.setDatasetMetadata(
                                         datasetSpec.datasetName,
                                         metadataName, metadaValue)
     # delete empty datasets
     if taskSpec.status == 'done' or (taskSpec.status == 'paused'
                                      and taskSpec.oldStatus == 'done'):
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # loop over all datasets
         for datasetSpec in taskSpec.datasetSpecList:
             try:
                 if datasetSpec.type == 'output' and datasetSpec.nFilesFinished == 0:
                     tmpStat = ddmIF.deleteDataset(datasetSpec.datasetName,
                                                   True, True)
                     tmpLog.debug(
                         'delete empty prod dataset {0} with {1}'.format(
                             datasetSpec.datasetName, tmpStat))
             except:
                 errtype, errvalue = sys.exc_info()[:2]
                 # deletion failure is non-fatal; just warn and continue
                 tmpLog.warning(
                     'failed to delete empty dataset with {0}:{1}'.format(
                         errtype.__name__, errvalue))
     # set lifetime to failed datasets
     if taskSpec.status in ['failed', 'broken', 'aborted']:
         # 30 days, in seconds
         trnLifeTime = 30 * 24 * 60 * 60
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # only log datasets
         metaData = {'lifetime': trnLifeTime}
         for datasetSpec in taskSpec.datasetSpecList:
             if datasetSpec.type in ['log']:
                 tmpLog.debug(
                     'set metadata={0} to failed datasetID={1}:Name={2}'.
                     format(str(metaData), datasetSpec.datasetID,
                            datasetSpec.datasetName))
                 for metadataName, metadaValue in metaData.iteritems():
                     ddmIF.setDatasetMetadata(datasetSpec.datasetName,
                                              metadataName, metadaValue)
     return self.SC_SUCCEEDED
# Example 22 (scraped-page separator; original marker: "Exemplo n.º 22 / 0")
 def doRefine(self, jediTaskID, taskParamMap):
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         # preprocessing
         tmpStat, taskParamMap = self.doPreProRefine(taskParamMap)
         if tmpStat == True:
             tmpLog.debug('done for preprocessing')
             return self.SC_SUCCEEDED
         if tmpStat == False:
             # failed
             tmpLog.error('doPreProRefine failed')
             return self.SC_FAILED
         # normal refine
         self.doBasicRefine(taskParamMap)
         # set nosplit+repeat for DBR
         for datasetSpec in self.inSecDatasetSpecList:
             # get the latest version of DBR
             if datasetSpec.datasetName == 'DBR_LATEST':
                 tmpLog.debug('resolving real name for {0}'.format(
                     datasetSpec.datasetName))
                 datasetSpec.datasetName = self.ddmIF.getInterface(
                     self.taskSpec.vo).getLatestDBRelease(
                         useResultCache=3600)
                 datasetSpec.containerName = datasetSpec.datasetName
             # set attributes to DBR
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = 'repeat,nosplit'
         # check invalid characters
         for datasetSpec in self.outDatasetSpecList:
             if not DataServiceUtils.checkInvalidCharacters(
                     datasetSpec.datasetName):
                 errStr = "invalid characters in {0}".format(
                     datasetSpec.datasetName)
                 tmpLog.error(errStr)
                 self.taskSpec.setErrDiag(errStr, None)
                 return self.SC_FATAL
         # destination
         if taskParamMap.has_key('destination'):
             for datasetSpec in self.outDatasetSpecList:
                 datasetSpec.destination = taskParamMap['destination']
         # use build
         if taskParamMap.has_key('buildSpec'):
             self.setSplitRule(None, 1,
                               JediTaskSpec.splitRuleToken['useBuild'])
         # use template dataset
         self.setSplitRule(None, 1,
                           JediTaskSpec.splitRuleToken['instantiateTmpl'])
         self.setSplitRule(
             None, 1, JediTaskSpec.splitRuleToken['instantiateTmplSite'])
         for datasetSpec in self.outDatasetSpecList:
             datasetSpec.type = "tmpl_{0}".format(datasetSpec.type)
         # get jobsetID
         tmpStat, tmpJobID = self.taskBufferIF.getUserJobsetID_JEDI(
             self.taskSpec.userName)
         if not tmpStat:
             tmpLog.error('failed to get jobsetID failed')
             return self.SC_FAILED
         self.taskSpec.reqID = tmpJobID
         # site limitation
         if 'excludedSite' in taskParamMap and 'includedSite' in taskParamMap:
             self.taskSpec.setLimitedSites('incexc')
         elif 'excludedSite' in taskParamMap:
             self.taskSpec.setLimitedSites('exc')
         elif 'includedSite' in taskParamMap:
             self.taskSpec.setLimitedSites('inc')
     except:
         errtype, errvalue = sys.exc_info()[:2]
         errStr = 'doRefine failed with {0}:{1}'.format(
             errtype.__name__, errvalue)
         tmpLog.error(errStr)
         self.taskSpec.setErrDiag(errStr, None)
         raise errtype, errvalue
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Exemplo n.º 23
0
 def doRefine(self,jediTaskID,taskParamMap):
     """Refine a production-style task after generic parameter parsing.

     Performs the basic refine, then applies DBR attributes, parent-task
     consistency checks, attempt numbers on output filename templates,
     task throttling, dataset registration, and (for merge tasks) marks
     matching parent datasets as transient in DDM.

     :param jediTaskID: JEDI task ID (used for logging context only)
     :param taskParamMap: dict of task parameters
     :return: self.SC_SUCCEEDED; internal errors are logged and re-raised
     """
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         # basic refine
         self.doBasicRefine(taskParamMap)
         # DB releases are reused by every job: repeat and never split
         for datasetSpec in self.inSecDatasetSpecList:
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = 'repeat,nosplit'
         # enable consistency check on master input datasets when this
         # task has a real parent (parent_tid set and not itself)
         if not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
             for datasetSpec in self.inMasterDatasetSpec:
                 if datasetSpec.isMaster() and datasetSpec.type == 'input':
                     datasetSpec.enableCheckConsistency()
         # append attempt number '.1' unless the template already ends
         # with '.<digits>' or with '.panda.um'
         for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
             for tmpOutTemplateMap in tmpOutTemplateMapList:
                 outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                 if re.search('\.\d+$',outFileTemplate) == None and not outFileTemplate.endswith('.panda.um'):
                     tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
         # extract input datatypes (used below to match parent datasets)
         datasetTypeListIn = []
         for datasetSpec in self.inMasterDatasetSpec+self.inSecDatasetSpecList:
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             if not datasetType in ['',None]:
                 datasetTypeListIn.append(datasetType)
         # extract output datatypes
         datasetTypeList = []
         for datasetSpec in self.outDatasetSpecList:
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             if not datasetType in ['',None]:
                 datasetTypeList.append(datasetType)
         # set numThrottled to use the task throttling mechanism
         if not 'noThrottle' in taskParamMap:
             self.taskSpec.numThrottled = 0
         # set to register datasets
         self.taskSpec.setToRegisterDatasets()
         # for merge tasks with a real parent, set transient=True on the
         # parent's log/output datasets whose datatype appears in both
         # this task's input and output datatype lists
         if self.taskSpec.processingType in ['merge'] and not self.taskSpec.parent_tid in [None,self.taskSpec.jediTaskID]:
             # get parent
             tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid,None,False)
             if tmpStat and parentTaskSpec != None:
                 # set transient to parent datasets
                 metaData = {'transient':True}
                 for datasetSpec in parentTaskSpec.datasetSpecList:
                     if datasetSpec.type in ['log','output']:
                         datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                         if not datasetType in ['',None] and datasetType in datasetTypeList and datasetType in datasetTypeListIn:
                             tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                   self.taskSpec.parent_tid,
                                                                                                                   datasetSpec.datasetID,
                                                                                                                   datasetSpec.datasetName))
                             for metadataName,metadaValue in metaData.iteritems():
                                 self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                              metadataName,metadaValue)
     except:
         # NOTE(review): the message says doBasicRefine but this except
         # actually covers the whole refine sequence above
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
         raise errtype,errvalue
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Exemplo n.º 24
0
 def doActionForReassgin(self, gTmpLog):
     """Reassign tasks that were flagged for reassignment.

     For each task: update the cloud/nucleus bookkeeping, subscribe its
     output/log datasets to the target T1/nucleus endpoint, and restore
     the task to an active status. Tasks on the world cloud without a
     nucleus are instead sent back to task brokerage.
     (NOTE: the method name typo 'Reassgin' is kept for interface
     compatibility with existing callers.)

     :param gTmpLog: logger used for per-cycle messages
     """
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to get reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(
         self.vo, self.prodSourceLabel)
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         # NOTE(review): the prefix below is missing the closing '>'
         tmpLog = MsgWrapper(logger,
                             '<jediTaskID={0}'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # get the task's output/log datasets
         tmpStat, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
             taskSpec.jediTaskID, ['output', 'log'])
         if tmpStat != True:
             tmpLog.error('failed to get datasets')
             continue
         # update DB
         if not taskSpec.useWorldCloud():
             # update cloudtasks
             tmpStat = self.taskBufferIF.setCloudTaskByUser(
                 'jedi', taskSpec.jediTaskID, taskSpec.cloud, 'assigned',
                 True)
             if tmpStat != 'SUCCEEDED':
                 tmpLog.error('failed to update CloudTasks')
                 continue
             # check cloud
             if not siteMapper.checkCloud(taskSpec.cloud):
                 tmpLog.error("cloud={0} doesn't exist".format(
                     taskSpec.cloud))
                 continue
         else:
             # world cloud without a nucleus: re-run task brokerage
             # instead of assigning a destination here
             if taskSpec.nucleus in [None, '']:
                 taskSpec.status = 'assigning'
                 taskSpec.oldStatus = None
                 taskSpec.setToRegisterDatasets()
                 self.taskBufferIF.updateTask_JEDI(
                     taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                     setOldModTime=True)
                 tmpLog.debug(
                     'set task_status={0} to trigger task brokerage again'.
                     format(taskSpec.status))
                 continue
             # get nucleus
             nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
             if nucleusSpec == None:
                 tmpLog.error("nucleus={0} doesn't exist".format(
                     taskSpec.nucleus))
                 continue
             # set nucleus
             retMap = {
                 taskSpec.jediTaskID:
                 AtlasBrokerUtils.getDictToSetNucleus(
                     nucleusSpec, datasetSpecList)
             }
             tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
         # get T1/nucleus destination site
         if not taskSpec.useWorldCloud():
             t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         else:
             t1SiteName = nucleusSpec.getOnePandaSite()
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets; any subscription failure aborts the
         # task so its status is left unchanged
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             # distributed datasets are not subscribed to a single endpoint
             if DataServiceUtils.getDistributedDestination(
                     datasetSpec.storageToken) != None:
                 tmpLog.debug('skip {0} is distributed'.format(
                     datasetSpec.datasetName))
                 continue
             # get location
             location = siteMapper.getDdmEndpoint(t1Site.sitename,
                                                  datasetSpec.storageToken)
             # make subscription
             try:
                 tmpLog.debug(
                     'registering subscription to {0} with backend={1}'.
                     format(location, ddmBackEnd))
                 tmpStat = ddmIF.registerDatasetSubscription(
                     datasetSpec.datasetName,
                     location,
                     'Production Output',
                     asynchronous=True)
                 if tmpStat != True:
                     tmpLog.error("failed to make subscription")
                     isOK = False
                     break
             except:
                 errtype, errvalue = sys.exc_info()[:2]
                 tmpLog.warning(
                     'failed to make subscription with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                 isOK = False
                 break
         # succeeded
         if isOK:
             # activate task: restore the pre-reassignment status, or
             # 'ready' if there is no usable previous status
             if taskSpec.oldStatus in ['assigning', 'exhausted', None]:
                 taskSpec.status = 'ready'
             else:
                 taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(
                 taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                 setOldModTime=True)
             tmpLog.debug('finished to reassign')
Exemplo n.º 25
0
 def doRefine(self, jediTaskID, taskParamMap):
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug("start taskType={0}".format(self.taskSpec.taskType))
     try:
         self.doBasicRefine(taskParamMap)
         # set nosplit+repeat for DBR
         for datasetSpec in self.inSecDatasetSpecList:
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = "repeat,nosplit"
         # enable consistency check
         if not self.taskSpec.parent_tid in [None, self.taskSpec.jediTaskID]:
             for datasetSpec in self.inMasterDatasetSpec:
                 if datasetSpec.isMaster() and datasetSpec.type == "input":
                     datasetSpec.enableCheckConsistency()
         # append attempt number
         for tmpKey, tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
             for tmpOutTemplateMap in tmpOutTemplateMapList:
                 outFileTemplate = tmpOutTemplateMap["filenameTemplate"]
                 if re.search("\.\d+$", outFileTemplate) == None and not outFileTemplate.endswith(".panda.um"):
                     tmpOutTemplateMap["filenameTemplate"] = outFileTemplate + ".1"
         # extract datatype and set destination if nessesary
         datasetTypeList = []
         for datasetSpec in self.outDatasetSpecList:
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             if datasetType != "":
                 datasetTypeList.append(datasetType)
             storageToken = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
             if storageToken != None:
                 tmpSiteList = self.ddmIF.getInterface(self.taskSpec.vo).getSitesWithEndPoint(
                     storageToken, self.siteMapper, "production"
                 )
                 if tmpSiteList == []:
                     raise RuntimeError, "cannot find online siteID associated to {0}".format(storageToken)
                 datasetSpec.destination = tmpSiteList[0]
         # set numThrottled to use the task throttling mechanism
         if not "noThrottle" in taskParamMap:
             self.taskSpec.numThrottled = 0
         # set to register datasets
         self.taskSpec.setToRegisterDatasets()
         # set transient to parent datasets
         if self.taskSpec.processingType in ["merge"] and not self.taskSpec.parent_tid in [
             None,
             self.taskSpec.jediTaskID,
         ]:
             # get parent
             tmpStat, parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(
                 self.taskSpec.parent_tid, None, False
             )
             if tmpStat and parentTaskSpec != None:
                 # set transient to parent datasets
                 metaData = {"transient": True}
                 for datasetSpec in parentTaskSpec.datasetSpecList:
                     if datasetSpec.type in ["log", "output"]:
                         datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                         if datasetType != "" and datasetType in datasetTypeList:
                             tmpLog.info(
                                 "set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}".format(
                                     str(metaData),
                                     self.taskSpec.parent_tid,
                                     datasetSpec.datasetID,
                                     datasetSpec.datasetName,
                                 )
                             )
                             for metadataName, metadaValue in metaData.iteritems():
                                 self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(
                                     datasetSpec.datasetName, metadataName, metadaValue
                                 )
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error("doBasicRefine failed with {0}:{1}".format(errtype.__name__, errvalue))
         raise errtype, errvalue
     tmpLog.debug("done")
     return self.SC_SUCCEEDED
Exemplo n.º 26
0
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     sitePreAssigned = False
     if not taskSpec.site in ['',None]:
         sitePreAssigned = True
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         sitePreAssigned = True
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # T1 
     t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
     # hospital sites
     if self.hospitalQueueMap.has_key(cloudName):
         t1Sites += self.hospitalQueueMap[cloudName]
     # MP    
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check site status
             skipFlag = False
             if tmpSiteSpec.status != 'online':
                 skipFlag = True
             if not skipFlag:    
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to validatedreleases != True' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for high priorities
     if (taskSpec.currentPriority >= 950 or inputChunk.useScout()) and useMP != 'only' and not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip {0} due to highPrio/scouts which needs to run at {1} T1'.format(tmpSiteName,
                                                                                                      cloudName))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for data availability
     if not sitePreAssigned:
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             # ignore DBR
             if DataServiceUtils.isDBR(datasetName):
                 continue
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is avalable'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                                  self.ddmIF,datasetName,
                                                                  datasetSpec.storageToken)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
             # check if T1 has the data
             if self.dataSiteMap[datasetName].has_key(cloudName):
                 cloudHasData = True
             else:
                 cloudHasData = False
             t1hasData = False
             if cloudHasData:
                 for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                     if tmpSeVal['state'] == 'complete':
                         t1hasData = True
                         break
                 # T1 has incomplete data while no data at T2
                 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                     # use incomplete data at T1 anyway
                     t1hasData = True
             # data is missing at T1         
             if not t1hasData:
                 tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
                 # make subscription to T1
                 # FIXME
                 pass
                 # use T2 until data is complete at T1
                 newScanSiteList = []
                 for tmpSiteName in scanSiteList:                    
                     if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                         newScanSiteList.append(tmpSiteName)
                     else:
                         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                         if tmpSiteSpec.cloud != cloudName:
                             tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                         else:
                             tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
                 scanSiteList = newScanSiteList
                 tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
                 if scanSiteList == []:
                     tmpLog.error('no candidates')
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
     ######################################
     # selection for fairshare
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog):
             tmpLog.debug('  skip {0} due to zero share'.format(tmpSiteName))
             continue
         newScanSiteList.append(tmpSiteName)                
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                     newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to core mismatch site:%s != task:%s' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='CVMFS')
             #                                                         releases='nightlies',
             #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True) \
         + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                  tmpSiteSpec.maxwdir,
                                                                                  minDiskCount))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob
             if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                                     movingInputSize,outSizePerJob,
                                                                                                     nRemJobs,diskThreshold))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime * inputChunk.getMaxAtomSize(effectiveSize=True)
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip %s due to too many transferring %s > max(%s,2x%s)' % \
                          (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for T1 weight
     t1Weight = taskSpec.getT1Weight()
     if t1Weight == 0:
         # use T1 weight in cloudconfig
         t1Weight = self.siteMapper.getCloud(cloudName)['weight']
     tmpLog.debug('T1 weight {0}'.format(t1Weight))
     if t1Weight < 0:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             if not tmpSiteName in t1Sites:
                 tmpLog.debug('  skip {0} due to negative T1 weight'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for nPilot
     if not sitePreAssigned:
         nWNmap = self.taskBufferIF.getCurrentSiteData()
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             # check at the site
             nPilot = 0
             if nWNmap.has_key(tmpSiteName):
                 nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
             if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel:
                 tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # get available files
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging or not datasetSpec.isMaster():
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1],
                                                         checkCompleteness=checkCompleteness,
                                                         storageToken=datasetSpec.storageToken)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     # get max total size
     tmpTotalSizes = normalizeFactors.values()
     tmpTotalSizes.sort()
     if tmpTotalSizes != []:
         totalSize = tmpTotalSizes.pop()
     else:
         totalSize = 0
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # T1 weight
         if tmpSiteName in t1Sites:
             weight *= t1Weight
         # set weight
         siteCandidateSpec.weight = weight
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # append        
         inputChunk.addSiteCandidate(siteCandidateSpec)
         tmpLog.debug('  use {0} with weight={1}'.format(tmpSiteName,weight))
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
Exemplo n.º 27
0
def getAnalSitesWithData(siteList,siteMapper,ddmIF,datasetName):
    """Find analysis sites that hold replicas of a dataset.

    :param siteList: candidate site names to check
    :param siteMapper: mapper providing site specs via getSite()
    :param ddmIF: DDM interface used for replica lookup and SE properties
    :param datasetName: name of the dataset to locate
    :return: (Interaction.SC_SUCCEEDED,
              {siteName: {endpoint: {'tape': bool, 'state': str}}}) on success,
             or (exceptionType, errorMessage) when the replica lookup fails
    """
    # get replicas; report failures to the caller instead of raising
    try:
        replicaMap = {}
        replicaMap[datasetName] = ddmIF.listDatasetReplicas(datasetName)
    except Exception:
        errtype,errvalue = sys.exc_info()[:2]
        return errtype,'ddmIF.listDatasetReplicas failed with %s' % errvalue
    # loop over all sites
    retMap = {}
    for tmpSiteName in siteList:
        tmpSiteSpec = siteMapper.getSite(tmpSiteName)
        # loop over all DDM endpoints of the site
        checkedEndPoints = []
        for tmpDDM in [tmpSiteSpec.ddm] + tmpSiteSpec.setokens.values():
            # skip empty
            if tmpDDM == '':
                continue
            # get prefix; endpoints sharing a prefix are only checked once
            tmpPrefix = re.sub(r'_[^_]+$','_',tmpDDM)
            # already checked
            if tmpPrefix in checkedEndPoints:
                continue
            # DBR
            if DataServiceUtils.isCachedFile(datasetName,tmpSiteSpec):
                # no replica check since it is cached
                retMap.setdefault(tmpSiteName,{})
                retMap[tmpSiteName][tmpDDM] = {'tape':False,'state':'complete'}
                checkedEndPoints.append(tmpPrefix)
                continue
            checkedEndPoints.append(tmpPrefix)
            tmpSePat = '^' + tmpPrefix
            for tmpSE in replicaMap[datasetName].keys():
                # check name with regexp pattern
                if re.search(tmpSePat,tmpSE) is None:
                    continue
                # skip staging
                if re.search('STAGING$',tmpSE) is not None:
                    continue
                # check archived metadata
                # FIXME
                pass
                # check tape attribute; skip this SE if the lookup fails
                try:
                    tmpOnTape = ddmIF.getSiteProperty(tmpSE,'is_tape')
                except Exception:
                    continue
                # check completeness using the latest replica statistics entry
                tmpStatistics = replicaMap[datasetName][tmpSE][-1]
                if tmpStatistics['found'] is None:
                    tmpDatasetStatus = 'unknown'
                    # refresh request
                    pass
                elif tmpStatistics['total'] == tmpStatistics['found']:
                    tmpDatasetStatus = 'complete'
                else:
                    tmpDatasetStatus = 'incomplete'
                # append
                retMap.setdefault(tmpSiteName,{})
                retMap[tmpSiteName][tmpSE] = {'tape':tmpOnTape,'state':tmpDatasetStatus}
    # return
    return Interaction.SC_SUCCEEDED,retMap
Exemplo n.º 28
0
 def run(self):
     """Poll transferring jobs; finish those whose outputs are fully
     replicated and fail those that exceeded the transfer timeout.

     Relies on module-level taskBuffer/siteMapper/rucioAPI/_logger and on
     the ids/lock/proxyLock/pool/timeNow attributes set up by the caller.
     """
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         # jobs to update in the DB / jobs whose outputs are complete
         upJobs = []
         finJobs = []
         for job in jobs:
             # skip missing or unknown-status jobs
             if job is None or job.jobStatus == 'unknown':
                 continue
             # placeholder SE list; replaced below once a destination is resolved
             seList = ['dummy']
             tmpNucleus = siteMapper.getNucleus(job.nucleus)
             # get SEs
             if job.prodSourceLabel == 'user' and job.destinationSE not in siteMapper.siteSpecList:
                 # using --destSE for analysis job to transfer output
                 seList = [job.destinationSE]
             elif tmpNucleus is not None:
                 seList = list(tmpNucleus.allDdmEndPoints)
             elif siteMapper.checkCloud(job.cloud):
                 # normal production jobs
                 if DataServiceUtils.checkJobDestinationSE(job) is None:
                     tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                 else:
                     tmpDstID = job.destinationSE
                 tmpDstSite = siteMapper.getSite(tmpDstID)
                 scope_input, scope_output = select_scope(tmpDstSite, job.prodSourceLabel)
                 seList = tmpDstSite.ddm_endpoints_output[scope_output].getLocalEndPoints()
             # get LFN list
             lfns   = []
             guids  = []
             scopes = []
             nTokens = 0
             for file in job.Files:
                 # only output files are checked
                 if file.type == 'output' or file.type == 'log':
                     if file.status == 'nooutput':
                         continue
                     # skip files going to a distributed destination
                     if DataServiceUtils.getDistributedDestination(file.destinationDBlockToken) is not None:
                         continue
                     lfns.append(file.lfn)
                     guids.append(file.GUID)
                     scopes.append(file.scope)
                     # one expected token per comma-separated destination entry
                     nTokens += len(file.destinationDBlockToken.split(','))
             # get files in LRC
             _logger.debug("%s Cloud:%s" % (job.PandaID,job.cloud))
             tmpStat,okFiles = rucioAPI.listFileReplicas(scopes,lfns,seList)
             if not tmpStat:
                 # treat lookup failure as "no replicas found"; the job will wait or time out
                 _logger.error("%s failed to get file replicas" % job.PandaID)
                 okFiles = {}
             # count files
             nOkTokens = 0
             for okLFN in okFiles:
                 okSEs = okFiles[okLFN]
                 nOkTokens += len(okSEs)
             # check all files are ready
             _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
             if nTokens <= nOkTokens:
                 # all expected replicas exist -> mark files ready and queue for Finisher
                 _logger.debug("%s Finisher : Finish" % job.PandaID)
                 for file in job.Files:
                     if file.type == 'output' or file.type == 'log':
                         if file.status != 'nooutput':
                             file.status = 'ready'
                 # append to run Finisher
                 finJobs.append(job)
             else:
                 endTime = job.endTime
                 if endTime == 'NULL':
                     endTime = job.startTime
                 # priority-dependent timeout (value is in days, see timedelta below)
                 tmpCloudSpec = siteMapper.getCloud(job.cloud)
                 if job.currentPriority >= 800 and (job.prodSourceLabel not in ['user']):
                     if 'transtimehi' in tmpCloudSpec:
                         timeOutValue = tmpCloudSpec['transtimehi']
                     else:
                         timeOutValue = 1
                 else:
                     if 'transtimelo' in tmpCloudSpec:
                         timeOutValue = tmpCloudSpec['transtimelo']
                     else:
                         timeOutValue = 2
                 # protection
                 if timeOutValue < 1:
                     timeOutValue  = 1
                 timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                 _logger.debug("%s  Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                 if endTime < timeOut:
                     # timeout: fail the job, recording which LFNs are still missing
                     _logger.debug("%s Finisher : Kill" % job.PandaID)
                     strMiss = ''
                     for lfn in lfns:
                         if lfn not in okFiles:
                             strMiss += ' %s' % lfn
                     job.jobStatus = 'failed'
                     job.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_Transfer
                     job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                     # NOTE(review): guidMap is built here but not used later in
                     # this method — confirm it is intentional
                     guidMap = {}
                     for file in job.Files:
                         # set file status
                         if file.status == 'transferring' or file.type in ['log','output']:
                             file.status = 'failed'
                         # collect GUIDs to delete files from _tid datasets
                         if file.type == 'output' or file.type == 'log':
                             if file.destinationDBlock not in guidMap:
                                 guidMap[file.destinationDBlock] = []
                             guidMap[file.destinationDBlock].append(file.GUID)
                 else:
                     # wait
                     _logger.debug("%s Finisher : Wait" % job.PandaID)
                     for lfn in lfns:
                         if lfn not in okFiles:
                             _logger.debug("%s    -> %s" % (job.PandaID,lfn))
             upJobs.append(job)
         # update
         _logger.debug("updating ...")
         self.proxyLock.acquire()
         taskBuffer.updateJobs(upJobs,False)
         self.proxyLock.release()
         # run Finisher
         for job in finJobs:
             fThr = Finisher(taskBuffer,None,job)
             fThr.start()
             fThr.join()
         _logger.debug("done")
         time.sleep(1)
     except Exception:
         # log and swallow so the thread still unregisters itself below
         errtype,errvalue = sys.exc_info()[:2]
         errStr  = "FinisherThr failed with %s %s" % (errtype,errvalue)
         errStr += traceback.format_exc()
         _logger.error(errStr)
     # unregister from the pool and release the lock, even after a failure
     self.pool.remove(self)
     self.lock.release()
Exemplo n.º 29
0
#     print '------------------- ddm -------------------'
#     print 'ddm_input: {0}, ddm_output: {1}'.format(tmp_site_spec.ddm_input, tmp_site_spec.ddm_output)
#     print '------------------- setokens values -------------------'
#     print 'setokens_input: {0}, setokens_output: {1}'.format(tmp_site_spec.setokens_input.values(),
#                                                              tmp_site_spec.setokens_output.values())
#     print '------------------- setokens -------------------'
#     print 'setokens_input: {0}, setokens_output: {1}'.format(tmp_site_spec.setokens_input,
#                                                              tmp_site_spec.setokens_output)

# Debug script (Python 2): print the DDM endpoint prefix for every input
# endpoint of every site in the WORLD cloud.
# NOTE(review): relies on a `site_mapper` binding defined outside this
# snippet — confirm the caller provides it.
from pandaserver.dataservice import DataServiceUtils
sites = site_mapper.getCloud('WORLD')['sites']
sites.sort()
for tmp_site_name in sites:
    print 'tmp_site_name: {0}'.format(tmp_site_name)

    tmp_site_spec = site_mapper.getSite(tmp_site_name)

    #print 'tmp_site_spec.ddm_input: {0}'.format(tmp_site_spec.ddm_input)
    #print 'tmp_site_spec.setokens_input: {0}'.format(tmp_site_spec.setokens_input.values())
    #print 'combination: {0}'.format([tmp_site_spec.ddm_input] + tmp_site_spec.setokens_input.values())

    # check the primary input endpoint plus all space-token input endpoints
    for tmp_ddm_endpoint in [tmp_site_spec.ddm_input
                             ] + tmp_site_spec.setokens_input.values():
        try:
            tmp_prefix = DataServiceUtils.getDQ2Prefix(tmp_ddm_endpoint)
            print 'prefix: {0}'.format(tmp_prefix)
        except TypeError:
            # presumably raised for a None/non-string endpoint — confirm
            print 'excepted!'

    print '-------------------'
Exemplo n.º 30
0
 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                      'jedi', 'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024*1024*1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize,
                                                                                                                 thrInputNum,
                                                                                                                 thrInputSizeFrac,
                                                                                                                 thrInputNumFrac))
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.info('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info('skip transfer backlog check due to negative T1Weight')
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                         for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info('  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                              format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.info('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                             if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                 tmpLog.info('  skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                                  tmpSpaceSize,
                                                                                                                                                                  tmpSpaceToUse,
                                                                                                                                                                  diskThreshold,
                                                                                                                                                                  tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if not tmpNucleus in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                             except:
                                 tmpNew = None
                             if tmpNew != None and (tmpOld == None or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                  'free':tmpSpaceSize-tmpSpaceToUse}
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # use deep scan for primary dataset
                         if datasetSpec.isMaster():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys(),
                                                                           deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if len(nucleusList) == 1:
                                 tmpLog.info('  disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                     availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                     availableData[tmpNucleus]['tot_size'],
                                                                                                                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                       availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                       availableData[tmpNucleus]['tot_num'],
                                                                                                                                       thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         else:
                             tmpLog.info('  disable data locality check since no nucleus has input data')
                         tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                 weight /= float(availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                             # fraction of free space
                             if tmpNucleus in fractionFreeSpace:
                                 try:
                                     tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                         float(fractionFreeSpace[tmpNucleus]['total'])
                                     weight *= tmpFrac
                                     wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                          fractionFreeSpace[tmpNucleus]['total'])
                                 except:
                                     pass
                         tmpLog.info('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task.status=ready'
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
Exemplo n.º 31
0
 def doActionForReassgin(self,gTmpLog):
     """Reassign tasks to a new cloud or nucleus.

     For each task returned by getTasksToReassign_JEDI this method
     (1) fetches the task's output/log datasets,
     (2) updates brokerage bookkeeping - the CloudTasks table for
         legacy (non world-cloud) tasks, or the nucleus destinations
         for world-cloud tasks (re-triggering task brokerage when no
         nucleus is set yet),
     (3) registers DDM subscriptions to move each non-distributed
         output/log dataset to the destination endpoint, and
     (4) on success restores the task status so the task can proceed.

     NOTE(review): the method name keeps the historical misspelling
     'Reassgin' since external callers reference it as-is.

     :param gTmpLog: logger used for cycle-level (non per-task) messages
     """
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to get reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         # per-task logger tagged with the jediTaskID
         tmpLog = MsgWrapper(logger,'<jediTaskID={0}'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # get output/log datasets - these are what must follow the task to its new destination
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
         if tmpStat != True:
             tmpLog.error('failed to get datasets')
             continue
         # update DB
         if not taskSpec.useWorldCloud():
             # update cloudtasks (legacy cloud model)
             tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
             if tmpStat != 'SUCCEEDED':
                 tmpLog.error('failed to update CloudTasks')
                 continue
             # check cloud
             if not siteMapper.checkCloud(taskSpec.cloud):
                 tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                 continue
         else:
             # re-run task brokerage when no nucleus is assigned yet
             if taskSpec.nucleus in [None,'']:
                 taskSpec.status = 'assigning'
                 taskSpec.oldStatus = None
                 taskSpec.setToRegisterDatasets()
                 self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                   setOldModTime=True)
                 tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                 continue
             # get nucleus
             nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
             if nucleusSpec == None:
                 tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                 continue
             # set nucleus destinations for the output/log datasets
             retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
             tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
         # get T1/nucleus site to which output data should be subscribed
         if not taskSpec.useWorldCloud():
             t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         else:
             t1SiteName = nucleusSpec.getOnePandaSite()
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             # distributed datasets have no single destination - nothing to subscribe
             if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                 tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                 continue
             # get location (DDM endpoint matching the dataset's storage token)
             location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
             # make subscription
             try:
                 tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                        ddmBackEnd))
                 tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                             'Production Output',asynchronous=True)
                 if tmpStat != True:
                     tmpLog.error("failed to make subscription")
                     isOK = False
                     break
             except:
                 # log and abort this task's reassignment; remaining tasks still processed
                 errtype,errvalue = sys.exc_info()[:2]
                 tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                 isOK = False
                 break
         # succeeded
         if isOK:
             # activate task: return it to 'ready' (or its previous status when one was saved)
             if taskSpec.oldStatus in ['assigning','exhausted',None]:
                 taskSpec.status = 'ready'
             else:
                 taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                               setOldModTime=True)
             tmpLog.debug('finished to reassign')
Exemplo n.º 32
0
 def doRefine(self,jediTaskID,taskParamMap):
     """Refine task parameters for production tasks.

     Optionally injects Event Service (ES) job parameters, runs the
     basic refinement, then applies production-specific tweaks:
     repeat/nosplit attributes for DBRelease datasets, consistency
     checks for child tasks, attempt numbers on output filename
     templates, storage-token destinations, throttling setup, dataset
     registration, and marking parent outputs transient for merge tasks.

     :param jediTaskID: task ID (not referenced here; kept for interface compatibility)
     :param taskParamMap: task parameter dictionary; modified in place
     :return: self.SC_SUCCEEDED on success; re-raises the original exception on failure
     """
     # make logger
     tmpLog = self.tmpLog
     tmpLog.debug('start taskType={0}'.format(self.taskSpec.taskType))
     try:
         # add ES parameters
         if 'addEsParams' in taskParamMap and taskParamMap['addEsParams'] == True:
             # common preExec snippet wiring the pilot's event-range channel into
             # AthenaMP; built once here instead of being duplicated inline below
             esPreExecBody  = 'import os;pilot_tmp=type(str(),(),{})();'
             esPreExecBody += 'pilot_tmp.__dict__.update(**os.environ);'
             esPreExecBody += 'from AthenaMP.AthenaMPFlags import jobproperties as jps;'
             esPreExecBody += 'jps.AthenaMPFlags.EventRangeChannel=pilot_tmp.PILOT_EVENTRANGECHANNEL'
             preInclude = False
             preExec = False
             for tmpItem in taskParamMap['jobParameters']:
                 if 'value' in tmpItem:
                     if 'preInclude' in tmpItem['value']:
                         tmpStr = '<PANDA_ES_ONLY>,AthenaMP/AthenaMP_EventService.py</PANDA_ES_ONLY>'
                         tmpItem['value'] = self.insertString('preInclude',tmpStr,tmpItem['value'])
                         preInclude = True
                     if 'preExec' in tmpItem['value']:
                         tmpStr = '<PANDA_ES_ONLY>;' + esPreExecBody + '</PANDA_ES_ONLY>'
                         tmpItem['value'] = self.insertString('preExec',tmpStr,tmpItem['value'])
                         preExec = True
             # add standalone parameters if the existing ones were not amended
             if not preInclude:
                 tmpStr = '<PANDA_ES_ONLY>preInclude="AthenaMP/AthenaMP_EventService.py"</PANDA_ES_ONLY>'
                 taskParamMap['jobParameters'].append({'type':'constant',
                                                       'value':tmpStr})
             if not preExec:
                 tmpStr = '<PANDA_ES_ONLY>preExec="' + esPreExecBody + '"</PANDA_ES_ONLY>'
                 taskParamMap['jobParameters'].append({'type':'constant',
                                                       'value':tmpStr})
         # basic refine
         self.doBasicRefine(taskParamMap)
         # set nosplit+repeat for DBR
         for datasetSpec in self.inSecDatasetSpecList:
             if DataServiceUtils.isDBR(datasetSpec.datasetName):
                 datasetSpec.attributes = 'repeat,nosplit'
         # enable consistency check on the master input when the task has a real parent
         if self.taskSpec.parent_tid not in [None,self.taskSpec.jediTaskID]:
             for datasetSpec in self.inMasterDatasetSpec:
                 if datasetSpec.isMaster() and datasetSpec.type == 'input':
                     datasetSpec.enableCheckConsistency()
         # append attempt number to output filename templates unless one is
         # already present or the file is a pre-merged (.panda.um) output
         for tmpKey,tmpOutTemplateMapList in self.outputTemplateMap.iteritems():
             for tmpOutTemplateMap in tmpOutTemplateMapList:
                 outFileTemplate = tmpOutTemplateMap['filenameTemplate']
                 if re.search(r'\.\d+$',outFileTemplate) is None and not outFileTemplate.endswith('.panda.um'):
                     tmpOutTemplateMap['filenameTemplate'] = outFileTemplate + '.1'
         # extract input datatype
         datasetTypeListIn = []
         for datasetSpec in self.inMasterDatasetSpec+self.inSecDatasetSpecList:
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             if datasetType not in ['',None]:
                 datasetTypeListIn.append(datasetType)
         # extract datatype and set destination if necessary
         datasetTypeList = []
         for datasetSpec in self.outDatasetSpecList:
             datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
             if datasetType not in ['',None]:
                 datasetTypeList.append(datasetType)
         # set numThrottled to use the task throttling mechanism
         if 'noThrottle' not in taskParamMap:
             self.taskSpec.numThrottled = 0
         # set to register datasets
         self.taskSpec.setToRegisterDatasets()
         # set transient to parent datasets for merge tasks, so intermediate
         # outputs already consumed by this task can be cleaned up
         if self.taskSpec.processingType in ['merge'] and self.taskSpec.parent_tid not in [None,self.taskSpec.jediTaskID]:
             # get parent
             tmpStat,parentTaskSpec = self.taskBufferIF.getTaskDatasetsWithID_JEDI(self.taskSpec.parent_tid,None,False)
             if tmpStat and parentTaskSpec != None:
                 # only datatypes that appear both as this task's input and output
                 metaData = {'transient':True}
                 for datasetSpec in parentTaskSpec.datasetSpecList:
                     if datasetSpec.type in ['log','output']:
                         datasetType = DataServiceUtils.getDatasetType(datasetSpec.datasetName)
                         if datasetType not in ['',None] and datasetType in datasetTypeList and datasetType in datasetTypeListIn:
                             tmpLog.info('set metadata={0} to parent jediTaskID={1}:datasetID={2}:Name={3}'.format(str(metaData),
                                                                                                                   self.taskSpec.parent_tid,
                                                                                                                   datasetSpec.datasetID,
                                                                                                                   datasetSpec.datasetName))
                             for metadataName,metadaValue in metaData.iteritems():
                                 self.ddmIF.getInterface(self.taskSpec.vo).setDatasetMetadata(datasetSpec.datasetName,
                                                                                              metadataName,metadaValue)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doBasicRefine failed with {0}:{1}'.format(errtype.__name__,errvalue))
         # bare raise preserves the original traceback; the old
         # "raise errtype,errvalue" form discarded it
         raise
     tmpLog.debug('done')
     return self.SC_SUCCEEDED