Example #1
 def doAction(self):
     # get logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start')
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #2
 def doAction(self):
     try:
         # get logger
         tmpLog = MsgWrapper(logger)
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority,timeoutVal in [(950,10),
                                        (900,30),
                                        ]:
             self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                       traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #3
 def doAction(self):
     try:
         # get logger
         origTmpLog = MsgWrapper(logger)
         origTmpLog.debug('start')
         # lock
         got_lock = self._get_lock()
         if not got_lock:
             origTmpLog.debug('locked by another process. Skipped')
             return self.SC_SUCCEEDED
         origTmpLog.debug('got lock')
         # undo preassigned tasks
         self.undo_preassign()
         # preassign tasks to sites
         ret_map = self.do_preassign()
         # unlock
         # self._release_lock()
         # origTmpLog.debug('released lock')
         # to-reassign map
         to_reassign_map = ret_map['to_reassign']
         if to_reassign_map:
             # wait some minutes so that preassigned tasks can be brokered, before reassigning jobs
             origTmpLog.debug('wait {0}s before reassigning jobs'.format(
                 reassign_jobs_wait_time))
             time.sleep(reassign_jobs_wait_time)
             # reassign jobs of preassigned tasks
             self.reassign_jobs(to_reassign_map)
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         err_str = traceback.format_exc()
         origTmpLog.error('failed with {0} {1} ; {2}'.format(
             errtype, errvalue, err_str))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #4
 def doActionForReassgin(self,gTmpLog):
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to get reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # update cloudtasks
         tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
         if tmpStat != 'SUCCEEDED':
             tmpLog.error('failed to update CloudTasks')
             continue
         # get datasets
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
         if tmpStat != True:
             tmpLog.error('failed to get datasets')
             continue
         # check cloud
         if not siteMapper.checkCloud(taskSpec.cloud):
             tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
             continue
         # get T1
         t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             # get location
             location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
             # set origin metadata
             tmpLog.debug('setting metadata origin={0}'.format(location))
             tmpStat = ddmIF.setDatasetMetadata(datasetSpec.datasetName,'origin',location)
             if tmpStat != True:
                 tmpLog.error("failed to set origin")
                 isOK = False
                 break
             # make subscription
             tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                    ddmBackEnd))
             tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                         activity='Production',ignoreUnknown=True,
                                                         backEnd=ddmBackEnd)
             if tmpStat != True:
                 tmpLog.error("failed to make subscription")
                 isOK = False
                 break
         # succeeded
         if isOK:    
             # activate task
             taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
             tmpLog.debug('finished to reassign')
Example #5
 def doAction(self):
     try:
         # get logger
         tmpLog = MsgWrapper(logger)
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority, timeoutVal in [
             (950, 10),
             (900, 30),
         ]:
             self.doActionForHighPrioPending(tmpLog, minPriority,
                                             timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(
             errtype.__name__, errvalue, traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #7
 def getDatasetMetaData(self,datasetName):
     # make logger
     methodName = 'getDatasetMetaData'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     try:
         # get DQ2 API
         dq2=DQ2()
         # get file list
         tmpRet = dq2.getMetaDataAttribute(datasetName,dq2.listMetaDataAttributes())
         # change dataset state to string
         if tmpRet['state'] in [DatasetState.CLOSED,DatasetState.FROZEN]:
             tmpRet['state'] = 'closed'
         elif tmpRet['state'] == DatasetState.OPEN:
             tmpRet['state'] = 'open'
         else:
             tmpRet['state'] = 'unknown'                
         tmpLog.debug(str(tmpRet))    
         return self.SC_SUCCEEDED,tmpRet
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         errCode = self.checkError(errtype)
         return errCode,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
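The method above follows the (status, payload) return convention used throughout these examples. A minimal caller sketch under that convention; the ddmIF variable and the dataset name are hypothetical:

    # hypothetical caller; 'ddmIF' and the dataset name are illustrative only
    tmpStat, tmpOut = ddmIF.getDatasetMetaData('mc16_13TeV.example.dataset')
    if tmpStat == ddmIF.SC_SUCCEEDED:
        # on success tmpOut is the metadata dict, with 'state' normalized to 'open'/'closed'/'unknown'
        print(tmpOut['state'])
    else:
        # on failure tmpOut is an error string prefixed with the class and method name
        print('error: {0}'.format(tmpOut))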
Example #8
 def doAction(self):
     try:
         # get logger
         tmpLog = MsgWrapper(logger)
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority,timeoutVal in [(950,10),
                                        (900,30),
                                        ]:
             self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
         # action to set scout job data w/o scouts
         self.doActionToSetScoutJobData(tmpLog)
         # action to throttle jobs in paused tasks
         self.doActionToThrottleJobInPausedTasks(tmpLog)
         # action for jumbo
         jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
         jumbo.run()
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                       traceback.format_exc()))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #9
 def doCheck(self,taskSpecList):
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doCheck')
     # return for failure
     retFatal    = self.SC_FATAL,{}
     retTmpError = self.SC_FAILED,{}
     # get list of jediTaskIDs
     taskIdList = []
     taskSpecMap = {}
     for taskSpec in taskSpecList:
         taskIdList.append(taskSpec.jediTaskID)
         taskSpecMap[taskSpec.jediTaskID] = taskSpec
     # check with panda
     tmpLog.debug('check with panda')
     tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error('failed to see clouds')
         return retTmpError
     # make return map
     retMap = {}
     for tmpTaskID,tmpCoreName in cloudsInPanda.iteritems():
         tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
         if not tmpCoreName in ['NULL','',None]:
             taskSpec = taskSpecMap[tmpTaskID]
             if taskSpec.useWorldCloud():
                 # get destinations for WORLD cloud
                 ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                 # get site
                 siteSpec = self.siteMapper.getSite(tmpCoreName)
                 # get nucleus
                 nucleus = siteSpec.pandasite
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                 # get destinations
                 retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                 for datasetSpec in tmpDatasetSpecs:
                     # skip distributed datasets
                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                         continue
                     # get token
                     token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                     # use default endpoint
                     if token == None:
                         token = siteSpec.ddm
                     # add original token
                     if not datasetSpec.storageToken in ['',None]:
                         token += '/{0}'.format(datasetSpec.storageToken)
                     retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                           'token':'dst:{0}'.format(token),
                                                           'destination':tmpCoreName})
             else:
                 retMap[tmpTaskID] = tmpCoreName
     tmpLog.debug('ret {0}'.format(str(retMap)))
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,retMap
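For WORLD-cloud tasks the loop above fills a per-task dict with the nucleus and a list of dataset destinations, while other tasks map straight to the core name. A sketch of the resulting retMap shape; all IDs, endpoints, and site names below are made up:

    # illustrative shape only; every value below is hypothetical
    retMap = {
        12345: {                               # task using the WORLD cloud
            'nucleus': 'SITE_A',
            'datasets': [{'datasetID': 1001,
                          'token': 'dst:SITE_A_DATADISK/ATLASDATADISK',
                          'destination': 'SITE_A'}],
        },
        12346: 'SITE_B',                       # non-WORLD task -> core name only
    }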
Example #11
 def doCheck(self, taskSpecList):
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug("start doCheck")
     # return for failure
     retFatal = self.SC_FATAL, {}
     retTmpError = self.SC_FAILED, {}
     # get list of jediTaskIDs
     taskIdList = []
     taskSpecMap = {}
     for taskSpec in taskSpecList:
         taskIdList.append(taskSpec.jediTaskID)
         taskSpecMap[taskSpec.jediTaskID] = taskSpec
     # check with panda
     tmpLog.debug("check with panda")
     tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
     if tmpPandaStatus != 0:
         tmpLog.error("failed to see clouds")
         return retTmpError
     # make return map
     retMap = {}
     for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
         tmpLog.debug("jediTaskID={0} -> {1}".format(tmpTaskID, tmpCoreName))
         if not tmpCoreName in ["NULL", "", None]:
             taskSpec = taskSpecMap[tmpTaskID]
             if taskSpec.useWorldCloud():
                 # get destinations for WORLD cloud
                 ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                 # get site
                 siteSpec = self.siteMapper.getSite(tmpCoreName)
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     tmpTaskID, ["output", "log"]
                 )
                 # get destinations
                 retMap[tmpTaskID] = []
                 for datasetSpec in tmpDatasetSpecs:
                     token = ddmIF.convertTokenToEndpoint(siteSpec.ddm, datasetSpec.storageToken)
                     # use default endpoint
                     if token == None:
                         token = siteSpec.ddm
                     retMap[tmpTaskID].append(
                         {
                             "datasetID": datasetSpec.datasetID,
                             "token": "dst:{0}".format(token),
                             "destination": tmpCoreName,
                         }
                     )
             else:
                 retMap[tmpTaskID] = tmpCoreName
     tmpLog.debug("ret {0}".format(str(retMap)))
     # return
     tmpLog.debug("done")
     return self.SC_SUCCEEDED, retMap
Example #12
 def start(self):
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to refine
                     tmpList = self.taskBufferIF.getTasksToRefine_JEDI(
                         vo, prodSourceLabel)
                     if tmpList is None:
                         # failed
                         tmpLog.error(
                             'failed to get the list of tasks to refine')
                     else:
                         tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # get work queue mapper
                         workQueueMapper = self.taskBufferIF.getWorkQueueMap(
                         )
                         # make workers
                         nWorker = jedi_config.taskrefine.nWorkers
                         for iWorker in range(nWorker):
                             thr = TaskRefinerThread(
                                 taskList, threadPool, self.taskBufferIF,
                                 self.ddmIF, self, workQueueMapper)
                             thr.start()
                         # join
                         threadPool.join()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
             tmpLog.error('Traceback: {0}'.format(traceback.format_exc()))
         # sleep if needed
         loopCycle = jedi_config.taskrefine.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep(max_val=loopCycle)
Example #13
 def start(self):
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to refine
                     tmpList = self.taskBufferIF.getTasksToRefine_JEDI(vo, prodSourceLabel)
                     if tmpList == None:
                         # failed
                         tmpLog.error('failed to get the list of tasks to refine')
                     else:
                         tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # get work queue mapper
                         workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                         # make workers
                         nWorker = jedi_config.taskrefine.nWorkers
                         for iWorker in range(nWorker):
                             thr = TaskRefinerThread(taskList, threadPool,
                                                     self.taskBufferIF,
                                                     self.ddmIF,
                                                     self, workQueueMapper)
                             thr.start()
                         # join
                         threadPool.join()
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,
                                                                      errtype.__name__, errvalue))
             tmpLog.error('Traceback: {0}'.format(traceback.format_exc()))
         # sleep if needed
         loopCycle = jedi_config.taskrefine.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep()
Example #14
 def doCleanDataLocality(self):
     tmpLog = MsgWrapper(logger, ' #ATM #KV doCleanDataLocality')
     tmpLog.debug('start')
     try:
         # lock
         got_lock = self.taskBufferIF.lockProcess_JEDI(  vo=self.vo, prodSourceLabel='default',
                                                         cloud=None, workqueue_id=None, resource_name=None,
                                                         component='AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality',
                                                         pid=self.pid, timeLimit=1440)
         if not got_lock:
             tmpLog.debug('locked by another process. Skipped')
             return
         tmpLog.debug('got lock')
         # lifetime of records
         record_lifetime_hours = 24
         # run
         now_timestamp = datetime.datetime.utcnow()
         before_timestamp = now_timestamp - datetime.timedelta(hours=record_lifetime_hours)
         n_rows = self.taskBufferIF.deleteOutdatedDatasetLocality_JEDI(before_timestamp)
         tmpLog.info('cleaned up {0} records'.format(n_rows))
         # done
         tmpLog.debug('done')
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))
Example #15
 def doForWaitingJobs(self):
     tmpLog = MsgWrapper(logger, 'doForWaitingJobs label=user')
     # check every 60 min
     checkInterval = 60
     # get lib.tgz for waiting jobs
     libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo, self.prodSourceLabel, checkInterval)
     tmpLog.debug('got {0} lib.tgz files'.format(len(libList)))
     # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced
     for prodUserName,datasetName,tmpFileSpec in libList:
         tmpLog = MsgWrapper(logger,'< #ATM #KV doForWaitingJobs jediTaskID={0} label=user >'.format(tmpFileSpec.jediTaskID))
         tmpLog.debug('start')
         # check status of lib.tgz
         if tmpFileSpec.status == 'failed':
             # get buildJob
             pandaJobSpecs = self.taskBufferIF.peekJobs([tmpFileSpec.PandaID],
                                                        fromDefined=False,
                                                        fromActive=False,
                                                        fromWaiting=False)
             pandaJobSpec = pandaJobSpecs[0]
             if pandaJobSpec is not None:
                 # kill
                 self.taskBufferIF.updateJobs([pandaJobSpec],False)
                 tmpLog.debug('  action=killed_downstream_jobs for user="{0}" with libDS={1}'.format(prodUserName,datasetName))
             else:
                 # PandaJobSpec not found
                 tmpLog.error('  cannot find PandaJobSpec for user="{0}" with PandaID={1}'.format(
                     prodUserName, tmpFileSpec.PandaID))
         elif tmpFileSpec.status == 'finished':
             # set metadata
             self.taskBufferIF.setGUIDs([{'guid':tmpFileSpec.GUID,
                                          'lfn':tmpFileSpec.lfn,
                                          'checksum':tmpFileSpec.checksum,
                                          'fsize':tmpFileSpec.fsize,
                                          'scope':tmpFileSpec.scope,
                                          }])
             # get lib dataset
             dataset = self.taskBufferIF.queryDatasetWithMap({'name':datasetName})
             if dataset is not None:
                 # activate jobs
                 aThr = Activator(self.taskBufferIF,dataset)
                 aThr.start()
                 aThr.join()
                 tmpLog.debug('  action=activated_downstream_jobs for user="{0}" with libDS={1}'.format(prodUserName,datasetName))
             else:
                 # datasetSpec not found
                 tmpLog.error('  cannot find datasetSpec for user="{0}" with libDS={1}'.format(prodUserName,datasetName))
         else:
             # lib.tgz is not ready
             tmpLog.debug('  keep waiting for user="{0}" libDS={1}'.format(prodUserName,datasetName))
Example #16
 def doAction(self):
     try:
         # get logger
         tmpLog = MsgWrapper(logger)
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #17
 def doAction(self):
     try:
         # get logger
         origTmpLog = MsgWrapper(logger)
         origTmpLog.debug('start')
         # clean up data locality
         self.doCleanDataLocality()
         # update data locality
         self.doUpdateDataLocality()
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         origTmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #18
 def doAction(self):
     try:
         # get logger
         origTmpLog = MsgWrapper(logger)
         origTmpLog.debug('start')
         # make tasks pending under certain conditions
         self.do_for_data_locality()
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         err_str = traceback.format_exc()
         origTmpLog.error('failed with {0} {1} ; {2}'.format(
             errtype, errvalue, err_str))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #19
 def checkDatasetConsistency(self,location,datasetName):
     # make logger
     methodName = 'checkDatasetConsistency'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     try:
         # get DQ2 API
         dq2=DQ2()
         # check
         tmpRet = dq2.checkDatasetConsistency(location,datasetName)
         tmpLog.debug(str(tmpRet))
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         errCode = self.checkError(errtype)
         return errCode,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
Example #20
 def reassign_jobs(self, to_reassign_map):
     tmp_log = MsgWrapper(logger, 'reassign_jobs')
     for jedi_taskid, value_map in to_reassign_map.items():
         site = value_map['site']
         n_jobs_to_fill = value_map['n_jobs_to_fill']
         # compute n_jobs_to_close from n_jobs_to_fill
         n_jobs_to_close = int(n_jobs_to_fill / 3)
         # reassign
         n_jobs_closed = self.taskBufferIF.reassignJobsInPreassignedTask_JEDI(
             jedi_taskid, site, n_jobs_to_close)
         if n_jobs_closed is None:
             tmp_log.debug(
                 'jediTaskID={0} no longer ready/running or not assigned to {1} , skipped'
                 .format(jedi_taskid, site))
         else:
             tmp_log.debug('jediTaskID={0} to {1} , closed {2} jobs'.format(
                 jedi_taskid, site, n_jobs_closed))
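The shape reassign_jobs expects for to_reassign_map can be read off the keys accessed in the loop above; a hypothetical input, with made-up task IDs and site names:

    # only 'site' and 'n_jobs_to_fill' are read above; n_jobs_to_close = int(n_jobs_to_fill / 3)
    to_reassign_map = {
        12345: {'site': 'SITE_A', 'n_jobs_to_fill': 30},   # closes int(30 / 3) = 10 jobs
        12346: {'site': 'SITE_B', 'n_jobs_to_fill': 8},    # closes int(8 / 3) = 2 jobs
    }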
Example #21
 def doAction(self):
     try:
         # get logger
         origTmpLog = MsgWrapper(logger)
         origTmpLog.debug('start')
         # handle waiting jobs
         self.doForWaitingJobs()
         # throttle tasks if so many prestaging requests
         self.doForPreStaging()
         # priority massage
         self.doForPriorityMassage()
         # redo stalled analysis jobs
         self.doForRedoStalledJobs()
         # throttle WAN data access
         #self.doForThrottleWAN()
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         origTmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     origTmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #22
 def doAction(self):
     try:
         # get logger
         tmpLog = MsgWrapper(logger)
         tmpLog.debug('start')
         # action for priority boost
         self.doActionForPriorityBoost(tmpLog)
         # action for reassign
         self.doActionForReassgin(tmpLog)
         # action for throttled
         self.doActionForThrottled(tmpLog)
         # action for high prio pending
         for minPriority,timeoutVal in [(950,10),
                                        (900,30),
                                        ]:
             self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #23
    def doAction(self):
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')

            # action for priority boost
            self.doActionForPriorityBoost(tmpLog)

            # action for reassign
            self.doActionForReassign(tmpLog)

            # action for throttled
            self.doActionForThrottled(tmpLog)

            # action for high prio pending
            for minPriority, timeoutVal in [(950, 10),
                                            (900, 30),
                                            ]:
                self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)

            # action to set scout job data w/o scouts
            self.doActionToSetScoutJobData(tmpLog)

            # action to throttle jobs in paused tasks
            self.doActionToThrottleJobInPausedTasks(tmpLog)

            # action for jumbo
            jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
            jumbo.run()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__, errvalue, traceback.format_exc()))
        # return
        tmpLog.debug('done')
        return self.SC_SUCCEEDED
Example #24
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resourceType):
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(vo,prodSourceLabel,cloudName,
                                                                          workQueue.queue_name))
     # check if unthrottled
     if workQueue.queue_share == None:
         tmpLog.debug("  done : unthrottled since share=None")
         return self.retUnThrottled
     tmpLog.debug("  done : SKIP")
     return self.retThrottled
Example #25
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                   resourceType):
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(
         vo, prodSourceLabel, cloudName, workQueue.queue_name))
     # check if unthrottled
     if not workQueue.throttled:
         tmpLog.debug("  done : unthrottled since throttled is False")
         return self.retUnThrottled
     tmpLog.debug("  done : SKIP")
     return self.retThrottled
Example #26
 def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                   jobStat):
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(
         vo, prodSourceLabel, cloudName, workQueue.queue_name))
     # check if unthrottled
     if workQueue.queue_share == None:
         tmpLog.debug("  done : unthrottled since share=None")
         return self.retUnThrottled
     tmpLog.debug("  done : SKIP")
     return self.retThrottled
Example #27
 def start(self):
     # start base classes
     JediKnight.start(self)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.debug('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # get the list of tasks to exec command
                     tmpList = self.taskBufferIF.getTasksToExecCommand_JEDI(
                         vo, prodSourceLabel)
                     if tmpList == None:
                         # failed
                         tmpLog.error(
                             'failed to get the task list for vo={0} label={1}'
                             .format(vo, prodSourceLabel))
                     else:
                         tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # make workers
                         nWorker = jedi_config.taskrefine.nWorkers
                         for iWorker in range(nWorker):
                             thr = TaskCommandoThread(
                                 taskList, threadPool, self.taskBufferIF,
                                 self.ddmIF, self.pid)
                             thr.start()
                         # join
                         threadPool.join()
             tmpLog.debug('done')
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep if needed
         loopCycle = jedi_config.tcommando.loopCycle
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep()
Example #28
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     sitePreAssigned = False
     siteListPreAssigned = False
     if not taskSpec.site in ['',None]:
         if ',' in taskSpec.site:
             # site list
             siteListPreAssigned = True
             scanSiteList = taskSpec.site.split(',')
         else:
             # site
             sitePreAssigned = True
             scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned criteria=+preassign'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         siteListPreAssigned = True
         scanSiteList = DataServiceUtils.getSitesShareDDM(self.siteMapper,inputChunk.getPreassignedSite())
         scanSiteList.append(inputChunk.getPreassignedSite())
         tmpMsg = 'use site={0} since they share DDM endpoints with original_site={1} which is pre-assigned in masterDS '.format(str(scanSiteList),
                                                                                                                               inputChunk.getPreassignedSite())
         tmpMsg += 'criteria=+premerge'
         tmpLog.debug(tmpMsg)
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # T1 
     if not taskSpec.useWorldCloud():
         t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
         # hospital sites
         if self.hospitalQueueMap.has_key(cloudName):
             t1Sites += self.hospitalQueueMap[cloudName]
     else:
         # get destination for WORLD cloud
         t1Sites = []
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,datasetTypes=['log'])
         for datasetSpec in datasetSpecList:
             if not datasetSpec.destination in t1Sites:
                 t1Sites.append(datasetSpec.destination)
     # sites sharing SE with T1
     sitesShareSeT1 = DataServiceUtils.getSitesShareDDM(self.siteMapper,t1Sites[0])
     # all T1
     allT1Sites = self.getAllT1Sites()
     # core count
     if inputChunk.isMerging and taskSpec.mergeCoreCount != None:
         taskCoreCount = taskSpec.mergeCoreCount
     else:
         taskCoreCount = taskSpec.coreCount
     # MP
     if taskCoreCount != None and taskCoreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskCoreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     # get workQueue
     workQueue = self.taskBufferIF.getWorkQueueMap().getQueueWithID(taskSpec.workQueue_ID)
     ######################################
     # selection for status
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check site status
             skipFlag = False
             if tmpSiteSpec.status != 'online':
                 skipFlag = True
             if not skipFlag:    
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to validatedreleases <> True criteria=-validated' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for high priorities
     t1WeightForHighPrio = 1
     if (taskSpec.currentPriority >= 900 or inputChunk.useScout()) \
             and not sitePreAssigned and not siteListPreAssigned:
         t1WeightForHighPrio = 100
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites+sitesShareSeT1+allT1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpMsg = '  skip site={0} due to highPrio/scouts which needs to run at T1 or sites associated with {1} T1 SE '.format(tmpSiteName,
                                                                                                                                       cloudName)
                 tmpMsg += 'criteria=-scoutprio'
                 tmpLog.debug(tmpMsg)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for highPrio/scouts'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection to avoid slow or inactive sites
     if (taskSpec.currentPriority >= 800 or inputChunk.useScout() or \
             inputChunk.isMerging or taskSpec.mergeOutput()) \
             and not sitePreAssigned:
         # get inactive sites
         inactiveTimeLimit = 2
         inactiveSites = self.taskBufferIF.getInactiveSites_JEDI('production',inactiveTimeLimit)
         newScanSiteList = []
         tmpMsgList = []
         for tmpSiteName in scanSiteList:
             nToGetAll = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                 AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'starting')
             if tmpSiteName in ['BNL_CLOUD','BNL_CLOUD_MCORE','ATLAS_OPP_OSG']:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid slow sites '.format(tmpSiteName)
                 tmpMsg += 'criteria=-slow'
                 tmpMsgList.append(tmpMsg)
             elif tmpSiteName in inactiveSites and nToGetAll > 0:
                 tmpMsg = '  skip site={0} since high prio/scouts/merge needs to avoid inactive sites (laststart is older than {1}h) '.format(tmpSiteName,
                                                                                                                                              inactiveTimeLimit)
                 tmpMsg += 'criteria=-inactive'
                 tmpMsgList.append(tmpMsg)
             else:
                 newScanSiteList.append(tmpSiteName)
         if newScanSiteList != []:
             scanSiteList = newScanSiteList
             for tmpMsg in tmpMsgList:
                 tmpLog.debug(tmpMsg)
         tmpLog.debug('{0} candidates passed for slowness/inactive check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for data availability
     if not sitePreAssigned and not siteListPreAssigned:
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             # ignore DBR
             if DataServiceUtils.isDBR(datasetName):
                 continue
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                                  self.ddmIF,datasetName,
                                                                  datasetSpec.storageToken)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
             """
             # check if T1 has the data
             if self.dataSiteMap[datasetName].has_key(cloudName):
                 cloudHasData = True
             else:
                 cloudHasData = False
             t1hasData = False
             if cloudHasData:
                 for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                     if tmpSeVal['state'] == 'complete':
                         t1hasData = True
                         break
                 # T1 has incomplete data while no data at T2
                 if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                     # use incomplete data at T1 anyway
                     t1hasData = True
             # data is missing at T1         
             if not t1hasData:
                 tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
                 # make subscription to T1
                 # FIXME
                 pass
                 # use T2 until data is complete at T1
                 newScanSiteList = []
                 for tmpSiteName in scanSiteList:                    
                     if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                         newScanSiteList.append(tmpSiteName)
                     else:
                         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                         if tmpSiteSpec.cloud != cloudName:
                             tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                         else:
                             tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
                 scanSiteList = newScanSiteList
                 tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
                 if scanSiteList == []:
                     tmpLog.error('no candidates')
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
             """        
     ######################################
     # selection for fairshare
     if not (workQueue.queue_type in ['managed'] and workQueue.queue_name in ['test','validation']):
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,inputChunk.isMerging,tmpLog):
                 tmpLog.debug('  skip site={0} due to zero share criteria=-zeroshare'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)                
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP == 'unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch site:%s <> task:%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskCoreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='CVMFS')
             #                                                         releases='nightlies',
             #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing cache=%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.maxmemory,
                                                                                                                 minRamCount)
                 tmpMsg += 'criteria=-lowmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                   tmpSiteSpec.minmemory,
                                                                                                                   minRamCount)
                 tmpMsg += 'criteria=-highmemory'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check {1}({2})'.format(len(scanSiteList),
                                                                           minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     if taskSpec.outputScaleWithEvents():
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(getNumEvents=True)
     else:
         minDiskCount = taskSpec.getOutDiskSize()*inputChunk.getMaxAtomSize(effectiveSize=True)
     minDiskCount = minDiskCount + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpMsg = '  skip site={0} due to small scratch disk {1} less than {2} '.format(tmpSiteName,
                                                                                            tmpSiteSpec.maxwdir,
                                                                                            minDiskCount)
             tmpMsg += 'criteria=-disk'
             tmpLog.debug(tmpMsg)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check minDiskCount>{1}MB'.format(len(scanSiteList),
                                                                                       minDiskCount))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'throttled') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSiteSpaceMap = self.ddmIF.getRseUsage(tmpSiteSpec.ddm)
             if tmpSiteSpaceMap != {}:
                 tmpSiteFreeSpace = tmpSiteSpaceMap['free']
                 tmpSpaceSize = tmpSiteFreeSpace - movingInputSize - nRemJobs * outSizePerJob
                 if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                     tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteFreeSpace,
                                                                                                         movingInputSize,outSizePerJob,
                                                                                                         nRemJobs,diskThreshold))
                     continue
             # check if blacklisted
             if self.ddmIF.isBlackListedEP(tmpSiteSpec.ddm):
                 tmpLog.debug('  skip site={0} since endpoint={1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     if not taskSpec.useHS06():
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
         minWalltime = taskSpec.walltime * tmpMaxAtomSize
         strMinWalltime = 'walltime*inputSize={0}*{1}'.format(taskSpec.walltime,tmpMaxAtomSize)
     else:
         tmpMaxAtomSize = inputChunk.getMaxAtomSize(getNumEvents=True)
         minWalltime = taskSpec.cpuTime * tmpMaxAtomSize
         strMinWalltime = 'cpuTime*nEventsPerJob={0}*{1}'.format(taskSpec.cpuTime,tmpMaxAtomSize)
     if minWalltime != None or inputChunk.useScout():
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             siteMaxTime = tmpSiteSpec.maxtime
             origSiteMaxTime = siteMaxTime
              # send scout, merge, or walltime-undefined jobs only to sites whose walltime limit is more than 1 day
             if inputChunk.useScout() or inputChunk.isMerging or \
                     (taskSpec.walltime in [0,None] and taskSpec.walltimeUnit in ['',None] and taskSpec.cpuTimeUnit in ['',None]):
                 minTimeForZeroWalltime = 24*60*60
                 if siteMaxTime != 0 and siteMaxTime < minTimeForZeroWalltime:
                     tmpMsg = '  skip site={0} due to site walltime {1} (site upper limit) insufficient '.format(tmpSiteName,
                                                                                                                 siteMaxTime)
                     if inputChunk.useScout():
                         tmpMsg += 'for scouts ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-scoutwalltime'
                     else:
                         tmpMsg += 'for zero walltime ({0} at least) '.format(minTimeForZeroWalltime)
                         tmpMsg += 'criteria=-zerowalltime'
                     tmpLog.debug(tmpMsg)
                     continue
             # check max walltime at the site
             tmpSiteStr = '{0}'.format(siteMaxTime)
             if taskSpec.useHS06():
                 oldSiteMaxTime = siteMaxTime
                 siteMaxTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMaxTime,taskSpec.baseWalltime)
             if not siteMaxTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMaxTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMaxTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMaxTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMaxTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMaxTime = long(siteMaxTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMaxTime != 0 and minWalltime > siteMaxTime:
                 tmpMsg = '  skip site={0} due to short site walltime {1} (site upper limit) less than {2} '.format(tmpSiteName,
                                                                                                                    tmpSiteStr,
                                                                                                                    strMinWalltime)
                 tmpMsg += 'criteria=-shortwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             # check min walltime at the site
             siteMinTime = tmpSiteSpec.mintime
             origSiteMinTime = siteMinTime
             tmpSiteStr = '{0}'.format(siteMinTime)
             if taskSpec.useHS06():
                 oldSiteMinTime = siteMinTime
                 siteMinTime -= taskSpec.baseWalltime
                 tmpSiteStr = '({0}-{1})'.format(oldSiteMinTime,taskSpec.baseWalltime)
             if not siteMinTime in [None,0] and not tmpSiteSpec.coreCount in [None,0]:
                 siteMinTime *= tmpSiteSpec.coreCount
                 tmpSiteStr += '*{0}'.format(tmpSiteSpec.coreCount)
             if taskSpec.useHS06():
                 if not siteMinTime in [None,0] and not tmpSiteSpec.corepower in [None,0]:
                     siteMinTime *= tmpSiteSpec.corepower
                     tmpSiteStr += '*{0}'.format(tmpSiteSpec.corepower)
                 siteMinTime *= float(taskSpec.cpuEfficiency) / 100.0
                 siteMinTime = long(siteMinTime)
                 tmpSiteStr += '*{0}%'.format(taskSpec.cpuEfficiency)
             if origSiteMinTime != 0 and minWalltime < siteMinTime:
                 tmpMsg = '  skip site {0} due to short job walltime {1} (site lower limit) greater than {2} '.format(tmpSiteName,
                                                                                                                      tmpSiteStr,
                                                                                                                      strMinWalltime)
                 tmpMsg += 'criteria=-longwalltime'
                 tmpLog.debug(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         if not taskSpec.useHS06():
             tmpLog.debug('{0} candidates passed walltime check {1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         else:
             tmpLog.debug('{0} candidates passed walltime check {1}({2}*nEventsPerJob)'.format(len(scanSiteList),strMinWalltime,taskSpec.cpuTimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for network connectivity
     if not sitePreAssigned:
         ipConnectivity = taskSpec.getIpConnectivity()
         if ipConnectivity != None:
             newScanSiteList = []
             for tmpSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # check at the site
                 if tmpSiteSpec.wnconnectivity == 'full':
                     pass
                 elif tmpSiteSpec.wnconnectivity == 'http' and ipConnectivity == 'http':
                     pass
                 else:
                     tmpMsg = '  skip site={0} due to insufficient connectivity (site={1}) for task={2} '.format(tmpSiteName,
                                                                                                                 tmpSiteSpec.wnconnectivity,
                                                                                                                 ipConnectivity)
                     tmpMsg += 'criteria=-network'
                     tmpLog.debug(tmpMsg)
                     continue
                 newScanSiteList.append(tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed network check ({1})'.format(len(scanSiteList),
                                                                             ipConnectivity))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 self.sendLogMessage(tmpLog)
                 return retTmpError
     ######################################
     # selection for event service
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # event service
             if taskSpec.useEventService():
                 if tmpSiteSpec.getJobSeed() == 'std':
                     tmpMsg = '  skip site={0} since EventService is not allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-es'
                     tmpLog.debug(tmpMsg)
                     continue
             else:
                 if tmpSiteSpec.getJobSeed() == 'es':
                     tmpMsg = '  skip site={0} since only EventService is allowed '.format(tmpSiteName)
                     tmpMsg += 'criteria=-nones'
                     tmpLog.debug(tmpMsg)
                     continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed EventService check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip site=%s due to too many transferring=%s greater than max(%s,2x%s) criteria=-transferring' % \
                              (tmpSiteName,nTraJobs,def_maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for T1 weight
     t1Weight = taskSpec.getT1Weight()
     if t1Weight == 0:
         # use T1 weight in cloudconfig
         t1Weight = self.siteMapper.getCloud(cloudName)['weight']
     if t1Weight < 0:
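          # a negative T1 weight restricts the candidates to T1 sites only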
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             if not tmpSiteName in t1Sites:
                 tmpLog.debug('  skip site={0} due to negative T1 weight criteria=-t1weight'.format(tmpSiteName))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         t1Weight = 1
     t1Weight = max(t1Weight,t1WeightForHighPrio)
     tmpLog.debug('T1 weight {0}'.format(t1Weight))
     tmpLog.debug('{0} candidates passed T1 weight check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for nPilot
     nPilotMap = {}
     if not sitePreAssigned:
         nWNmap = self.taskBufferIF.getCurrentSiteData()
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             # check at the site
             nPilot = 0
             if nWNmap.has_key(tmpSiteName):
                 nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
             if nPilot == 0 and not 'test' in taskSpec.prodSourceLabel:
                 tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
                 continue
             newScanSiteList.append(tmpSiteName)
             nPilotMap[tmpSiteName] = nPilot
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # get available files
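      # normalizeFactors accumulates, per site, the total size of locally available input files;
      # it is used further below to boost the weight of sites that already host the data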
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper,
                                                                        ignoreCC=True)
             # disable file lookup for merge jobs or secondary datasets
             checkCompleteness = True
             useCompleteOnly = False
             if inputChunk.isMerging:
                 checkCompleteness = False
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1],
                                                         checkCompleteness=checkCompleteness,
                                                         storageToken=datasetSpec.storageToken,
                                                         useCompleteOnly=useCompleteOnly)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             self.sendLogMessage(tmpLog)
             return retTmpError
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     # get max total size
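      # (i.e. the largest per-site available input size, used below to normalize the data-locality boost)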
     tmpTotalSizes = normalizeFactors.values()
     tmpTotalSizes.sort()
     if tmpTotalSizes != []:
         totalSize = tmpTotalSizes.pop()
     else:
         totalSize = 0
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     tmpLog.debug('calculate weight and check cap for {0} candidates'.format(len(scanSiteList)))
     weightMapPrimary = {}
     weightMapSecondary = {}
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',None,taskSpec.workQueue_ID)
          nDefined   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',None,taskSpec.workQueue_ID) + self.getLiveCount(tmpSiteName)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',None,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,taskSpec.workQueue_ID) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,taskSpec.workQueue_ID)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting',None,taskSpec.workQueue_ID)
         if tmpSiteName in nPilotMap:
             nPilot = nPilotMap[tmpSiteName]
         else:
             nPilot = 0
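          # manyAssigned: ratio of assigned to activated jobs, clipped to [1.0, 2.0],
          # penalizing sites where assigned jobs pile up without getting activated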
         manyAssigned = float(nAssigned + 1) / float(nActivated + 1)
         manyAssigned = min(2.0,manyAssigned)
         manyAssigned = max(1.0,manyAssigned)
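          # base weight: more running jobs relative to queued ones (activated+assigned+starting+defined) is better,
          # damped by the manyAssigned penalty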
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + nDefined + 1) / manyAssigned
         weightStr = 'nRun={0} nAct={1} nAss={2} nStart={3} nDef={4} totalSize={5} manyAss={6} nPilot={7} '.format(nRunning,nActivated,nAssigned,
                                                                                                                   nStarting,nDefined,
                                                                                                                   totalSize,manyAssigned,
                                                                                                                   nPilot)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
             weightStr += 'availableSize={0} '.format(normalizeFactors[tmpSiteName])
         # T1 weight
         if tmpSiteName in t1Sites+sitesShareSeT1:
             weight *= t1Weight
             weightStr += 't1W={0} '.format(t1Weight)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight and params
         siteCandidateSpec.weight = weight
         siteCandidateSpec.nRunningJobs = nRunning
         siteCandidateSpec.nQueuedJobs = nActivated + nAssigned + nStarting
         siteCandidateSpec.nAssignedJobs = nAssigned
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # check if site is locked for WORLD
         lockedByBrokerage = False
         if taskSpec.useWorldCloud():
             lockedByBrokerage = self.checkSiteLock(taskSpec.vo,taskSpec.prodSourceLabel,
                                                    tmpSiteName,taskSpec.workQueue_ID)
         # check cap with nRunning
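          # queued jobs are capped at max(cutOffValue, cutOffFactor*nRunning, nPilot)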
         cutOffValue = 20
         cutOffFactor = 2 
         nRunningCap = max(cutOffValue,cutOffFactor*nRunning)
         nRunningCap = max(nRunningCap,nPilot)
         okMsg = '  use site={0} with weight={1} {2} criteria=+use'.format(tmpSiteName,weight,weightStr)
          okAsPrimary = False
         if lockedByBrokerage:
             ngMsg = '  skip site={0} due to locked by another brokerage '.format(tmpSiteName)
             ngMsg += 'criteria=-lock'
         elif (nDefined+nActivated+nAssigned+nStarting) > nRunningCap:
             ngMsg = '  skip site={0} due to nDefined+nActivated+nAssigned+nStarting={1} '.format(tmpSiteName,
                                                                                                  nDefined+nActivated+nAssigned+nStarting)
             ngMsg += 'greater than max({0},{1}*nRunning={1}*{2},nPilot={3}) '.format(cutOffValue,
                                                                                      cutOffFactor,                                  
                                                                                      nRunning,                                      
                                                                                      nPilot)
             ngMsg += 'criteria=-cap'
         else:
             ngMsg = '  skip site={0} due to low weight '.format(tmpSiteName)
             ngMsg += 'criteria=-loweigh'
              okAsPrimary = True
          # use primary if the cap/lock check is passed
          if okAsPrimary:
             weightMap = weightMapPrimary
         else:
             weightMap = weightMapSecondary
         # add weight
         if not weight in weightMap:
             weightMap[weight] = []
         weightMap[weight].append((siteCandidateSpec,okMsg,ngMsg))
      # use secondary candidates if no primary candidates passed the cap/lock check
     if weightMapPrimary == {}:
         tmpLog.debug('use second candidates since no sites pass cap/lock check')
         weightMap = weightMapSecondary
          # use the highest 3 weights
         weightRank = 3
     else:
         weightMap = weightMapPrimary
         # use all weights
         weightRank = None
         # dump NG message
         for tmpWeight in weightMapSecondary.keys():
             for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMapSecondary[tmpWeight]:
                 tmpLog.debug(tmpNgMsg)
     # max candidates for WORLD
     if taskSpec.useWorldCloud():
         maxSiteCandidates = 10
     else:
         maxSiteCandidates = None
     newScanSiteList = []
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightIdx,tmpWeight in enumerate(weightList):
         for siteCandidateSpec,tmpOkMsg,tmpNgMsg in weightMap[tmpWeight]:
             if (weightRank == None or weightIdx < weightRank) and \
                     (maxSiteCandidates == None or len(newScanSiteList) < maxSiteCandidates):
                 # use site
                 tmpLog.debug(tmpOkMsg)
                 newScanSiteList.append(siteCandidateSpec.siteName)
                 inputChunk.addSiteCandidate(siteCandidateSpec)
             else:
                 # dump NG message
                 tmpLog.debug(tmpNgMsg)
     scanSiteList = newScanSiteList
     # final check
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         self.sendLogMessage(tmpLog)
         return retTmpError
     # lock sites for WORLD
     if taskSpec.useWorldCloud():
         for tmpSiteName in scanSiteList:
             self.lockSite(taskSpec.vo,taskSpec.prodSourceLabel,tmpSiteName,taskSpec.workQueue_ID)
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     # return
     self.sendLogMessage(tmpLog)
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
Example #29
 def runImpl(self):
      # cutoff for disk space: 5 TB
     diskThreshold = 5 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
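      # thresholds for the data locality check: minimum total input size/number of files above which a nucleus
      # must locally hold the given fractions of the input, plus the RW cutoff and the tape de-weighting factor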
     thrInputSize = 1024*1024*1024
     thrInputNum = 100
     thrInputSizeFrac = 0.1
     thrInputNumFrac = 0.1
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check endpoint
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.debug('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             if tmpSpaceSize < diskThreshold:
                                 tmpLog.debug('  skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                        tmpSpaceSize,
                                                                                                                                        diskThreshold,
                                                                                                                                        tmpEP['state']))
                                 toSkip = True
                                 break
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                         availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                         availableData[tmpNucleus]['tot_size'],
                                                                                                                                         thrInputSizeFrac))
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpLog.debug('  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                           availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                           availableData[tmpNucleus]['tot_num'],
                                                                                                                                           thrInputNumFrac))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                         # use any nuclei where input is available if no sites can run jobs
                         tmpRet = tmpSiteList
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # RW
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             weight *= float(availableData[tmpNucleus]['ava_size_any'])
                             weight /= float(availableData[tmpNucleus]['tot_size'])
                             wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                             wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                             # negative weight for tape
                             if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                 weight *= negWeightTape
                                 wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                         tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
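                      # weighted random sampling: pick a nucleus with probability proportional to its weight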
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
Example #30
 def findMissingFiles(self,jediTaskID,cloudName):
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(jediTaskID))
     tmpLog.debug('start findMissingFiles')
     # return for failure
     retError = self.SC_FAILED
     # get datasets
     tmpSt,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(jediTaskID,['input'],True)
     if not tmpSt:
         tmpLog.error('failed to get the list of datasets')
         return retError
     # loop over all datasets
     for datasetSpec in datasetSpecList: 
         # check only master dataset
         if not datasetSpec.isMaster():
             continue
         tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
         # get ddmIF
         ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
         if ddmIF == None:
             tmpLog.error('failed to get DDM I/F for vo={0}'.format(datasetSpec.vo))
             return retError
         # get the list of sites where data is available
         tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,ddmIF,
                                                          datasetSpec.datasetName)
         if tmpSt != self.SC_SUCCEEDED:
             tmpLog.error('failed to get the list of sites where {0} is available, since {1}'.format(datasetSpec.datasetName,
                                                                                                     tmpRet))
             return retError
         dataSiteMap = tmpRet
         # data is unavailable in cloud
         if not dataSiteMap.has_key(cloudName):
             tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(datasetSpec.datasetName,cloudName,str(dataSiteMap)))
             return retError
         # mapping between sites and storage endpoints
         checkedSites = [self.siteMapper.getCloud(cloudName)['source']]+dataSiteMap[cloudName]['t2']
         siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(checkedSites,self.siteMapper)
         # get available files per site/endpoint                                                                                     
         tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                                siteStorageEP,
                                                self.siteMapper,
                                                ngGroup=[1],
                                                checkLFC=True)
         if tmpAvFileMap == None:
             tmpLog.error('failed to get available file list for {0}'.format(datasetSpec.datasetName))
             return retError
         # check availability
         missingFiles = []
         for fileSpec in datasetSpec.Files:
             fileFound = False
             for tmpSiteName,availableFilesMap in tmpAvFileMap.iteritems():
                 for tmpStorageType,availableFiles in availableFilesMap.iteritems():
                     for availableFile in availableFiles:
                         if fileSpec.lfn == availableFile.lfn:
                             fileFound = True
                             break
                     if fileFound:
                         break
                 if fileFound:
                     break
             # missing
             if not fileFound:
                 missingFiles.append(fileSpec.fileID)
                 tmpLog.debug('{0} missing'.format(fileSpec.lfn))
         # update contents
         if missingFiles != []:        
             tmpSt = self.taskBufferIF.setMissingFiles_JEDI(jediTaskID,datasetSpec.datasetID,missingFiles)
             if not tmpSt:
                 tmpLog.error('failed to set missing files in {0}'.format(datasetSpec.datasetName))
                 return retError
     tmpLog.debug('done findMissingFiles')
     return self.SC_SUCCEEDED
Example #31
 def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
     # variables for submission
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug("start doBrokerage")
     # return for failure
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     tmpLog.debug("vo={0} label={1} queue={2}".format(vo, prodSourceLabel, workQueue.queue_name))
     # loop over all tasks
     allRwMap = {}
     prioMap = {}
     tt2Map = {}
     expRWs = {}
     jobSpecList = []
     for tmpJediTaskID, tmpInputList in inputList:
         for taskSpec, cloudName, inputChunk in tmpInputList:
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel = "managed"
             jobSpec.processingType = taskSpec.processingType
             jobSpec.workingGroup = taskSpec.workingGroup
             jobSpec.metadata = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority = taskSpec.currentPriority
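              # expected disk usage converted from bytes to MB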
             jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if setProdDBlock == False and prodDBlock != None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID] = jobSpec.processingType
             # get RW for a priority
             if not allRwMap.has_key(jobSpec.currentPriority):
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue, jobSpec.currentPriority
                 )
                 if tmpRW == None:
                     tmpLog.error("failed to calculate RW with prio={0}".format(jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
             if expRW == None:
                 tmpLog.error("failed to calculate RW for jediTaskID={0}".format(jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # get fullRWs
     fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
     if fullRWs == None:
         tmpLog.error("failed to calculate full RW")
         return retTmpError
     # set metadata
     for jobSpec in jobSpecList:
         rwValues = allRwMap[jobSpec.currentPriority]
         jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
             jobSpec.metadata,
             str(rwValues),
             str(expRWs),
             str(prioMap),
             str(fullRWs),
             str(tt2Map),
         )
     tmpLog.debug("run task assigner for {0} tasks".format(len(jobSpecList)))
     nBunchTask = 0
     while nBunchTask < len(jobSpecList):
         # get a bunch
         jobsBunch = jobSpecList[nBunchTask : nBunchTask + maxBunchTask]
         strIDs = "jediTaskID="
         for tmpJobSpec in jobsBunch:
             strIDs += "{0},".format(tmpJobSpec.taskID)
         strIDs = strIDs[:-1]
         tmpLog.debug(strIDs)
         # increment index
         nBunchTask += maxBunchTask
          # run task brokerage
         stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
         tmpLog.debug("{0}:{1}".format(stS, str(outSs)))
     # return
     tmpLog.debug("done")
     return self.SC_SUCCEEDED
Example #32
 def doSplit(self,taskSpec,inputChunk,siteMapper):
     # return for failure
     retFatal    = self.SC_FATAL,[]
     retTmpError = self.SC_FAILED,[]
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0} datasetID={1}>'.format(taskSpec.jediTaskID,inputChunk.masterIndexName))
     tmpLog.debug('start')
     if not inputChunk.isMerging:
         # set maxNumFiles using taskSpec if specified
         maxNumFiles = taskSpec.getMaxNumFilesPerJob()
         # set fsize gradients using taskSpec
         sizeGradients  = taskSpec.getOutDiskSize()
         # set fsize intercepts using taskSpec                
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # walltime
         if not taskSpec.useHS06():
             walltimeGradient = taskSpec.walltime
         else:
             walltimeGradient = taskSpec.cpuTime
         # number of events per job if defined
         nEventsPerJob = taskSpec.getNumEventsPerJob()
         # number of files per job if defined
         if not taskSpec.dynamicNumEvents():
             nFilesPerJob = taskSpec.getNumFilesPerJob()
         else:
             nFilesPerJob = None
         if nFilesPerJob == None and nEventsPerJob == None and inputChunk.useScout() \
                 and not taskSpec.useLoadXML() and not taskSpec.respectSplitRule():
             nFilesPerJob = 1
         # grouping with boundaryID
         useBoundary = taskSpec.useGroupWithBoundaryID()
         # fsize intercepts per input size
         sizeGradientsPerInSize = None
          # max primary output size
         maxOutSize = None
         # max size per job
         maxSizePerJob = taskSpec.getMaxSizePerJob()
         if maxSizePerJob is not None:
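              # reserve additional room for the default output size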
             maxSizePerJob += InputChunk.defaultOutputSize
         # dynamic number of events
         dynNumEvents = taskSpec.dynamicNumEvents()
         # max number of event ranges
         maxNumEventRanges = None
         # multiplicity of jobs
         if taskSpec.useJobCloning():
             multiplicity = 1
         else:
             multiplicity = taskSpec.getNumEventServiceConsumer()
         # split with fields
         if taskSpec.getFieldNumToLFN() != None and taskSpec.useFileAsSourceLFN():
             splitByFields = taskSpec.getFieldNumToLFN()
         else:
             splitByFields = None
     else:
         # set parameters for merging
         maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob()
         sizeGradients = 0
         walltimeGradient = 0
         nFilesPerJob = taskSpec.getNumFilesPerMergeJob()
         nEventsPerJob = taskSpec.getNumEventsPerMergeJob()
         maxSizePerJob = None
         useBoundary = {'inSplit':3}
         dynNumEvents = False
         maxNumEventRanges = None
         multiplicity = None
         # gradients per input size is 1 + margin
         sizeGradientsPerInSize = self.sizeGradientsPerInSizeForMerge
         # intercepts for libDS
         sizeIntercepts = taskSpec.getWorkDiskSize()
          # margin of 500MB for merging
         interceptsMergin = self.interceptsMerginForMerge
         if sizeIntercepts < interceptsMergin:
             sizeIntercepts = interceptsMergin
         maxOutSize = taskSpec.getMaxSizePerMergeJob()
         if maxOutSize == None:
             # max output size is 5GB for merging by default
             maxOutSize = 5 * 1024 * 1024 * 1024
         # split with fields
         if taskSpec.getFieldNumToLFN() != None and taskSpec.useFileAsSourceLFN():
             splitByFields = range(4+1,4+1+len(taskSpec.getFieldNumToLFN()))
         else:
             splitByFields = None
     # LB
     respectLB = taskSpec.respectLumiblock()
     # dump
     tmpLog.debug('maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}'.format(maxNumFiles,
                                                                                                sizeGradients,
                                                                                                sizeIntercepts,
                                                                                                useBoundary))
     tmpLog.debug('walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format(walltimeGradient,
                                                                                     nFilesPerJob,
                                                                                     nEventsPerJob))
     tmpLog.debug('sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2} dynNumEvents={3}'.format(sizeGradientsPerInSize,
                                                                                                    maxOutSize,
                                                                                                    respectLB,
                                                                                                    dynNumEvents))
     tmpLog.debug('multiplicity={0} splitByFields={1} nFiles={2}'.format(multiplicity,str(splitByFields),
                                                                         inputChunk.getNumFilesInMaster()))
     # split
     returnList = []
     subChunks  = []
     iSubChunks = 0
     nSubChunks = 25
     subChunk   = None
     while True:
         # change site
         if iSubChunks % nSubChunks == 0 or subChunk == []:
             # append to return map
             if subChunks != []:
                 # get site names for parallel execution
                 if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging and inputChunk.useJumbo != 'fake':
                     siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(),
                                                            nSubChunks,[siteName])
                 returnList.append({'siteName':siteName,
                                    'subChunks':subChunks,
                                    'siteCandidate':siteCandidate,
                                    })
                 tmpLog.debug('split to %s subchunks' % len(subChunks))
                 # reset
                 subChunks = []
             # skip unavailable files in distributed datasets
             nSkip = inputChunk.skipUnavailableFiles()
             tmpLog.debug('skipped {0} files'.format(nSkip))
             # new candidate
             siteCandidate = inputChunk.getOneSiteCandidate(nSubChunks)
              if siteCandidate is None:
                 break
             siteName = siteCandidate.siteName
             siteSpec = siteMapper.getSite(siteName)
             # directIO
             if taskSpec.useLocalIO() or not siteSpec.isDirectIO() or taskSpec.allowInputLAN() is None \
                     or inputChunk.isMerging:
                 useDirectIO = False
             else:
                 useDirectIO = True
             # get maxSize if it is set in taskSpec
             maxSize = maxSizePerJob
              if maxSize is None:
                 # use maxwdir as the default maxSize
                 if not useDirectIO:
                     maxSize = siteSpec.maxwdir * 1024 * 1024
                 elif nEventsPerJob is not None or nFilesPerJob is not None:
                     maxSize = None
                 else:
                     maxSize = max(50000, siteSpec.maxwdir) * 1024 * 1024
             else:
                 # add offset
                 maxSize += sizeIntercepts
             # max disk size
             maxDiskSize = siteSpec.maxwdir * 1024 * 1024
             # max walltime
             maxWalltime = None
             if not inputChunk.isMerging:
                 maxWalltime = taskSpec.getMaxWalltime()
             if maxWalltime is None:
                 maxWalltime = siteSpec.maxtime
             # core count
             if siteSpec.coreCount > 0:
                 coreCount = siteSpec.coreCount
             else:
                 coreCount = 1
             # core power
             corePower = siteSpec.corepower
              # max num of event ranges for dynNumEvents
              if dynNumEvents:
                  maxNumEventRanges = int(siteSpec.get_n_sim_events() / taskSpec.get_min_granularity())
                  if maxNumEventRanges == 0:
                      maxNumEventRanges = 1
              else:
                  # not used unless dynNumEvents is set, but keep it defined for the dump and getSubChunk below
                  maxNumEventRanges = None
             tmpLog.debug('chosen {0}'.format(siteName))
             tmpLog.debug('new weight {0}'.format(siteCandidate.weight))
             tmpLog.debug('maxSize={0} maxWalltime={1} coreCount={2} corePower={3} maxNumEventRanges={4} maxDisk={5}'.format(maxSize,maxWalltime,
                                                                                                                             coreCount,corePower,
                                                                                                                             maxNumEventRanges,
                                                                                                                             maxDiskSize))
             tmpLog.debug('useDirectIO={0} label={1}'.format(useDirectIO, taskSpec.prodSourceLabel))
         # get sub chunk
         subChunk = inputChunk.getSubChunk(siteName,maxSize=maxSize,
                                           maxNumFiles=maxNumFiles,
                                           sizeGradients=sizeGradients,
                                           sizeIntercepts=sizeIntercepts,
                                           nFilesPerJob=nFilesPerJob,
                                           walltimeGradient=walltimeGradient,
                                           maxWalltime=maxWalltime,
                                           nEventsPerJob=nEventsPerJob,
                                           useBoundary=useBoundary,
                                           sizeGradientsPerInSize=sizeGradientsPerInSize,
                                           maxOutSize=maxOutSize,
                                           coreCount=coreCount,
                                           respectLB=respectLB,
                                           corePower=corePower,
                                           dynNumEvents=dynNumEvents,
                                           maxNumEventRanges=maxNumEventRanges,
                                           multiplicity=multiplicity,
                                           splitByFields=splitByFields,
                                           tmpLog=tmpLog,
                                           useDirectIO=useDirectIO,
                                           maxDiskSize=maxDiskSize)
          if subChunk is None:
             break
         if subChunk != []:
             # append
             subChunks.append(subChunk)
         iSubChunks += 1
     # append to return map if remain
     if subChunks != []:
         # get site names for parallel execution
         if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging:
             siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(),
                                                    nSubChunks,[siteName])
         returnList.append({'siteName':siteName,
                            'subChunks':subChunks,
                            'siteCandidate':siteCandidate,
                            })
         tmpLog.debug('split to %s subchunks' % len(subChunks))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED,returnList
 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
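      """Decide whether job generation for this vo/label/cloud/work queue should be throttled.

      The decision is based on the numbers of running, queued, defined and waiting jobs and on
      whether higher-priority tasks are waiting; returns one of the predefined verdicts such as
      self.retThrottled, self.retUnThrottled, self.retMergeUnThr or self.retTmpError.
      """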
     # params
     nBunch = 4
     threshold = 2.0
     thresholdForSite = threshold - 1.0
     nJobsInBunchMax = 500
     nJobsInBunchMin = 300
     nJobsInBunchMaxES = 1000
     nWaitingLimit = 4
     nWaitingBunchLimit = 2
     nParallel = 8
     # make logger
     tmpLog = MsgWrapper(logger)
     workQueueIDs = workQueue.getIDs()
     msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
     tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         msgBody = "SKIP cloud={0} undefined".format(cloudName)
         tmpLog.debug(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
         tmpLog.debug(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     if cloudSpec['status'] in ['test']:
         if workQueue.queue_name != 'test':
             msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                               workQueue.queue_name)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             tmpLog.debug(msgHeader+" "+msgBody)
             return self.retThrottled
     # check if unthrottled
      if workQueue.queue_share is None:
         msgBody = "PASS unthrottled since share=None"
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retUnThrottled
     # count number of jobs in each status
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     for workQueueID in workQueueIDs:
          if cloudName in jobStat and workQueueID in jobStat[cloudName]:
              tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
              for pState,pNumber in jobStat[cloudName][workQueueID].items():
                 if pState in ['running']:
                     nRunning += pNumber
                 elif pState in ['assigned','activated','starting']:
                     nNotRun  += pNumber
                 elif pState in ['defined']:
                     nDefine  += pNumber
                 elif pState in ['waiting']:
                     nWaiting += pNumber
     # check if higher prio tasks are waiting
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks 
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
      if highestPrioWaiting is None:
         msgBody = 'failed to get the highest priority of waiting tasks'
         tmpLog.error(msgHeader+" "+msgBody)
         return self.retTmpError
     # high priority tasks are waiting
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                      nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                              highestPrioInPandaDB,
                                                                                                              nNotRunHighestPrio,
                                                                                                              highPrioQueued))
     # set maximum number of jobs to be submitted
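      # remaining slots before nQueued/nRunning would exceed the threshold ratio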
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     else:
         if workQueue.queue_name in ['evgensimul']:
             # use higher limit for evgensimul
             if tmpRemainingSlot < nJobsInBunchMaxES:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMaxES
         else:
             if tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     nQueueLimit = nJobsInBunch*nBunch
     # use special limit for CERN
     if cloudName == 'CERN':
         nQueueLimit = 2000
     # use nPrestage for reprocessing   
     if workQueue.queue_name in ['reprocessing']:
          if 'nprestage' in cloudSpec and cloudSpec['nprestage'] > 0:
             nQueueLimit = cloudSpec['nprestage']
             # reset nJobsInBunch
             if nQueueLimit > (nNotRun+nDefine):
                 tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
                 if tmpRemainingSlot < nJobsInBunch:
                     pass
                 elif tmpRemainingSlot < nJobsInBunchMax:
                     nJobsInBunch = tmpRemainingSlot
                 else:
                     nJobsInBunch = nJobsInBunchMax
     # set number of jobs to be submitted
      self.setMaxNumJobs(nJobsInBunch // nParallel)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     limitPriority = False
     tmpLog.debug(msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3}".format(nQueueLimit,
                                                                                           nNotRun+nDefine,
                                                                                           nDefine,
                                                                                           nRunning))
     # check when high prio tasks are not waiting
     if not highPrioQueued:
         if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit:
             limitPriority = True
             # pilot is not running or DDM has a problem
             msgBody = "SKIP no running and enough nQueued({0})>{1}".format(nNotRun+nDefine,nQueueLimit)
             tmpLog.debug(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             return self.retMergeUnThr
         elif nRunning != 0 and float(nNotRun)/float(nRunning) > threshold and (nNotRun+nDefine) > nQueueLimit:
             limitPriority = True
             # enough jobs in Panda
             msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued+Defined({3})>{4}".format(nNotRun,nRunning,
                                                                                               threshold,nNotRun+nDefine,
                                                                                               nQueueLimit)
             tmpLog.debug(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             return self.retMergeUnThr
         elif nDefine > nQueueLimit:
             limitPriority = True
             # brokerage is stuck
             msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
             tmpLog.debug(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             return self.retMergeUnThr
         elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
             limitPriority = True
             # too many waiting
             msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                           nJobsInBunch,nWaitingBunchLimit)
             tmpLog.debug(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             return self.retMergeUnThr
     # get jobs from prodDB
     limitPriorityValue = None
     if limitPriority:
         limitPriorityValue = highestPrioInPandaDB
         self.setMinPriority(limitPriorityValue)
     msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
     tmpLog.debug(msgHeader+" "+msgBody)
     return self.retUnThrottled
Example #34
    def doActionForReassign(self,gTmpLog):
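        """Reassign tasks flagged for reassignment.

        For world-cloud tasks the nucleus is re-set (or brokerage is re-triggered when no nucleus is
        defined); otherwise the cloud assignment is updated. Output/log datasets are then subscribed
        to the destination endpoint and the task is put back into a runnable status.
        """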
        # get DDM I/F
        ddmIF = self.ddmIF.getInterface(self.vo)
        # get site mapper
        siteMapper = self.taskBufferIF.getSiteMapper()
        # get tasks to get reassigned
        taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)

        gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
        for taskSpec in taskList:
            tmpLog = MsgWrapper(logger, '< jediTaskID={0} >'.format(taskSpec.jediTaskID))
            tmpLog.debug('start to reassign')
            # DDM backend
            ddmBackEnd = taskSpec.getDdmBackEnd()
            # get datasets
            tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
            if tmpStat is not True:
                tmpLog.error('failed to get datasets')
                continue
            # update DB
            if not taskSpec.useWorldCloud():
                # update cloudtasks
                tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
                if tmpStat != 'SUCCEEDED':
                    tmpLog.error('failed to update CloudTasks')
                    continue
                # check cloud
                if not siteMapper.checkCloud(taskSpec.cloud):
                    tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                    continue
            else:
                # re-run task brokerage
                if taskSpec.nucleus in [None,'']:
                    taskSpec.status = 'assigning'
                    taskSpec.oldStatus = None
                    taskSpec.setToRegisterDatasets()
                    self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID': taskSpec.jediTaskID},
                                                      setOldModTime=True)
                    tmpLog.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'.
                                 format(taskSpec.status))
                    continue

                # get nucleus
                nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
                if nucleusSpec is None:
                    tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                    continue

                # set nucleus
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)

            # get T1/nucleus
            if not taskSpec.useWorldCloud():
                t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
            else:
                t1SiteName = nucleusSpec.getOnePandaSite()
            t1Site = siteMapper.getSite(t1SiteName)

            # loop over all datasets
            isOK = True
            for datasetSpec in datasetSpecList:
                tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
                if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                    tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                    continue
                # get location
                location = siteMapper.getDdmEndpoint(t1Site.sitename, datasetSpec.storageToken, taskSpec.prodSourceLabel,
                                                     JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                # make subscription
                try:
                    tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                           ddmBackEnd))
                    tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                                'Production Output',asynchronous=True)
                    if tmpStat is not True:
                        tmpLog.error("failed to make subscription")
                        isOK = False
                        break
                except Exception:
                    errtype,errvalue = sys.exc_info()[:2]
                    tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                    isOK = False
                    break
            # succeeded
            if isOK:
                # activate task
                if taskSpec.oldStatus in ['assigning','exhausted',None]:
                    taskSpec.status = 'ready'
                else:
                    taskSpec.status = taskSpec.oldStatus
                taskSpec.oldStatus = None
                self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('finished to reassign')
Example #35
    def runImpl(self):
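        """Worker loop of the contents feeder.

        Takes task/dataset pairs from the shared list, reads the file list of each dataset from DDM
        (or builds dummy records for pseudo datasets and sequence numbers), inserts the files into
        the contents table, and updates the task status (pending, broken or ready) accordingly.
        """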
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' %
                                      self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID, dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    # make logger
                    tmpLog = MsgWrapper(
                        self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                    # get task
                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                        jediTaskID, False, True, self.pid, 10)
                    if not tmpStat or taskSpec is None:
                        tmpLog.error(
                            'failed to get taskSpec for jediTaskID={0}'.format(
                                jediTaskID))
                        continue
                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                            jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error(
                            'task param conversion from json failed with {0}:{1}'
                            .format(errtype.__name__, errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if 'nEventsPerInputFile' in taskParamMap:
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = None
                    if 'nFilesPerJob' in taskParamMap:
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # the number of chunks used by scout
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(
                            taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(
                                    tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                        origNumFiles = None
                        if 'nFiles' in taskParamMap:
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(
                                datasetSpec.datasetName,
                                datasetSpec.datasetID))
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(
                                        datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state': 'closed'}
                                # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state': 'mutable'}
                                gotMetadata = True
                            except Exception:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error(
                                    '{0} failed to get metadata to {1}:{2}'.
                                    format(self.__class__.__name__,
                                           errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(
                                        datasetSpec, datasetStatus, tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                taskSpec.setErrDiag(
                                    'failed to get metadata for {0}'.format(
                                        datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                    taskParamMap, datasetSpec.datasetName)
                                # get the number of events in metadata
                                getNumEvents = 'getNumEventsInMetadata' in taskParamMap
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles(
                                    )
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                                datasetSpec.containerName not in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(
                                            tmpDatasetName,
                                            getNumEvents=getNumEvents,
                                            skipDuplicate=skipDuplicate,
                                            longFormat=longFormat)
                                        tmpLog.debug(
                                            'got {0} files in {1}'.format(
                                                len(tmpRet), tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(
                                            tmpDatasetName, tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug(
                                                'found {0} lost files in {1}'.
                                                format(len(tmpLostFiles),
                                                       tmpDatasetName))
                                            for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                                tmpLog.debug(
                                                    'removed {0}'.format(
                                                        tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() is not None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles is not None:
                                                nPFN = origNumFiles
                                                if 'nEventsPerJob' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] // taskParamMap['nEventsPerJob']
                                                elif 'nEventsPerFile' in taskParamMap and 'nEventsPerRange' in taskParamMap:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] // taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] // taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and 'nFilesPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] // taskParamMap['nEventsPerFile'] // taskParamMap['nFilesPerJob']
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                    datasetSpec.jediTaskID,
                                                    datasetSpec.masterID,
                                                    ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt[
                                                        'nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt[
                                                        'nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy
                                                if skipFilesUsedBy is not None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID, {'datasetName': datasetSpec.datasetName}, ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn': iPFN + tmpOffset,
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {
                                                str(uuid.uuid4()): {
                                                    'lfn': 'pseudo_lfn',
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                            }
                                        else:
                                            # make dummy file list for PFN list
                                            if 'nFiles' in taskParamMap:
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn':
                                                    '{0:06d}:{1}'.format(
                                                        iPFN,
                                                        taskParamMap['pfnList']
                                                        [iPFN].split('/')[-1]),
                                                    'scope':
                                                    None,
                                                    'filesize':
                                                    0,
                                                    'checksum':
                                                    None,
                                                }
                                except Exception:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    tmpLog.error(
                                        'failed to get files due to {0}:{1} {2}'
                                        .format(self.__class__.__name__,
                                                errtype.__name__, errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag(
                                        'failed to get files for {0}'.format(
                                            datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents(
                                        )
                                    # the number of events per file
                                    nEventsPerFile = None
                                    nEventsPerJob = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and ('nEventsPerFile' in taskParamMap or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and 'nEvents' in taskParamMap and not datasetSpec.isSeqNumber()):
                                        if 'nEventsPerFile' in taskParamMap:
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if 'nEventsPerJob' in taskParamMap:
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif 'nEventsPerRange' in taskParamMap:
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry
                                            maxAttempt = 1
                                        elif 'maxAttempt' in taskParamMap:
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap[
                                                'maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None
                                    if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                            # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                                if 'nEventsPerJob' in taskParamMap:
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile']) / float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif 'nEventsPerRange' in taskParamMap:
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile']) / float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False
                                    if datasetSpec.isMaster() and taskSpec.useScout() and \
                                            (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and \
                                            'useInFilesWithNewAttemptNr' in taskParamMap:
                                        useFilesWithNewAttemptNr = True
                                    #ramCount
                                    ramCount = 0

                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet,
                                        tmpMetadata['state'], stateUpdateTime,
                                        nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber,
                                        nMaxFiles, nMaxEvents, useScout,
                                        fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange,
                                        nChunksForScout, includePatt,
                                        excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid,
                                        maxFailure, useRealNumEvents,
                                        respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
                                    if retDB is False:
                                        taskSpec.setErrDiag(
                                            'failed to insert files for {0}. {1}'
                                            .format(datasetSpec.datasetName,
                                                    diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB is None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug(
                                            'escape since task or dataset is locked'
                                        )
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(
                                            len(missingFileList),
                                            datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {
                                            'datasetSpec': datasetSpec,
                                            'missingFiles': missingFileList
                                        }
                                    else:
                                        # reduce the number of files to be read
                                        if 'nFiles' in taskParamMap:
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap[
                                                'nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr, None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if taskSpec.status not in ['broken', 'tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task.status={0}'.format(
                                taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                taskSpec,
                                pid=self.pid,
                                setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                getTaskStatus=True,
                                pid=self.pid,
                                useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task.status={0}'.format(
                                newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(
                            retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
Example #36
 def runImpl(self):
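      """Worker loop of the task brokerage.

      Reads thresholds from the JEDI config, then for each task and input chunk builds the list of
      candidate nuclei, filtering them by status and by criteria such as data availability.
      """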
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                     'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei(
                         )
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(
                                     tmpDatasetSpec.storageToken
                             ) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP[
                                 'space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(
                                     self.fullRW[tmpNucleus] / 10 / 24 /
                                     3600 * 0.25)
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check {1} TB'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName
                         ) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                         if taskParamMap.get(
                                 'taskBrokerOnMaster'
                         ) is True and not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                         if datasetSpec.isMaster(
                         ) and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if taskSpec.inputPreStaging(
                             ) and availableData[tmpNucleus][
                                     'ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_size_any'],
                                     availableData[tmpNucleus]['tot_size'],
                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_num_any'],
                                     availableData[tmpNucleus]['tot_num'],
                                     thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         totInputSize = list(availableData.values(
                         ))[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[
                                 tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(
                                 nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(
                                 nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]
                                                 ['ava_size_any'])
                                 weight /= float(
                                     availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                     availableData[tmpNucleus]
                                     ['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(
                                     availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus][
                                         'ava_size_any'] > availableData[
                                             tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(
                                         negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
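Example #36 scores each surviving nucleus and then draws one at random with probability proportional to its weight. The final-selection loop is a standard cumulative-weight draw; here is a self-contained sketch of that step, independent of the JEDI classes:

import random

def pick_weighted(candidates):
    # candidates: list of (name, weight) pairs with non-negative weights
    total = sum(weight for _, weight in candidates)
    target = random.uniform(0, total)
    for name, weight in candidates:
        target -= weight
        if target <= 0:
            return name
    # floating-point leftovers: fall back to the last candidate, as the example does
    return candidates[-1][0]

# example usage
print(pick_weighted([('NUCLEUS_A', 0.7), ('NUCLEUS_B', 0.2), ('NUCLEUS_C', 0.1)]))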
Example #37
 def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue,
                 resource_name):
     # list with a lock
     inputListWorld = ListWithLock([])
     # variables for submission
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doBrokerage')
     # return for failure
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     tmpLog.debug(
         'vo={0} label={1} queue={2} resource_name={3} nTasks={4}'.format(
             vo, prodSourceLabel, workQueue.queue_name, resource_name,
             len(inputList)))
     # loop over all tasks
     allRwMap = {}
     prioMap = {}
     tt2Map = {}
     expRWs = {}
     jobSpecList = []
     for tmpJediTaskID, tmpInputList in inputList:
         for taskSpec, cloudName, inputChunk in tmpInputList:
             # collect tasks for WORLD
             if taskSpec.useWorldCloud():
                 inputListWorld.append((taskSpec, inputChunk))
                 continue
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel = 'managed'
             jobSpec.processingType = taskSpec.processingType
             jobSpec.workingGroup = taskSpec.workingGroup
             jobSpec.metadata = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority = taskSpec.currentPriority
             jobSpec.maxDiskCount = (
                 taskSpec.getOutDiskSize() +
                 taskSpec.getWorkDiskSize()) // 1024 // 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(
                         datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if setProdDBlock is False and prodDBlock is not None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID] = jobSpec.processingType
             # get RW for a priority
             if jobSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue,
                     jobSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error(
                         'failed to calculate RW with prio={0}'.format(
                             jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(
                 jobSpec.jediTaskID)
             if expRW is None:
                 tmpLog.error(
                     'failed to calculate RW for jediTaskID={0}'.format(
                         jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # for old clouds
     if jobSpecList != []:
         # get fullRWs
         fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
             vo, prodSourceLabel, None, None)
         if fullRWs is None:
             tmpLog.error('failed to calculate full RW')
             return retTmpError
         # set metadata
         for jobSpec in jobSpecList:
             rwValues = allRwMap[jobSpec.currentPriority]
             jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
                 jobSpec.metadata, str(rwValues), str(expRWs), str(prioMap),
                 str(fullRWs), str(tt2Map))
         tmpLog.debug('run task assigner for {0} tasks'.format(
             len(jobSpecList)))
         nBunchTask = 0
         while nBunchTask < len(jobSpecList):
             # get a bunch
             jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
             strIDs = 'jediTaskID='
             for tmpJobSpec in jobsBunch:
                 strIDs += '{0},'.format(tmpJobSpec.taskID)
             strIDs = strIDs[:-1]
             tmpLog.debug(strIDs)
             # increment index
             nBunchTask += maxBunchTask
             # run task brokerage
             stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
             tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
     # for WORLD
     if len(inputListWorld) > 0:
         # thread pool
         threadPool = ThreadPool()
         # get full RW for WORLD
         fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
             vo, prodSourceLabel, None, None)
         if fullRWs is None:
             tmpLog.error('failed to calculate full WORLD RW')
             return retTmpError
         # get RW per priority
         for taskSpec, inputChunk in inputListWorld:
             if taskSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue,
                     taskSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error(
                         'failed to calculate RW with prio={0}'.format(
                             taskSpec.currentPriority))
                     return retTmpError
                 allRwMap[taskSpec.currentPriority] = tmpRW
         # live counter for RWs
         liveCounter = MapWithLock(allRwMap)
         # make workers
         ddmIF = self.ddmIF.getInterface(vo)
         for iWorker in range(4):
             thr = AtlasProdTaskBrokerThread(inputListWorld, threadPool,
                                             self.taskBufferIF, ddmIF,
                                             fullRWs, liveCounter,
                                             workQueue)
             thr.start()
         threadPool.join(60 * 10)
     # return
     tmpLog.debug('doBrokerage done')
     return self.SC_SUCCEEDED
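Example #37 submits the legacy (non-WORLD) tasks to the task assigner in bunches of at most maxBunchTask by slicing jobSpecList with a manual index. The same batching can be written as a small generator; a sketch without the PandaClient.runTaskAssignment call:

def iter_bunches(items, bunch_size=100):
    # yield consecutive slices of at most bunch_size items
    for start in range(0, len(items), bunch_size):
        yield items[start:start + bunch_size]

# example usage: each bunch would go to the task assigner in one call
for bunch in iter_bunches(list(range(250)), bunch_size=100):
    print(len(bunch))  # prints 100, 100, 50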
Example #38
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['',None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     # get job statistics
     tmpSt,jobStatMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # T1 
     t1Sites = [self.siteMapper.getCloud(cloudName)['source']]
     # hospital sites
     if self.hospitalQueueMap.has_key(cloudName):
         t1Sites += self.hospitalQueueMap[cloudName]
     # MP    
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         useMP = True
     else:
         useMP = False
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for reprocessing
     if taskSpec.processingType == 'reprocessing':
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check schedconfig.validatedreleases
             if tmpSiteSpec.validatedreleases == ['True']:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to validatedreleases != True' % tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for reprocessing'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for high priorities
     if taskSpec.currentPriority >= 950 and not useMP:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:            
             if tmpSiteName in t1Sites:
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip %s due to high prio which needs to run at T1' % tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for high prio'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for data availability
     for datasetSpec in inputChunk.getDatasets():
         datasetName = datasetSpec.datasetName
         if not self.dataSiteMap.has_key(datasetName):
             # get the list of sites where data is available
             tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
             tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,
                                                              self.ddmIF,datasetName)
             if tmpSt == self.SC_FAILED:
                 tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             if tmpSt == self.SC_FATAL:
                 tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
             # append
             self.dataSiteMap[datasetName] = tmpRet
             tmpLog.debug('map of data availability : {0}'.format(str(tmpRet)))
         # check if T1 has the data
         if self.dataSiteMap[datasetName].has_key(cloudName):
             cloudHasData = True
         else:
             cloudHasData = False
         t1hasData = False
         if cloudHasData:
             for tmpSE,tmpSeVal in self.dataSiteMap[datasetName][cloudName]['t1'].iteritems():
                 if tmpSeVal['state'] == 'complete':
                     t1hasData = True
                     break
             # T1 has incomplete data while no data at T2
             if not t1hasData and self.dataSiteMap[datasetName][cloudName]['t2'] == []:
                 # use incomplete data at T1 anyway
                 t1hasData = True
         # data is missing at T1         
         if not t1hasData:
             tmpLog.debug('{0} is unavailable at T1. scanning T2 sites in homeCloud={1}'.format(datasetName,cloudName))
             # make subscription to T1
             # FIXME
             pass
             # use T2 until data is complete at T1
             newScanSiteList = []
             for tmpSiteName in scanSiteList:                    
                 if cloudHasData and tmpSiteName in self.dataSiteMap[datasetName][cloudName]['t2']:
                     newScanSiteList.append(tmpSiteName)
                 else:
                     tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                     if tmpSiteSpec.cloud != cloudName:
                         tmpLog.debug('  skip %s due to foreign T2' % tmpSiteName)
                     else:
                         tmpLog.debug('  skip %s due to missing data at T2' % tmpSiteName)
             scanSiteList = newScanSiteList
             tmpLog.debug('{0} candidates passed T2 scan in the home cloud with input:{1}'.format(len(scanSiteList),datasetName))
             if scanSiteList == []:
                 tmpLog.error('no candidates')
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     ######################################
     # selection for fairshare
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if AtlasBrokerUtils.hasZeroShare(tmpSiteSpec,taskSpec,tmpLog):
             tmpLog.debug('  skip {0} due to zero share'.format(tmpSiteName))
             continue
         newScanSiteList.append(tmpSiteName)                
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed zero share check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for I/O intensive tasks
     # FIXME
     pass
     ######################################
     # selection for MP
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if (useMP and tmpSiteSpec.coreCount > 1) or \
            (not useMP and tmpSiteSpec.coreCount in [0,1,None]):
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to core mismatch site:%s != task:%s' % \
                          (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
             # only cache is checked for normal tasks
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      caches=taskSpec.transHome,
                                                                      cmtConfig=taskSpec.architecture)
         else:
             # nightlies
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      releases='nightlies',
                                                                      cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND'] or \
                tmpSiteName in ['CERN-RELEASE']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for ATLAS release {1}:{2}'.format(len(scanSiteList),
                                                                               taskSpec.transHome,
                                                                               taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCount = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCount = minDiskCount / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0 and minDiskCount > tmpSiteSpec.maxwdir:
             tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                  tmpSiteSpec.maxwdir,
                                                                                  minDiskCount))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # don't check for T1
         if tmpSiteName in t1Sites:
             pass
         else:
             # check at the site
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # the number of jobs which will produce outputs
             nRemJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'assigned') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'activated') + \
                        AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running')
             # the size of input files which will be copied to the site
             movingInputSize = self.taskBufferIF.getMovingInputSize_JEDI(tmpSiteName)
             if movingInputSize == None:
                 tmpLog.error('failed to get the size of input file moving to {0}'.format(tmpSiteName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
             # free space - inputs - outputs(250MB*nJobs) must be >= 200GB
             outSizePerJob = 0.250
             diskThreshold = 200
             tmpSpaceSize = tmpSiteSpec.space - movingInputSize - nRemJobs * outSizePerJob
             if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip {0} due to disk shortage in SE = {1}-{2}-{3}x{4} < {5}'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                                     movingInputSize,outSizePerJob,
                                                                                                     nRemJobs,diskThreshold))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}({2})'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for transferring
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limit
         def_maxTransferring = 2000 
         if tmpSiteSpec.transferringlimit == 0:
             # use default value
             maxTransferring   = def_maxTransferring
         else:
             maxTransferring = tmpSiteSpec.transferringlimit
         # check at the site
         nTraJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'transferring',cloud=cloudName)
         nRunJobs = AtlasBrokerUtils.getNumJobs(jobStatMap,tmpSiteName,'running',cloud=cloudName)
         if max(maxTransferring,2*nRunJobs) < nTraJobs and not tmpSiteSpec.cloud in ['ND']:
             tmpLog.debug('  skip %s due to too many transferring %s > max(%s,2x%s)' % \
                          (tmpSiteName,nTraJobs,maxTransferring,nRunJobs))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed transferring check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get available files
     totalSize = 0
     normalizeFactors = {}        
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(scanSiteList,self.siteMapper)
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[1])
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
         # get total size
         totalSize += datasetSpec.getSize()
         # loop over all sites to get the size of available files
         for tmpSiteName in scanSiteList:
             if not normalizeFactors.has_key(tmpSiteName):
                 normalizeFactors[tmpSiteName] = 0
             # get the total size of available files
             if availableFileMap[datasetSpec.datasetName].has_key(tmpSiteName):
                 availableFiles = availableFileMap[datasetSpec.datasetName][tmpSiteName]
                 for tmpFileSpec in \
                         availableFiles['localdisk']+availableFiles['localtape']+availableFiles['cache']:
                     normalizeFactors[tmpSiteName] += tmpFileSpec.fsize
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     for tmpSiteName in scanSiteList:
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',cloudName,taskSpec.workQueue_ID)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'assigned',cloudName,taskSpec.workQueue_ID)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',cloudName,taskSpec.workQueue_ID)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         # normalize weights by taking data availability into account
         if totalSize != 0:
             weight = weight * float(normalizeFactors[tmpSiteName]+totalSize) / float(totalSize)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # set available files
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             if availableFiles.has_key(tmpSiteName):
                 siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                 siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                 siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                 siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
         # append        
         inputChunk.addSiteCandidate(siteCandidateSpec)
         tmpLog.debug('  use {0} with weight={1}'.format(tmpSiteName,weight))
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
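The weight above combines queue pressure with data locality. A minimal standalone sketch of that arithmetic (compute_site_weight and its arguments are hypothetical names, not part of the code above):

def compute_site_weight(n_running, n_activated, n_assigned, available_bytes, total_bytes):
    # favor sites that are draining their queue: many running jobs, few waiting ones
    weight = float(n_running + 1) / float(n_activated + n_assigned + 1) / float(n_assigned + 1)
    # boost sites that already hold a larger fraction of the input data
    if total_bytes != 0:
        weight *= float(available_bytes + total_bytes) / float(total_bytes)
    return weight

# a site holding half of the input scores higher than one holding nothing
print(compute_site_weight(50, 10, 5, 50 * 1024**3, 100 * 1024**3))
print(compute_site_weight(50, 10, 5, 0, 100 * 1024**3))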
Example #39
0
 def doForPreStaging(self):
     try:
         tmpLog = MsgWrapper(logger,'doForPreStaging')
         # lock
         flagLocked = self.taskBufferIF.lockProcess_JEDI(self.vo,self.prodSourceLabel,
                                                         self.cronActions['forPrestage'],
                                                         0,self.pid,timeLimit=5)
         if not flagLocked:
             return
         tmpLog.debug('start')
         # get throttled users
         thrUserTasks = self.taskBufferIF.getThrottledUsersTasks_JEDI(self.vo,self.prodSourceLabel)
         # get dispatch datasets
         dispUserTasks = self.taskBufferIF.getDispatchDatasetsPerUser(self.vo,self.prodSourceLabel,True,True)
         # max size of prestaging requests in MB
         maxPrestaging = self.taskBufferIF.getConfigValue('anal_watchdog', 'PRESTAGE_LIMIT', 'jedi', 'atlas')
         if maxPrestaging == None:
             maxPrestaging = 1
         maxPrestaging *= 1024*1024
         # throttle interval
         thrInterval = 120
         # loop over all users
         for userName,userDict in dispUserTasks.iteritems():
             tmpLog.debug('{0} {1} GB'.format(userName, userDict['size']/1024))
             # too large
             if userDict['size'] > maxPrestaging:
                 tmpLog.debug('{0} has too large prestaging {1}>{2} GB'.format(userName,
                                                                               userDict['size']/1024,
                                                                               maxPrestaging/1024))
                 # throttle tasks
                 for taskID in userDict['tasks']:
                     if not userName in thrUserTasks or not taskID in thrUserTasks[userName]:
                          tmpLog.debug('throttle jediTaskID={0}'.format(taskID))
                         errDiag = 'throttled for {0} min due to too large prestaging from TAPE'.format(thrInterval)
                         self.taskBufferIF.throttleTask_JEDI(taskID,thrInterval,errDiag)
                 # remove the user from the list
                 if userName in thrUserTasks:
                     del thrUserTasks[userName]
         # release users
         for userName,taskIDs in thrUserTasks.items():
             tmpLog.debug('{0} release throttled tasks'.format(userName))
             # unthrottle tasks
             for taskID in taskIDs:
                  tmpLog.debug('unthrottle jediTaskID={0}'.format(taskID))
                 self.taskBufferIF.releaseThrottledTask_JEDI(taskID)
         tmpLog.debug('done')
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype,errvalue,traceback.format_exc()))
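The per-user bookkeeping above reduces to: throttle the tasks of any user whose pending prestaging exceeds the limit and keep that user out of the release pass, then release whatever remains in the previously-throttled map. A condensed sketch with hypothetical plain-dict inputs:

def split_throttle_release(disp_user_tasks, throttled_user_tasks, limit):
    # returns (to_throttle, to_release) task ID lists, mirroring the flow above
    to_throttle = []
    remaining = dict(throttled_user_tasks)  # copy so the caller's map is untouched
    for user, info in disp_user_tasks.items():
        if info['size'] > limit:
            already = remaining.get(user, [])
            to_throttle += [t for t in info['tasks'] if t not in already]
            remaining.pop(user, None)  # keep this user throttled
    to_release = [t for tasks in remaining.values() for t in tasks]
    return to_throttle, to_release

# example: user 'A' exceeds the limit, user 'B' was throttled earlier but is now under it
disp = {'A': {'size': 300, 'tasks': [101, 102]}, 'B': {'size': 10, 'tasks': [201]}}
thr = {'A': [101], 'B': [201]}
print(split_throttle_release(disp, thr, 100))   # -> ([102], [201])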
Example #40
0
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(taskSpec.jediTaskID,
                                                                datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get primary site candidates 
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName,tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites        
     for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['',None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP    
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal 
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff','test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip site=%s due to status=%s criteria=-status' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
              if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                      (useMP == 'unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                  newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(len(scanSiteList),useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=tmpCmtConfig,
                                                                      onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=taskSpec.architecture,
                                                                      onlyCmtConfig=True)
         else:    
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
             transHome = re.sub('_','-',transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          caches=transHome,
                                                                          cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases=transUses,
                                                                          cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(len(scanSiteList),
                                                                        taskSpec.transUses,
                                                                        taskSpec.transHome,
                                                                        taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0,None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if not site_maxmemory in [0,None] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug('  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'.format(tmpSiteName,
                                                                                                       site_maxmemory,
                                                                                                       minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0,None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if not site_minmemory in [0,None] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug('  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'.format(tmpSiteName,
                                                                                                      site_minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize  = inputChunk.getMaxAtomSize()
     tmpEffAtomSize  = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize  = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize*tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug('maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'.format(tmpMaxAtomSize,
                                                                                                   tmpEffAtomSize,
                                                                                                   tmpOutDiskSize,
                                                                                                   tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(minDiskCountS,
                                                                          minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug('  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'.format(tmpSiteName,tmpSpaceSize,
                                                                                         diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug('  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'.format(tmpSiteName,tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
             tmpLog.debug('  skip site={0} excluded criteria=-excluded'.format(tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug('  skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug('  skip site={0} limited access criteria=-limitedaccess'.format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]: 
             tmpLog.debug('  skip site={0} cloud mismatch criteria=-cloudmismatch'.format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug('  skip site={0} not included criteria=-notincluded'.format(tmpSiteName))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF,datasetName)
                 if tmpSt in [Interaction.JEDITemporaryError,Interaction.JEDITimeoutError]: 
                      tmpLog.error('temporarily failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available    
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite,includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpDiskSiteList,self.taskBufferIF,
                                                                        self.siteMapper,nSites=50,
                                                                        protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
             for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0,None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(tmpWeightSrcMap['weight'])/wRemote
                 else:
                     dataWeight[tmpSiteName] += float(tmpWeightSrcMap['weight'])/wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(datasetName,len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
          tmpLog.error('failed to get sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(taskSpec.jediTaskID,timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'starting', None,None)
         nFailed    = 0
         nClosed    = 0
         nFinished  = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed+nClosed > 2*nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
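          # ratio of running to queued jobs: sites with many running jobs relative to
          # defined/activated/throttled/starting jobs get a larger weight;
          # the +1 offsets avoid division by zero for idle sites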
         weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
             weight /= float(nThrottled + 1)
          # normalize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr  = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(tmpSiteName,
                                                                              nRunning,        
                                                                              nAssigned,       
                                                                              nActivated,      
                                                                              nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(nFailed,
                                                                                          nClosed,
                                                                                          nFinished,
                                                                                          nThrottled,
                                                                                          tmpDataWeight,
                                                                                          weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(candidateSpecList,
                                                                   problematicSites,
                                                                   sitesUsedByTask,
                                                                   preSiteCandidateSpec,
                                                                   maxNumSites,
                                                                   timeWindowForFC,
                                                                   tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: 
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}     
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[2],
                                                         checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                  raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug('  skip site={0} non pre-assigned site criteria=-nonpreassigned'.format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []: 
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource   = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                      tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(tmpMsg.format(tmpDatasetName,
                                                tmpSiteName,
                                                len(tmpDatasetSpec.Files),
                                                len(availableFiles[tmpSiteName]['localdisk']),
                                                len(availableFiles[tmpSiteName]['cache']),
                                                len(availableFiles[tmpSiteName]['localtape']),
                                                ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug('  skip site={0} file unavailable criteria=-fileunavailable'.format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
          tmpLog.debug('  use site={0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5} criteria=+use'.format(siteCandidateSpec.siteName,
                                                                                                              siteCandidateSpec.weight,
                                                                                                              len(siteCandidateSpec.localDiskFiles),
                                                                                                              len(siteCandidateSpec.localTapeFiles),
                                                                                                              len(siteCandidateSpec.cacheFiles),
                                                                                                              len(siteCandidateSpec.remoteFiles),
                                                                                                              ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
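Candidates with identical weights are shuffled before being appended, so ties are broken at random. A small sketch of that ordering (order_candidates is a hypothetical helper):

import random

def order_candidates(weight_map):
    # weight_map: {weight: [site candidates]} -> flat list, heaviest first, ties shuffled
    ordered = []
    for weight in sorted(weight_map.keys(), reverse=True):
        bucket = list(weight_map[weight])
        random.shuffle(bucket)
        ordered += bucket
    return ordered

print(order_candidates({1.0: ['SITE_A', 'SITE_B'], 2.5: ['SITE_C']}))  # SITE_C first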
Example #41
0
 def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
     # variables for submission
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doBrokerage')
     # return for failure
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     tmpLog.debug('vo={0} label={1} queue={2}'.format(
         vo, prodSourceLabel, workQueue.queue_name))
     # loop over all tasks
     allRwMap = {}
     prioMap = {}
     tt2Map = {}
     expRWs = {}
     jobSpecList = []
     for tmpJediTaskID, tmpInputList in inputList:
         for taskSpec, cloudName, inputChunk in tmpInputList:
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel = 'managed'
             jobSpec.processingType = taskSpec.processingType
             jobSpec.workingGroup = taskSpec.workingGroup
             jobSpec.metadata = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority = taskSpec.currentPriority
             jobSpec.maxDiskCount = (
                 taskSpec.getOutDiskSize() +
                 taskSpec.getWorkDiskSize()) / 1024 / 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(
                         datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if setProdDBlock == False and prodDBlock != None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID] = jobSpec.processingType
             # get RW for a priority
             if not allRwMap.has_key(jobSpec.currentPriority):
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                     vo, prodSourceLabel, workQueue,
                     jobSpec.currentPriority)
                 if tmpRW == None:
                     tmpLog.error(
                         'failed to calculate RW with prio={0}'.format(
                             jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(
                 jobSpec.jediTaskID)
             if expRW == None:
                 tmpLog.error(
                     'failed to calculate RW for jediTaskID={0}'.format(
                         jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # get fullRWs
     fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
         vo, prodSourceLabel, None, None)
     if fullRWs == None:
         tmpLog.error('failed to calculate full RW')
         return retTmpError
     # set metadata
     for jobSpec in jobSpecList:
         rwValues = allRwMap[jobSpec.currentPriority]
         jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
             jobSpec.metadata, str(rwValues), str(expRWs), str(prioMap),
             str(fullRWs), str(tt2Map))
     tmpLog.debug('run task assigner for {0} tasks'.format(
         len(jobSpecList)))
     nBunchTask = 0
     while nBunchTask < len(jobSpecList):
         # get a bunch
         jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
         strIDs = 'jediTaskID='
         for tmpJobSpec in jobsBunch:
             strIDs += '{0},'.format(tmpJobSpec.taskID)
         strIDs = strIDs[:-1]
         tmpLog.debug(strIDs)
         # increment index
         nBunchTask += maxBunchTask
          # run task brokerage
         stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
         tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
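The submission loop walks jobSpecList in slices of at most maxBunchTask entries; the same chunking can be written as a generator (chunks is a hypothetical helper, not in the code above):

def chunks(items, size):
    # yield consecutive slices of at most `size` elements
    for start in range(0, len(items), size):
        yield items[start:start + size]

# 250 tasks in bunches of 100 -> bunch sizes 100, 100, 50
for bunch in chunks(list(range(250)), 100):
    print(len(bunch))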
Example #42
0
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID,dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    datasetsIdxConsistency = []

                    # get task
                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                    if not tmpStat or taskSpec == None:
                        self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                        continue

                    # make logger
                    try:
                        gshare = '_'.join(taskSpec.gshare.split(' '))
                    except:
                        gshare = 'Undefined'
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))

                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = taskSpec.getNumFilesPerJob()
                    # the number of chunks used by scout 
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                                  ['output','log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                            # index consistency
                            if datasetSpec.indexConsistent():
                                datasetsIdxConsistency.append(datasetSpec.datasetID)
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()                    
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state':'closed'}
                                 # set mutable when the dataset is open and the parent is running, or the task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state':'mutable'}
                                gotMetadata = True
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                 tmpLog.error('{0} failed to get metadata with {1}:{2}'.format(self.__class__.__name__,
                                                                                            errtype.__name__,errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status    
                                    self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing 
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                         getNumEvents=getNumEvents,
                                                                         skipDuplicate=skipDuplicate,
                                                                         longFormat=longFormat
                                                                         )
                                        tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                            for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskSpec.getNumFilesPerJob() is not None:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy 
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                          {'datasetName':datasetSpec.datasetName},
                                                                                                                          ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                except:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                                 errtype.__name__,errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status    
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile  = None
                                    nEventsPerJob   = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts 
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry 
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None 
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                             # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False    
                                    if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers    
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0
                                    # skip short input
                                    if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                            and nEventsPerFile is not None and nEventsPerJob is not None \
                                            and nEventsPerFile >= nEventsPerJob \
                                            and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                        skipShortInput = True
                                    else:
                                        skipShortInput = False
                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                              tmpMetadata['state'],
                                                                                                                              stateUpdateTime,
                                                                                                                              nEventsPerFile,
                                                                                                                              nEventsPerJob,
                                                                                                                              maxAttempt,
                                                                                                                              firstEventNumber,
                                                                                                                              nMaxFiles,
                                                                                                                              nMaxEvents,
                                                                                                                              useScout,
                                                                                                                              fileList,
                                                                                                                              useFilesWithNewAttemptNr,
                                                                                                                              nFilesPerJob,
                                                                                                                              nEventsPerRange,
                                                                                                                              nChunksForScout,
                                                                                                                              includePatt,
                                                                                                                              excludePatt,
                                                                                                                              xmlConfig,
                                                                                                                              noWaitParent,
                                                                                                                              taskSpec.parent_tid,
                                                                                                                              self.pid,
                                                                                                                              maxFailure,
                                                                                                                              useRealNumEvents,
                                                                                                                              respectLB,
                                                                                                                              tgtNumEventsPerJob,
                                                                                                                              skipFilesUsedBy,
                                                                                                                              ramCount,
                                                                                                                              taskSpec,
                                                                                                                              skipShortInput)
                                    if retDB == False:
                                        taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                         diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                         # the dataset is locked by another process or its status is not applicable
                                        allUpdated = False
                                        tmpLog.debug('escape since task or dataset is locked')
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                               'missingFiles':missingFileList} 
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                     # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr,None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # index consistency
                    if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                        self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency)
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken','tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid,
                                                                                                       useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
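The seq_number branch above does not query DDM at all; it simply fabricates one placeholder record per sequence number and feeds them to the contents table like real files. A minimal standalone sketch of that idea (the function name and the driver line are illustrative, not part of JEDI):

import uuid

def make_seq_number_records(n_records, offset=0):
    # one dummy entry per sequence number, keyed by a random GUID,
    # mirroring the dict structure passed to insertFilesForDataset_JEDI
    records = {}
    for i in range(n_records):
        records[str(uuid.uuid4())] = {'lfn': offset + i + 1,
                                      'scope': None,
                                      'filesize': 0,
                                      'checksum': None}
    return records

# e.g. five placeholder records starting after an offset of 100
print(len(make_seq_number_records(5, offset=100)))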
Example #43
0
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['', None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' %
                      (cloudName, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for memory
     minRamCount = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug(
                     '  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxmemory,
                             minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug(
                     '  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.minmemory,
                             minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel, taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
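The weight used in the final procedure above rewards sites with many running jobs and penalizes backlogs of assigned and activated jobs, and only the top maxNumSites entries by weight survive. A rough self-contained sketch of that ranking (the helper name and the sample site names are made up here):

import random

def rank_sites(job_counts, max_num_sites=5):
    # job_counts: {site: (nRunning, nAssigned, nActivated)}
    weight_map = {}
    for site, (n_run, n_ass, n_act) in job_counts.items():
        weight = float(n_run + 1) / float(n_act + n_ass + 1) / float(n_ass + 1)
        weight_map.setdefault(weight, []).append(site)
    chosen = []
    for weight in sorted(weight_map, reverse=True):
        if len(chosen) >= max_num_sites:
            break
        sites = weight_map[weight]
        random.shuffle(sites)  # break ties randomly, as the broker does
        chosen += sites[:max_num_sites - len(chosen)]
    return chosen

print(rank_sites({'SITE_A': (80, 5, 10), 'SITE_B': (10, 50, 40), 'SITE_C': (0, 0, 0)}))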
Example #44
0
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get sites in the cloud
     if not taskSpec.site in ['',None]:
         scanSiteList = [taskSpec.site]
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
     elif inputChunk.getPreassignedSite() != None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
     else:
         scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
         tmpLog.debug('cloud=%s has %s candidates' % (cloudName,len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status != 'online':
             skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for memory
     minRamCount  = max(taskSpec.ramCount, inputChunk.ramCount)
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug('  skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                       diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 taskSpec.currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites-len(candidateSpecList))]
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1}'.format(siteCandidateSpec.siteName,
                                                         siteCandidateSpec.weight))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
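The scratch-disk cut in both versions of the function compares a per-job requirement, converted from bytes to MB, against the site's maxwdir limit; direct-IO sites are excused from reserving space for the input unless the task forces local IO. A hedged sketch of that arithmetic (the helper is invented for illustration; the real values come from taskSpec and inputChunk):

def scratch_disk_mb(out_disk, work_disk, max_atom_size, direct_io):
    # all inputs in bytes, as returned by the taskSpec/inputChunk getters
    required = out_disk + work_disk
    if not direct_io:
        # copy-to-scratch sites also need room for the largest input chunk
        required += max_atom_size
    return required / 1024 / 1024  # MB, the unit of tmpSiteSpec.maxwdir

# e.g. 2 GB of outputs, 1 GB work dir, 5 GB largest input, no direct IO
print(scratch_disk_mb(2 * 1024**3, 1 * 1024**3, 5 * 1024**3, direct_io=False))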
Example #45
0
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                         monToken='<jediTaskID={0} {1}>'.format(
                             taskSpec.jediTaskID,
                             datetime.datetime.utcnow().isoformat('/')))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # get primary site candidates
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(
         None, taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName, tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
             # str to list for task retry
             try:
                 if type(excludeList) != types.ListType:
                     excludeList = excludeList.split(',')
             except:
                 pass
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
             # str to list for task retry
             if includeList == '':
                 includeList = None
             try:
                 if type(includeList) != types.ListType:
                     includeList = includeList.split(',')
             except:
                 pass
     # loop over all sites
     for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['', None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     # MP
     if taskSpec.coreCount != None and taskSpec.coreCount > 1:
         # use MCORE only
         useMP = 'only'
     elif taskSpec.coreCount == 0:
         # use MCORE and normal
         useMP = 'any'
     else:
         # not use MCORE
         useMP = 'unuse'
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff', 'test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug(
                 '  skip site=%s due to status=%s criteria=-status' %
                 (tmpSiteName, tmpSiteSpec.status))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for MP
     if not sitePreAssigned:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                     (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                 newScanSiteList.append(tmpSiteName)
             else:
                 tmpLog.debug('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                              (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for useMP={1}'.format(
             len(scanSiteList), useMP))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for release
     if taskSpec.transHome != None:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True)
         elif 'AthAnalysis' in taskSpec.transHome or re.search(
                 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None:
             # AthAnalysis
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                 scanSiteList,
                 cmtConfig=taskSpec.architecture,
                 onlyCmtConfig=True)
         else:
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
             transHome = re.sub('_', '-', transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                     re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None :
                 # cache is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     caches=transHome,
                     cmtConfig=taskSpec.architecture)
             elif transHome == '' and taskSpec.transUses != None:
                 # remove Atlas-
                 transUses = taskSpec.transUses.split('-')[-1]
                 # release is checked
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList,
                     releases=transUses,
                     cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                     scanSiteList, releases='CVMFS')
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                              (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format(
             len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
             taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for memory
     minRamCount = inputChunk.getMaxRamCount()
     minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
     if not minRamCount in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # site max memory requirement
             if not tmpSiteSpec.maxrss in [0, None]:
                 site_maxmemory = tmpSiteSpec.maxrss
             else:
                 site_maxmemory = tmpSiteSpec.maxmemory
             if not site_maxmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount > site_maxmemory:
                 tmpLog.debug(
                     '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                     .format(tmpSiteName, site_maxmemory, minRamCount))
                 continue
             # site min memory requirement
             if not tmpSiteSpec.minrss in [0, None]:
                 site_minmemory = tmpSiteSpec.minrss
             else:
                 site_minmemory = tmpSiteSpec.minmemory
             if not site_minmemory in [
                     0, None
             ] and minRamCount != 0 and minRamCount < site_minmemory:
                 tmpLog.debug(
                     '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                     .format(tmpSiteName, site_minmemory, minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
             len(scanSiteList), minRamCount, taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for scratch disk
     tmpMaxAtomSize = inputChunk.getMaxAtomSize()
     tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
     tmpOutDiskSize = taskSpec.getOutDiskSize()
     tmpWorkDiskSize = taskSpec.getWorkDiskSize()
     minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
         minDiskCountR = minDiskCountR / 1024 / 1024
     tmpLog.debug(
         'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
         .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                 tmpWorkDiskSize))
     tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
         minDiskCountS, minDiskCountR))
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                     .format(tmpSiteName, tmpSiteSpec.maxwdir,
                             minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check endpoint
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint(
             tmpSiteSpec.ddm)
         if tmpEndPoint is not None:
             # free space must be >= 200GB
             diskThreshold = 200
             tmpSpaceSize = 0
             if tmpEndPoint['space_expired'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_expired']
             if tmpEndPoint['space_free'] is not None:
                 tmpSpaceSize += tmpEndPoint['space_free']
             if tmpSpaceSize < diskThreshold:
                 tmpLog.debug(
                     '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                     .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                 continue
             # check if blacklisted
             if tmpEndPoint['blacklisted'] == 'Y':
                 tmpLog.debug(
                     '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                     .format(tmpSiteName, tmpSiteSpec.ddm))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0, None] and minWalltime > 0:
         minWalltime *= tmpEffAtomSize
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug(
                 '  skip site=%s due to no pilot criteria=-nopilot' %
                 tmpSiteName)
             if not self.testMode:
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
             tmpLog.debug(
                 '  skip site={0} excluded criteria=-excluded'.format(
                     tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(
                 tmpSiteName, includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug(
                     '  skip site={0} not included criteria=-notincluded'.
                     format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug(
                     '  skip site={0} limited access criteria=-limitedaccess'
                     .format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
             tmpLog.debug(
                 '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                 format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug(
                 '  skip site={0} not included criteria=-notincluded'.
                 format(tmpSiteName))
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # selection for data availability
     hasDDS = False
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:
         oldScanSiteList = copy.copy(scanSiteList)
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug(
                     'getting the list of sites where {0} is available'.
                     format(datasetName))
                 tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                     scanSiteList, self.siteMapper, self.ddmIF, datasetName)
                 if tmpSt in [
                         Interaction.JEDITemporaryError,
                         Interaction.JEDITimeoutError
                 ]:
                     tmpLog.error(
                         'temporary failed to get the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retTmpError
                 if tmpSt == Interaction.JEDIFatalError:
                     tmpLog.error(
                         'fatal error when getting the list of sites where data is available, since %s'
                         % tmpRet)
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     # send info to logger
                     self.sendLogMessage(tmpLog)
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(
                         len(tmpRet), str(tmpRet)))
                     # check if distributed
                     if tmpRet != {}:
                         isDistributed = True
                         for tmpMap in tmpRet.values():
                             for tmpVal in tmpMap.values():
                                 if tmpVal['state'] == 'complete':
                                     isDistributed = False
                                     break
                             if not isDistributed:
                                 break
                         if isDistributed:
                             # check if really distributed
                             isDistributed = self.ddmIF.isDistributedDataset(
                                 datasetName)
                             if isDistributed:
                                 hasDDS = True
                                 datasetSpec.setDistributed()
                                 tmpLog.debug(' {0} is distributed'.format(
                                     datasetName))
             # check if the data is available at somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error(
                     '{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 # send info to logger
                 self.sendLogMessage(tmpLog)
                 return retFatal
         # get the list of sites where data is available
         scanSiteList = None
         scanSiteListOnDisk = None
         normFactor = 0
         for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=True)
             tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                 tmpDataSite, includeTape=False)
             # get sites which can remotely access source sites
             if inputChunk.isMerging:
                 # disable remote access for merging
                 tmpSatelliteSites = {}
             elif (not sitePreAssigned) or (
                     sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                     tmpDiskSiteList,
                     self.taskBufferIF,
                     self.siteMapper,
                     nSites=50,
                     protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 0
                 # give more weight to disk
                 if tmpSiteName in tmpDiskSiteList:
                     dataWeight[tmpSiteName] += 1
                 else:
                     dataWeight[tmpSiteName] += 0.001
             # make weight map for remote
             for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems(
             ):
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                 # negative weight for remote access
                 wRemote = 50.0
                 if not tmpSiteSpec.wansinklimit in [0, None]:
                     wRemote /= float(tmpSiteSpec.wansinklimit)
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = float(
                         tmpWeightSrcMap['weight']) / wRemote
                 else:
                     dataWeight[tmpSiteName] += float(
                         tmpWeightSrcMap['weight']) / wRemote
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][
                     datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 scanSiteListOnDisk = set()
                 for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(
                 ):
                     if not tmpSiteName in oldScanSiteList:
                         continue
                     scanSiteListOnDisk.add(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(
                 datasetName, len(scanSiteList)))
             # pickup sites which have all data on DISK
             newScanListOnDisk = set()
             for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteListOnDisk:
                     newScanListOnDisk.add(tmpSiteName)
             scanSiteListOnDisk = newScanListOnDisk
             tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                 datasetName, len(scanSiteListOnDisk)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug(
                 'data is unavailable locally or remotely at preassigned site {0}'
                 .format(taskSpec.site))
         elif len(scanSiteListOnDisk) > 0:
             # use only disk sites
             scanSiteList = list(scanSiteListOnDisk)
         tmpLog.debug('{0} candidates have input data'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retFatal
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     """
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     currentPriority -= 500
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     """
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
         taskSpec.vo, taskSpec.prodSourceLabel)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} did not pass all tests".format(
             taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     timeWindowForFC = 6
     preSiteCandidateSpec = None
     failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
         taskSpec.jediTaskID, timeWindowForFC)
     problematicSites = set()
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                      AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
         nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'starting',
                                                 None, None)
         nFailed = 0
         nClosed = 0
         nFinished = 0
         if tmpSiteName in failureCounts:
             if 'failed' in failureCounts[tmpSiteName]:
                 nFailed = failureCounts[tmpSiteName]['failed']
             if 'closed' in failureCounts[tmpSiteName]:
                 nClosed = failureCounts[tmpSiteName]['closed']
             if 'finished' in failureCounts[tmpSiteName]:
                 nFinished = failureCounts[tmpSiteName]['finished']
         # problematic sites
         if nFailed + nClosed > 2 * nFinished:
             problematicSites.add(tmpSiteName)
         # calculate weight
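         # the weight grows with the running-to-queued ratio: sites that drain
         # their activated/defined/starting backlog are preferred, and the +1
         # terms keep the ratio defined for empty sites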
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              nStarting + 1)
         nThrottled = 0
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(
                 jobStatPrioMap, tmpSiteName, 'throttled', None, None)
             weight /= float(nThrottled + 1)
         # normalize weights by taking data availability into account
         tmpDataWeight = 1
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
             tmpDataWeight = dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
             tmpSiteName, nRunning, nAssigned, nActivated, nStarting)
         tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
             nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
         tmpLog.debug(tmpStr)
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # sort candidates by weights
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight
     # limit the number of sites. use all sites for distributed datasets
     if not hasDDS:
         maxNumSites = 10
         # remove problematic sites
         candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
             candidateSpecList, problematicSites, sitesUsedByTask,
             preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(
                         tmpSiteName
                 ) and remoteSourceList[tmpSiteName].has_key(
                         datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][
                             datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
                 fileScanSiteList, self.siteMapper)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 ngGroup=[2],
                 checkCompleteness=checkCompleteness)
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError('ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' %
                          (errtype.__name__, errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             # send info to logger
             self.sendLogMessage(tmpLog)
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug(
                 '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                 .format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []:
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName, availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[
                     tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][
                         tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[
                             tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                         (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles += availableFiles[
                         tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles += availableFiles[
                         tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[
                         tmpSiteName]['remote']
                     siteCandidateSpec.addAvailableFiles(
                         availableFiles[tmpSiteName]['all'])
                     isAvailable = True
                 else:
                     tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                     tmpLog.debug(
                         tmpMsg.format(
                             tmpDatasetName,
                             tmpSiteName,
                             len(tmpDatasetSpec.Files),
                             len(availableFiles[tmpSiteName]['localdisk']),
                             len(availableFiles[tmpSiteName]['cache']),
                             len(availableFiles[tmpSiteName]['localtape']),
                         ))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug(
                 '  skip site={0} file unavailable criteria=-fileunavailable'
                 .format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug(
             '  use site={0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5} criteria=+use'
             .format(
                 siteCandidateSpec.siteName,
                 siteCandidateSpec.weight,
                 len(siteCandidateSpec.localDiskFiles),
                 len(siteCandidateSpec.localTapeFiles),
                 len(siteCandidateSpec.cacheFiles),
                 len(siteCandidateSpec.remoteFiles),
             ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         # send info to logger
         self.sendLogMessage(tmpLog)
         return retTmpError
     # send info to logger
     self.sendLogMessage(tmpLog)
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
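The ranking above boils down to a running-to-queued ratio per site, optionally scaled by a data-availability factor, with ties shuffled before the candidates are appended in descending order of weight. Below is a minimal standalone sketch of that arithmetic; the site names, job counts and data-weight values are invented for illustration, and none of the JEDI machinery (SiteCandidate, the task buffer, the site mapper) is needed to run it.

import random

# invented job counts per site (not taken from any real brokerage cycle)
job_stats = {
    'SITE_A': {'running': 80, 'defined': 5, 'activated': 20, 'starting': 5},
    'SITE_B': {'running': 10, 'defined': 2, 'activated': 40, 'starting': 8},
    'SITE_C': {'running': 50, 'defined': 0, 'activated': 10, 'starting': 0},
}
# invented data-availability factors (disk replica ~1, tape-only ~0.001)
data_weight = {'SITE_A': 1.0, 'SITE_B': 1.001, 'SITE_C': 0.001}

weight_map = {}
for site, stat in job_stats.items():
    # same formula as above: favour sites that run more than they queue
    weight = float(stat['running'] + 1) / float(stat['activated'] + stat['defined'] + stat['starting'] + 1)
    # scale by data availability, as done when dataWeight has an entry for the site
    weight *= data_weight.get(site, 1.0)
    weight_map.setdefault(weight, []).append(site)

# highest weight first; sites sharing a weight are shuffled to spread the load
ranked = []
for w in sorted(weight_map, reverse=True):
    sites = weight_map[w]
    random.shuffle(sites)
    ranked += sites
print(ranked)

With these made-up numbers the ordering comes out SITE_A, SITE_B, SITE_C: the best running-to-queued ratio wins unless the data is effectively tape-only, as for SITE_C.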
Example #46
 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
     # component name
     compName = 'prod_job_throttler'
     # params
     nBunch = 4
     threshold = 2.0
     thresholdForSite = threshold - 1.0
     nJobsInBunchMax = 600
     nJobsInBunchMin = 500
     nJobsInBunchMaxES = 1000
     if workQueue.criteria != None and 'site' in workQueue.criteria:
         minTotalWalltime = 10*1000*1000
     else:
         minTotalWalltime = 50*1000*1000
     nWaitingLimit = 4
     nWaitingBunchLimit = 2
     nParallel = 2
     # make logger
     tmpLog = MsgWrapper(logger)
     workQueueIDs = workQueue.getIDs()
     msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
     tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
     # change threshold
     if workQueue.queue_name in ['mcore']:
         threshold = 5.0
     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         msgBody = "SKIP cloud={0} undefined".format(cloudName)
         tmpLog.warning(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
         tmpLog.warning(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     if cloudSpec['status'] in ['test']:
         if workQueue.queue_name != 'test':
             msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                               workQueue.queue_name)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             tmpLog.warning(msgHeader+" "+msgBody)
             return self.retThrottled
     # check if unthrottled
     if workQueue.queue_share == None:
         msgBody = "PASS unthrottled since share=None"
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retUnThrottled
     # count number of jobs in each status
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     for workQueueID in workQueueIDs:
         if jobStat.has_key(cloudName) and \
                jobStat[cloudName].has_key(workQueueID):
             tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
             for pState,pNumber in jobStat[cloudName][workQueueID].iteritems():
                 if pState in ['running']:
                     nRunning += pNumber
                 elif pState in ['assigned','activated','starting']:
                     nNotRun  += pNumber
                 elif pState in ['defined']:
                     nDefine  += pNumber
                 elif pState in ['waiting']:
                     nWaiting += pNumber
     # check if higher prio tasks are waiting
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks 
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
     if highestPrioWaiting == None:
         msgBody = 'failed to get the highest priority of waiting tasks'
         tmpLog.error(msgHeader+" "+msgBody)
         return self.retTmpError
     # high priority tasks are waiting
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                      nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                              highestPrioInPandaDB,
                                                                                                              nNotRunHighestPrio,
                                                                                                              highPrioQueued))
     # set maximum number of jobs to be submitted
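     # remaining slot: how many more jobs can be queued before nQueued
     # exceeds threshold x nRunning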
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     else:
         if workQueue.queue_name in ['evgensimul']:
             # use higher limit for evgensimul
             if tmpRemainingSlot < nJobsInBunchMaxES:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMaxES
         else:
             if tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     nQueueLimit = nJobsInBunch*nBunch
     # use special nQueueLimit
     tmpVal = self.taskBufferIF.getConfigValue(compName, 'NQUEUELIMIT_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     if tmpVal is not None:
         nQueueLimit = tmpVal
     # use nPrestage for reprocessing   
     if workQueue.queue_name in ['reprocessing','mcore_repro']:
         # reset nJobsInBunch
         if nQueueLimit > (nNotRun+nDefine):
             tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
             if tmpRemainingSlot < nJobsInBunch:
                 pass
             elif tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     # get cap
     nRunningCap = self.taskBufferIF.getConfigValue(compName, 'NRUNNINGCAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     nQueueCap = self.taskBufferIF.getConfigValue(compName, 'NQUEUECAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     # set number of jobs to be submitted
     self.setMaxNumJobs(nJobsInBunch/nParallel)
     # get total walltime
     totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo,prodSourceLabel,workQueue,cloudName)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     limitPriority = False
     tmpStr = msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3} totWalltime:{4} nRunCap:{5} nQueueCap:{6}"
     tmpLog.debug(tmpStr.format(nQueueLimit,
                                nNotRun+nDefine,
                                nDefine,
                                nRunning,
                                totWalltime,
                                nRunningCap,
                                nQueueCap))
     # check
     if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit and (totWalltime == None or totWalltime > minTotalWalltime):
         limitPriority = True
         if not highPrioQueued:
             # pilot is not running or DDM has a problem
             msgBody = "SKIP no running and enough nQueued({0})>{1} totWalltime({2})>{3} ".format(nNotRun+nDefine,nQueueLimit,
                                                                                                  totWalltime,minTotalWalltime)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nRunning != 0 and float(nNotRun+nDefine)/float(nRunning) > threshold and \
             (nNotRun+nDefine) > nQueueLimit and (totWalltime == None or totWalltime > minTotalWalltime):
         limitPriority = True
         if not highPrioQueued:
             # enough jobs in Panda
             msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued({3})>{4} totWalltime({5})>{6}".format(nNotRun+nDefine,nRunning,
                                                                                                            threshold,nNotRun+nDefine,
                                                                                                            nQueueLimit,
                                                                                                            totWalltime,minTotalWalltime)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nDefine > nQueueLimit:
         limitPriority = True
         if not highPrioQueued:
             # brokerage is stuck
             msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
         limitPriority = True
         if not highPrioQueued:
             # too many waiting
             msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                           nJobsInBunch,nWaitingBunchLimit)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nRunningCap is not None and nRunning > nRunningCap:
         limitPriority = True
         if not highPrioQueued:
             # cap on running
             msgBody = "SKIP nRunning({0})>nRunningCap({1})".format(nRunning,nRunningCap)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nQueueCap is not None and nNotRun+nDefine > nQueueCap:
         limitPriority = True
         if not highPrioQueued:
             # cap on queued
             msgBody = "SKIP nQueue({0})>nQueueCap({1})".format(nNotRun+nDefine,nQueueCap)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     # get jobs from prodDB
     limitPriorityValue = None
     if limitPriority:
         limitPriorityValue = highestPrioWaiting
         self.setMinPriority(limitPriorityValue)
     else:
         # not enough jobs are queued
         if nNotRun+nDefine < max(nQueueLimit,nRunning) or (totWalltime != None and totWalltime < minTotalWalltime):
             tmpLog.debug(msgHeader+" not enough jobs queued")
             self.notEnoughJobsQueued()
             self.setMaxNumJobs(max(self.maxNumJobs,nQueueLimit/20))
     msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
     tmpLog.debug(msgHeader+" "+msgBody)
     return self.retUnThrottled
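All of the skip branches above derive from a handful of counters. The standalone sketch below reproduces the core arithmetic with invented job counts and the constants defined at the top of toBeThrottled; the evgensimul, mcore, walltime and cap special cases are left out for brevity, and the two-branch bunch-size selection is collapsed into a min/max expression (as the newer variant in the next example also does).

# invented job counts for one cloud/work queue
nRunning, nNotRun, nDefine, nWaiting = 1200, 1800, 150, 100
# constants as defined at the top of toBeThrottled
threshold = 2.0
nBunch = 4
nJobsInBunchMin, nJobsInBunchMax = 500, 600
nWaitingLimit, nWaitingBunchLimit = 4, 2

# room left before nQueued exceeds threshold x nRunning
tmpRemainingSlot = int(nRunning * threshold - nNotRun)
nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)
nQueueLimit = nJobsInBunch * nBunch

nQueued = nNotRun + nDefine
if nRunning == 0 and nQueued > nQueueLimit:
    print('SKIP: nothing is running and the queue is already full')
elif nRunning != 0 and float(nQueued) / nRunning > threshold and nQueued > nQueueLimit:
    print('SKIP: queued/running ratio above threshold')
elif nDefine > nQueueLimit:
    print('SKIP: brokerage is stuck, too many defined jobs')
elif nWaiting > nRunning * nWaitingLimit and nWaiting > nJobsInBunch * nWaitingBunchLimit:
    print('SKIP: too many waiting jobs')
else:
    print('PASS: may generate up to {0} more jobs'.format(nJobsInBunch))

With these numbers the ratio is 1950/1200 = 1.625, below the threshold of 2.0, so the queue passes with a bunch of 600 jobs; raising nNotRun to 3000 makes the ratio 2.625 with 3150 jobs queued against a limit of 2000, and the second branch skips the queue for this cycle.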
Example #47
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit +
                                                  nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs,
                   nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt,
                    nRunning_rt, totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and  nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nNotRun_rt + nDefine_rt, nRunning_rt, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(
                    nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                    nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                    nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                    nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader + " not enough jobs queued")
                if not workQueue.queue_name in non_rt_wqs:
                    self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
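(Note: the chain of SKIP checks above boils down to a few integer comparisons. Below is a minimal standalone sketch of that decision using plain integers in place of the JEDI bookkeeping objects; the function name, default values and printed example are illustrative, not part of the JEDI API.)

def should_skip_submission(n_running, n_queued, n_defined, n_waiting,
                           queue_limit, ratio_threshold=2.0,
                           waiting_limit=4, bunch_size=500):
    # mirror the throttle checks above; each condition maps to one SKIP branch
    if n_running == 0 and n_queued > queue_limit:
        return True, 'no running jobs and the queue is already full'
    if n_running > 0 and float(n_queued) / n_running > ratio_threshold \
            and n_queued > queue_limit:
        return True, 'queued/running ratio above threshold'
    if n_defined > queue_limit:
        return True, 'brokerage looks stuck: too many defined jobs'
    if n_waiting > max(n_running * waiting_limit, bunch_size * 2):
        return True, 'too many waiting jobs'
    return False, 'PASS'

# 0 running jobs with 1500 queued against a limit of 1200 -> throttled
print(should_skip_submission(0, 1500, 100, 0, queue_limit=1200))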
Example #48
0
 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
     # params
     nBunch = 4
     threshold = 2.0
     thresholdForSite = threshold - 1.0
     nJobsInBunchMax = 500
     nJobsInBunchMin = 300
     nJobsInBunchMaxES = 1000
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start vo={0} label={1} cloud={2} workQueue={3}'.format(vo,prodSourceLabel,cloudName,
                                                                          workQueue.queue_name))
     workQueueID = workQueue.queue_id
     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         tmpLog.debug("  done : SKIP cloud undefined")
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         tmpLog.debug("  done : SKIP cloud.status={0}".format(cloudSpec['status']))
         return self.retThrottled
     if cloudSpec['status'] in ['test']:
         if workQueue.queue_name != 'test':
             tmpLog.debug("  done : SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                                           workQueue.queue_name))
             return self.retThrottled
     # check if unthrottled
     if workQueue.queue_share == None:
         tmpLog.debug("  done : unthrottled since share=None")
         return self.retUnThrottled
     # count number of jobs in each status
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     if jobStat.has_key(cloudName) and \
            jobStat[cloudName].has_key(workQueueID):
         for pState,pNumber in jobStat[cloudName][workQueueID].iteritems():
             if pState in ['running']:
                 nRunning += pNumber
             elif pState in ['assigned','activated','starting']:
                 nNotRun  += pNumber
             elif pState in ['defined']:
                 nDefine  += pNumber
             elif pState in ['waiting']:
                 nWaiting += pNumber
     # check if higher prio tasks are waiting
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks 
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
     if highestPrioWaiting == None:
         tmpLog.error('failed to get the highest priority of waiting tasks')
         return self.retTmpError
     # high priority tasks are waiting
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                      nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                    highestPrioInPandaDB,
                                                                                                    nNotRunHighestPrio,
                                                                                                    highPrioQueued))
     # set maximum number of jobs to be submitted
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     else:
         if workQueue.queue_name in ['evgensimul']:
             # use higher limit for evgensimul
             if tmpRemainingSlot < nJobsInBunchMaxES:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMaxES
         else:
             if tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     nQueueLimit = nJobsInBunch*nBunch
     # use special limit for CERN
     if cloudName == 'CERN':
         nQueueLimit = 2000
     # use nPrestage for reprocessing   
     if workQueue.queue_name in ['reprocessing']:
         if cloudSpec.has_key('nprestage') and cloudSpec['nprestage'] > 0:
             nQueueLimit = cloudSpec['nprestage']
             # reset nJobsInBunch
             if nQueueLimit > (nNotRun+nDefine):
                 tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
                 if tmpRemainingSlot < nJobsInBunch:
                     pass
                 elif tmpRemainingSlot < nJobsInBunchMax:
                     nJobsInBunch = tmpRemainingSlot
                 else:
                     nJobsInBunch = nJobsInBunchMax
     # set number of jobs to be submitted
     self.setMaxNumJobs(nJobsInBunch)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     limitPriority = False
     # check when high prio tasks are not waiting
     if not highPrioQueued:
         if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit:
             limitPriority = True
             # pilot is not running or DDM has a problem
             tmpLog.debug("  done : SKIP no running and enough nQueued={0}>{1}".format(nNotRun+nDefine,nQueueLimit))
             return self.retThrottled
         elif nRunning != 0 and float(nNotRun)/float(nRunning) > threshold and (nNotRun+nDefine) > nQueueLimit:
             limitPriority = True
             # enough jobs in Panda
             tmpLog.debug("  done : SKIP nQueued/nRunning={0}>{1} & nQueued={2}>{3}".format(float(nNotRun)/float(nRunning),
                                                                                            threshold,nNotRun+nDefine,
                                                                                            nQueueLimit))
             return self.retThrottled
         elif nDefine > nQueueLimit:
             limitPriority = True
             # brokerage is stuck
             tmpLog.debug("  done : SKIP too many nDefin={0}>{1}".format(nDefine,nQueueLimit))
             return self.retThrottled
         elif nWaiting > nRunning*4 and nWaiting > nJobsInBunch*2:
             limitPriority = True
             # too many waiting
             tmpLog.debug("  done : SKIP too many nWaiting={0}>{1}".format(nWaiting,nRunning*4))
             return self.retThrottled
     # get jobs from prodDB
     limitPriorityValue = None
     if limitPriority:
         limitPriorityValue = highestPrioInPandaDB
         self.setMinPriority(limitPriorityValue)
     tmpLog.debug("   done : PASS - priority limit={0}".format(limitPriorityValue))
     return self.retUnThrottled
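(Note: the nJobsInBunch/nQueueLimit arithmetic in the example above can be isolated as follows; compute_bunch_size and its defaults are an illustrative sketch, not part of the JEDI code base.)

def compute_bunch_size(n_running, n_not_run, threshold=2.0,
                       bunch_min=300, bunch_max=500, bunch_max_es=1000,
                       is_evgensimul=False, n_bunch=4):
    # remaining slots before the queued backlog reaches threshold x running
    remaining = int(n_running * threshold - n_not_run)
    if remaining < bunch_min:
        # lower bound avoids creating too many _sub/_dis datasets
        n_jobs_in_bunch = bunch_min
    else:
        upper = bunch_max_es if is_evgensimul else bunch_max
        n_jobs_in_bunch = min(remaining, upper)
    return n_jobs_in_bunch, n_jobs_in_bunch * n_bunch

# 1000 running and 800 queued leave 1200 slots, capped at 500 per bunch
print(compute_bunch_size(1000, 800))   # -> (500, 2000)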
Example #49
0
 def doBrokerage(self,taskSpec,cloudName,inputChunk,taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal    = self.SC_FATAL,inputChunk
     retTmpError = self.SC_FAILED,inputChunk
     # get primary site candidates 
     sitePreAssigned = False
     excludeList = []
     includeList = None
     scanSiteList = []
     # get list of site access
     siteAccessList = self.taskBufferIF.listSiteAccess(None,taskSpec.userName)
     siteAccessMap = {}
     for tmpSiteName,tmpAccess in siteAccessList:
         siteAccessMap[tmpSiteName] = tmpAccess
     # site limitation
     if taskSpec.useLimitedSites():
         if 'excludedSite' in taskParamMap:
             excludeList = taskParamMap['excludedSite']
         if 'includedSite' in taskParamMap:
             includeList = taskParamMap['includedSite']
     # loop over all sites        
     for siteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
         if tmpSiteSpec.type == 'analysis':
             scanSiteList.append(siteName)
     # preassigned
     if not taskSpec.site in ['',None]:
         # site is pre-assigned
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         sitePreAssigned = True
         if not taskSpec.site in scanSiteList:
             scanSiteList.append(taskSpec.site)
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     # allowed remote access protocol
     allowedRemoteProtocol = 'fax'
     ######################################
     # selection for data availability
     dataWeight = {}
     remoteSourceList = {}
     if inputChunk.getDatasets() != []:    
         for datasetSpec in inputChunk.getDatasets():
             datasetName = datasetSpec.datasetName
             if not self.dataSiteMap.has_key(datasetName):
                 # get the list of sites where data is available
                 tmpLog.debug('getting the list of sites where {0} is available'.format(datasetName))
                 tmpSt,tmpRet = AtlasBrokerUtils.getAnalSitesWithData(scanSiteList,
                                                                      self.siteMapper,
                                                                      self.ddmIF,datasetName)
                 if tmpSt == self.SC_FAILED:
                     tmpLog.error('failed to get the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retTmpError
                 if tmpSt == self.SC_FATAL:
                     tmpLog.error('fatal error when getting the list of sites where data is available, since %s' % tmpRet)
                     taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                     return retFatal
                 # append
                 self.dataSiteMap[datasetName] = tmpRet
                 if datasetName.startswith('ddo'):
                     tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                 else:
                     tmpLog.debug(' {0} sites : {1}'.format(len(tmpRet),str(tmpRet)))
             # check if the data is available somewhere
             if self.dataSiteMap[datasetName] == {}:
                 tmpLog.error('{0} is unavailable at any site'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
             # check if the data is available on disk
             if AtlasBrokerUtils.getAnalSitesWithDataDisk(self.dataSiteMap[datasetName]) == []:
                 tmpLog.error('{0} is available only on tape'.format(datasetName))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retFatal
         # get the list of sites where data is available    
         scanSiteList = None     
         normFactor = 0
         for datasetName,tmpDataSite in self.dataSiteMap.iteritems():
             normFactor += 1
             # get sites where disk replica is available
             tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(tmpDataSite)
             # get sites which can remotely access source sites
             if (not sitePreAssigned) or (sitePreAssigned and not taskSpec.site in tmpSiteList):
                 tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(tmpSiteList,self.taskBufferIF,
                                                                        self.siteMapper,nSites=50,
                                                                        protocol=allowedRemoteProtocol)
             else:
                 tmpSatelliteSites = {}
             # make weight map for local
             for tmpSiteName in tmpSiteList:
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = 1
                 else:
                     dataWeight[tmpSiteName] += 1
             # make weight map for remote
             for tmpSiteName,tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                 # skip since local data is available
                 if tmpSiteName in tmpSiteList:
                     continue
                 # sum weight
                 if not dataWeight.has_key(tmpSiteName):
                     dataWeight[tmpSiteName] = tmpWeightSrcMap['weight']
                 else:
                     dataWeight[tmpSiteName] += tmpWeightSrcMap['weight']
                 # make remote source list
                 if not remoteSourceList.has_key(tmpSiteName):
                     remoteSourceList[tmpSiteName] = {}
                 remoteSourceList[tmpSiteName][datasetName] = tmpWeightSrcMap['source']
             # first list
             if scanSiteList == None:
                 scanSiteList = []
                 for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                     if not tmpSiteName in scanSiteList:
                         scanSiteList.append(tmpSiteName)
                 continue
             # pickup sites which have all data
             newScanList = []
             for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                 if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                     newScanList.append(tmpSiteName)
             scanSiteList = newScanList
             tmpLog.debug('{0} is available at {1} sites'.format(datasetName,len(scanSiteList)))
         tmpLog.debug('{0} candidates have input data'.format(len(scanSiteList)))
         # check for preassigned
         if sitePreAssigned and not taskSpec.site in scanSiteList:
             scanSiteList = []
             tmpLog.debug('data is unavailable locally or remotely at preassigned site {0}'.format(taskSpec.site))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retFatal
     ######################################
     # selection for status
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         skipFlag = False
         if tmpSiteSpec.status in ['offline']:
             skipFlag = True
         elif tmpSiteSpec.status in ['brokeroff','test']:
             if not sitePreAssigned:
                 skipFlag = True
             elif tmpSiteName != taskSpec.site:
                 skipFlag = True
         if not skipFlag:    
             newScanSiteList.append(tmpSiteName)
         else:
             tmpLog.debug('  skip %s due to status=%s' % (tmpSiteName,tmpSiteSpec.status))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for release
     if not taskSpec.transHome in [None,'AnalysisTransforms']:
         if taskSpec.transHome.startswith('ROOT'):
             # hack until x86_64-slc6-gcc47-opt is published in installedsw
             if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                 tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
             else:
                 tmpCmtConfig = taskSpec.architecture
             siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                      cmtConfig=tmpCmtConfig,
                                                                      onlyCmtConfig=True)
         else:    
             # remove AnalysisTransforms-
             transHome = re.sub('^[^-]+-*','',taskSpec.transHome)
             transHome = re.sub('_','-',transHome)
             if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None:
                 # cache is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          caches=transHome,
                                                                          cmtConfig=taskSpec.architecture)
             elif transHome == '':
                 # release is checked 
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases=taskSpec.transUses,
                                                                          cmtConfig=taskSpec.architecture)
             else:
                 # nightlies
                 siteListWithSW = self.taskBufferIF.checkSitesWithRelease(scanSiteList,
                                                                          releases='CVMFS')
                 #                                                         releases='nightlies',
                 #                                                         cmtConfig=taskSpec.architecture)
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # release check is disabled or release is available
             if tmpSiteSpec.releases == ['ANY'] or \
                tmpSiteSpec.cloud in ['ND']:
                 newScanSiteList.append(tmpSiteName)
             elif tmpSiteName in siteListWithSW:
                 newScanSiteList.append(tmpSiteName)
             else:
                 # release is unavailable
                 tmpLog.debug('  skip %s due to missing rel/cache %s:%s' % \
                              (tmpSiteName,taskSpec.transHome,taskSpec.architecture))
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed for SW {1}:{2}'.format(len(scanSiteList),
                                                                    taskSpec.transHome,
                                                                    taskSpec.architecture))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     minRamCount  = taskSpec.ramCount
     if not minRamCount in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxmemory != 0 and minRamCount != 0 and minRamCount > tmpSiteSpec.maxmemory:
                 tmpLog.debug('  skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                       tmpSiteSpec.maxmemory,
                                                                                                       minRamCount))
                 continue
             if tmpSiteSpec.minmemory != 0 and minRamCount != 0 and minRamCount < tmpSiteSpec.minmemory:
                 tmpLog.debug('  skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                      tmpSiteSpec.minmemory,
                                                                                                      minRamCount))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(len(scanSiteList),
                                                                          minRamCount,taskSpec.ramUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS / 1024 / 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR / 1024 / 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir != 0:
             if tmpSiteSpec.isDirectIO():
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug('  skip {0} due to small scratch disk={1} < {2}'.format(tmpSiteName,
                                                                                      tmpSiteSpec.maxwdir,
                                                                                      minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space != 0 and tmpSpaceSize < diskThreshold:
             tmpLog.debug('  skip {0} due to disk shortage in SE = {1} < {2}GB'.format(tmpSiteName,tmpSiteSpec.space,
                                                                                       diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if not minWalltime in [0,None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug('  skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(tmpSiteName,
                                                                                                         tmpSiteSpec.maxtime,
                                                                                                         minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug('  skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(tmpSiteName,
                                                                                                        tmpSiteSpec.mintime,
                                                                                                        minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList        
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(len(scanSiteList),minWalltime,taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if nWNmap.has_key(tmpSiteName):
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # check inclusion and exclusion
     newScanSiteList = []
     sitesForANY = []
     for tmpSiteName in scanSiteList:
         autoSite = False
         # check exclusion
         if AtlasBrokerUtils.isMatched(tmpSiteName,excludeList):
             tmpLog.debug('  skip {0} excluded'.format(tmpSiteName))
             continue
         # check inclusion
         if includeList != None and not AtlasBrokerUtils.isMatched(tmpSiteName,includeList):
             if 'AUTO' in includeList:
                 autoSite = True
             else:
                 tmpLog.debug('  skip {0} not included'.format(tmpSiteName))
                 continue
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # limited access
         if tmpSiteSpec.accesscontrol == 'grouplist':
             if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                     siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                 tmpLog.debug('  skip {0} limited access'.format(tmpSiteName))
                 continue
         # check cloud
         if not taskSpec.cloud in [None,'','any',tmpSiteSpec.cloud]: 
             tmpLog.debug('  skip {0} cloud mismatch'.format(tmpSiteName))
             continue
         if autoSite:
             sitesForANY.append(tmpSiteName)
         else:
             newScanSiteList.append(tmpSiteName)
     # use AUTO sites if no sites are included
     if newScanSiteList == []:
         newScanSiteList = sitesForANY
     else:
         for tmpSiteName in sitesForANY:
             tmpLog.debug('  skip {0} not included'.format(tmpSiteName))
     scanSiteList = newScanSiteList        
     tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format(len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt,sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # calculate weight
     fqans = taskSpec.makeFQANs()
     tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                               taskSpec.workingGroup,True)
     currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
     tmpLog.debug('currentPriority={0}'.format(currentPriority))
     tmpSt,jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(taskSpec.vo,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 currentPriority)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # check for preassigned
     if sitePreAssigned and not taskSpec.site in scanSiteList:
         tmpLog.debug("preassigned site {0} didn't pass all tests".format(taskSpec.site))
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning   = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'running',  None,None)
         nAssigned  = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'defined',  None,None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None)
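         # weight prefers sites with many running jobs and few assigned/activated ones; nAssigned is penalized twice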
         weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
         if remoteSourceList.has_key(tmpSiteName):
             nThrottled = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
             weight = float(nThrottled + 1)
          # normalize weights by taking data availability into account
         if dataWeight.has_key(tmpSiteName):
             weight = weight * dataWeight[tmpSiteName]
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # preassigned
         if sitePreAssigned and tmpSiteName == taskSpec.site:
             preSiteCandidateSpec = siteCandidateSpec
         # set weight
         siteCandidateSpec.weight = weight
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if not weightMap.has_key(weight):
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)    
     # limit the number of sites
     maxNumSites = 5
     weightList = weightMap.keys()
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites-len(candidateSpecList))]
     # append preassigned
     if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: 
         candidateSpecList.append(preSiteCandidateSpec)
     # collect site names
     scanSiteList = []    
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # get list of available files
     availableFileMap = {}     
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             fileScanSiteList = []
             for tmpSiteName in scanSiteList:
                 fileScanSiteList.append(tmpSiteName)
                 if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                     for tmpRemoteSite in remoteSourceList[tmpSiteName][datasetSpec.datasetName]:
                         if not tmpRemoteSite in fileScanSiteList:
                             fileScanSiteList.append(tmpRemoteSite)
             # mapping between sites and storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(fileScanSiteList,self.siteMapper)
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(datasetSpec,
                                                         siteStorageEP,
                                                         self.siteMapper,
                                                         ngGroup=[2])
             if tmpAvFileMap == None:
                 raise Interaction.JEDITemporaryError,'ddmIF.getAvailableFiles failed'
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed to get available files with %s %s' % (errtype.__name__,errvalue))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         tmpSiteName = siteCandidateSpec.siteName
         # preassigned
         if sitePreAssigned and tmpSiteName != taskSpec.site:
             tmpLog.debug('  skip {0} non pre-assigned site'.format(tmpSiteName))
             continue
         # set available files
         if inputChunk.getDatasets() == []: 
             isAvailable = True
         else:
             isAvailable = False
         for tmpDatasetName,availableFiles in availableFileMap.iteritems():
             tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
             # check remote files
             if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                 for tmpRemoteSite in remoteSourceList[tmpSiteName][tmpDatasetName]:
                     if availableFiles.has_key(tmpRemoteSite) and \
                             len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                         # use only remote disk files
                         siteCandidateSpec.remoteFiles += availableFiles[tmpRemoteSite]['localdisk']
                         # set remote site and access protocol
                         siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                         siteCandidateSpec.remoteSource   = tmpRemoteSite
                         isAvailable = True
                         break
             # local files
             if availableFiles.has_key(tmpSiteName):
                 if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                         len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']):
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['localdisk']
                     # add cached files to local list since cached files go to pending when reassigned
                     siteCandidateSpec.localDiskFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.localTapeFiles  += availableFiles[tmpSiteName]['localtape']
                     siteCandidateSpec.cacheFiles  += availableFiles[tmpSiteName]['cache']
                     siteCandidateSpec.remoteFiles += availableFiles[tmpSiteName]['remote']
                     isAvailable = True
                 else:
                     tmpLog.debug('{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4}'.format(tmpDatasetName,
                                                                                                       tmpSiteName,
                                                                                                       len(tmpDatasetSpec.Files),
                                                                                                       len(availableFiles[tmpSiteName]['localdisk']),
                                                                                                       len(availableFiles[tmpSiteName]['cache'])))
             if not isAvailable:
                 break
         # append
         if not isAvailable:
             tmpLog.debug('  skip {0} file unavailable'.format(siteCandidateSpec.siteName))
             continue
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5}'.format(siteCandidateSpec.siteName,
                                                                                                              siteCandidateSpec.weight,
                                                                                                              len(siteCandidateSpec.localDiskFiles),
                                                                                                              len(siteCandidateSpec.localTapeFiles),
                                                                                                              len(siteCandidateSpec.cacheFiles),
                                                                                                              len(siteCandidateSpec.remoteFiles),
                                                                                                              ))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')        
     return self.SC_SUCCEEDED,inputChunk
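(Note: the final weighting and site selection above follow the pattern sketched below; the helper names and sample numbers are illustrative, only the weight formula is taken from the example.)

import random

def site_weight(n_running, n_assigned, n_activated, data_weight=1):
    # same formula as above: favor running jobs, penalize assigned/activated ones
    w = float(n_running + 1) / float(n_activated + n_assigned + 1) / float(n_assigned + 1)
    return w * data_weight

def pick_sites(weights, max_num_sites=5):
    # weights: dict of site name -> weight; keep up to max_num_sites best sites
    by_weight = {}
    for site, w in weights.items():
        by_weight.setdefault(w, []).append(site)
    chosen = []
    for w in sorted(by_weight, reverse=True):
        if len(chosen) >= max_num_sites:
            break
        sites = by_weight[w]
        random.shuffle(sites)   # break ties randomly, as in the example
        chosen += sites[:max_num_sites - len(chosen)]
    return chosen

stats = {'SITE_A': (100, 10, 20), 'SITE_B': (5, 0, 0), 'SITE_C': (50, 5, 5)}
weights = {s: site_weight(r, a, act) for s, (r, a, act) in stats.items()}
print(pick_sites(weights))   # -> ['SITE_B', 'SITE_C', 'SITE_A']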
Example #50
0
 def doAction(self):
     try:
         # get logger
         tmpLog = MsgWrapper(logger)
         tmpLog.debug('start')
         origTmpLog = tmpLog
         # check every 60 min
         checkInterval = 60
         # get lib.tgz for waiting jobs
         libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo,self.prodSourceLabel,checkInterval)
         tmpLog.debug('got {0} lib.tgz files'.format(len(libList)))
         # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced
         for prodUserName,datasetName,tmpFileSpec in libList:
             tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(tmpFileSpec.jediTaskID))
             tmpLog.debug('start')
             # check status of lib.tgz
             if tmpFileSpec.status == 'failed':
                 # get buildJob 
                 pandaJobSpecs = self.taskBufferIF.peekJobs([tmpFileSpec.PandaID],
                                                            fromDefined=False,
                                                            fromActive=False,
                                                            fromWaiting=False)
                 pandaJobSpec = pandaJobSpecs[0]
                 if pandaJobSpec != None:
                     # kill
                     self.taskBufferIF.updateJobs([pandaJobSpec],False)
                     tmpLog.debug('  killed downstream jobs for user="{0}" with libDS={1}'.format(prodUserName,datasetName))
                 else:
                     # PandaJobSpec not found
                     tmpLog.error('  cannot find PandaJobSpec for user="{0}" with PandaID={1}'.format(prodUserName,
                                                                                                      tmpFileSpec.PandaID))
             elif tmpFileSpec.status == 'finished':
                 # set metadata
                 self.taskBufferIF.setGUIDs([{'guid':tmpFileSpec.GUID,
                                              'lfn':tmpFileSpec.lfn,
                                              'checksum':tmpFileSpec.checksum,
                                              'fsize':tmpFileSpec.fsize,
                                              'scope':tmpFileSpec.scope,
                                              }])
                 # get lib dataset
                 dataset = self.taskBufferIF.queryDatasetWithMap({'name':datasetName})
                 if dataset != None:
                     # activate jobs
                     aThr = Activator(self.taskBufferIF,dataset)
                     aThr.start()
                     aThr.join()
                     tmpLog.debug('  activated downstream jobs for user="{0}" with libDS={1}'.format(prodUserName,datasetName))
                 else:
                     # datasetSpec not found
                     tmpLog.error('  cannot find datasetSpec for user="{0}" with libDS={1}'.format(prodUserName,datasetName))
             else:
                 # lib.tgz is not ready
                 tmpLog.debug('  keep waiting for user="{0}" libDS={1}'.format(prodUserName,datasetName))
     except:
         tmpLog = origTmpLog
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
     # return
     tmpLog = origTmpLog
     tmpLog.debug('done')
     return self.SC_SUCCEEDED
Example #51
0
 def doSplit(self,taskSpec,inputChunk,siteMapper):
     # return for failure
     retFatal    = self.SC_FATAL,[]
     retTmpError = self.SC_FAILED,[]
     # make logger
     tmpLog = MsgWrapper(logger,'<jediTaskID={0} datasetID={1}>'.format(taskSpec.jediTaskID,inputChunk.masterIndexName))
     tmpLog.debug('start')
     if not inputChunk.isMerging:
         # set maxNumFiles using taskSpec if specified
         maxNumFiles = taskSpec.getMaxNumFilesPerJob()
         # set fsize gradients using taskSpec
         sizeGradients  = taskSpec.getOutDiskSize()
         # set fsize intercepts using taskSpec                
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # walltime
         walltimeGradient = taskSpec.walltime
         # number of events per job if defined
         nEventsPerJob = taskSpec.getNumEventsPerJob()
         # number of files per job if defined
         nFilesPerJob = taskSpec.getNumFilesPerJob()
         if nFilesPerJob == None and nEventsPerJob == None and inputChunk.useScout() and not taskSpec.useLoadXML():
             nFilesPerJob = 1
         # grouping with boundaryID
         useBoundary = taskSpec.useGroupWithBoundaryID()
         # fsize intercepts per input size
         sizeGradientsPerInSize = None
         # max primary output size
         maxOutSize = None
         # max size per job
         maxSizePerJob = taskSpec.getMaxSizePerJob()
     else:
         # set parameters for merging
         maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob()
         if maxNumFiles == None:
             maxNumFiles = 50
         sizeGradients = 0
         walltimeGradient = 0
         nFilesPerJob = taskSpec.getNumFilesPerMergeJob()
         nEventsPerJob = taskSpec.getNumEventsPerMergeJob()
         maxSizePerJob = None
         useBoundary = {'inSplit':3}
         # gradients per input size is 1
         sizeGradientsPerInSize = 1
         # intercepts for libDS
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # margin of 500MB for merging
         interceptsMergin = 500 * 1024 * 1024
         if sizeIntercepts < interceptsMergin:
             sizeIntercepts = interceptsMergin
         maxOutSize = taskSpec.getMaxSizePerMergeJob()
         if maxOutSize == None:
             # max output size is 5GB for merging by default
             maxOutSize = 5 * 1024 * 1024 * 1024
     # LB
     respectLB = taskSpec.respectLumiblock()
     # dump
     tmpLog.debug('maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}'.format(maxNumFiles,
                                                                                                sizeGradients,
                                                                                                sizeIntercepts,
                                                                                                useBoundary))
     tmpLog.debug('walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format(walltimeGradient,
                                                                                     nFilesPerJob,
                                                                                     nEventsPerJob))
     tmpLog.debug('sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2}'.format(sizeGradientsPerInSize,
                                                                                   maxOutSize,
                                                                                   respectLB))
     # split
     returnList = []
     subChunks  = []
     iSubChunks = 0
     nSubChunks = 50
     while True:
         # change site
         if iSubChunks % nSubChunks == 0:
             # append to return map
             if subChunks != []:
                 returnList.append({'siteName':siteName,
                                    'subChunks':subChunks,
                                    'siteCandidate':siteCandidate,
                                    })
                 # reset
                 subChunks = []
             # new candidate
             siteCandidate = inputChunk.getOneSiteCandidate()
             siteName = siteCandidate.siteName
             siteSpec = siteMapper.getSite(siteName)
             # get maxSize if it is set in taskSpec
             maxSize = maxSizePerJob
             if maxSize == None or maxSize > (siteSpec.maxwdir * 1024 * 1024):
                 # use maxwdir as the default maxSize
                 maxSize = siteSpec.maxwdir * 1024 * 1024
             # max walltime      
             maxWalltime = siteSpec.maxtime
             # core count
             if siteSpec.coreCount > 0:
                 coreCount = siteSpec.coreCount
             else:
                 coreCount = 1
             tmpLog.debug('chosen {0}'.format(siteName))
             tmpLog.debug('maxSize={0} maxWalltime={1} coreCount={2}'.format(maxSize,maxWalltime,coreCount))
         # get sub chunk
         subChunk = inputChunk.getSubChunk(siteName,maxSize=maxSize,
                                           maxNumFiles=maxNumFiles,
                                           sizeGradients=sizeGradients,
                                           sizeIntercepts=sizeIntercepts,
                                           nFilesPerJob=nFilesPerJob,
                                           walltimeGradient=walltimeGradient,
                                           maxWalltime=maxWalltime,
                                           nEventsPerJob=nEventsPerJob,
                                           useBoundary=useBoundary,
                                           sizeGradientsPerInSize=sizeGradientsPerInSize,
                                           maxOutSize=maxOutSize,
                                           coreCount=coreCount,
                                           respectLB=respectLB,
                                           tmpLog=tmpLog)
         if subChunk == None:
             break
         # append
         subChunks.append(subChunk)
         iSubChunks += 1
     # append to return map if remain
     if subChunks != []:
         returnList.append({'siteName':siteName,
                            'subChunks':subChunks,
                            'siteCandidate':siteCandidate,
                            })
     tmpLog.debug('split to %s subchunks' % iSubChunks)            
     # return
     return self.SC_SUCCEEDED,returnList
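(Note: the splitting loop above accumulates sub-chunks and rotates the site candidate every nSubChunks iterations. A minimal sketch of that grouping pattern follows, with toy stand-ins for the JEDI objects; group_subchunks and the sample inputs are illustrative only.)

def group_subchunks(sub_chunks, get_site, n_sub_chunks=50):
    # accumulate sub-chunks, switching to a new site every n_sub_chunks
    return_list = []
    current = []
    site = None
    for i, chunk in enumerate(sub_chunks):
        if i % n_sub_chunks == 0:
            if current:
                return_list.append({'siteName': site, 'subChunks': current})
                current = []
            site = get_site()
        current.append(chunk)
    if current:
        return_list.append({'siteName': site, 'subChunks': current})
    return return_list

# 120 dummy chunks spread over a rotating pair of sites
sites = iter(['SITE_A', 'SITE_B', 'SITE_A'])
grouped = group_subchunks(range(120), lambda: next(sites))
print([(g['siteName'], len(g['subChunks'])) for g in grouped])
# -> [('SITE_A', 50), ('SITE_B', 50), ('SITE_A', 20)]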
Example #52
0
 def doActionForReassgin(self,gTmpLog):
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to get reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         tmpLog = MsgWrapper(logger,'<jediTaskID={0}'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # get datasets
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
         if tmpStat != True:
             tmpLog.error('failed to get datasets')
             continue
         # update DB
         if not taskSpec.useWorldCloud():
             # update cloudtasks
             tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
             if tmpStat != 'SUCCEEDED':
                 tmpLog.error('failed to update CloudTasks')
                 continue
             # check cloud
             if not siteMapper.checkCloud(taskSpec.cloud):
                 tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                 continue
         else:
             # re-run task brokerage
             if taskSpec.nucleus in [None,'']:
                 taskSpec.status = 'assigning'
                 taskSpec.oldStatus = None
                 taskSpec.setToRegisterDatasets()
                 self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                   setOldModTime=True)
                 tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                 continue
             # get nucleus
             nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
             if nucleusSpec == None:
                 tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                 continue
             # set nucleus
             retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
             tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
         # get T1/nucleus
         if not taskSpec.useWorldCloud():
             t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         else:
             t1SiteName = nucleusSpec.getOnePandaSite()
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                 tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                 continue
             # get location
             location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
             # make subscription
             try:
                 tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                        ddmBackEnd))
                 tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                             'Production Output',asynchronous=True)
                 if tmpStat != True:
                     tmpLog.error("failed to make subscription")
                     isOK = False
                     break
             except:
                 errtype,errvalue = sys.exc_info()[:2]
                 tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                 isOK = False
                 break
         # succeeded
         if isOK:    
             # activate task
             if taskSpec.oldStatus in ['assigning','exhausted',None]:
                 taskSpec.status = 'ready'
             else:
                 taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                               setOldModTime=True)
             tmpLog.debug('finished to reassign')
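(Note: the per-dataset loop above registers one subscription per output/log dataset and aborts on the first failure before restoring the task status. A minimal sketch of that pattern is shown below; subscribe_all is illustrative and register_subscription is a hypothetical stand-in for ddmIF.registerDatasetSubscription.)

def subscribe_all(dataset_names, location, register_subscription):
    # subscribe every dataset to the destination endpoint; stop on the first failure
    for name in dataset_names:
        try:
            if register_subscription(name, location) is not True:
                return False
        except Exception as exc:
            print('failed to subscribe {0}: {1}'.format(name, exc))
            return False
    return True

# toy callback that always succeeds
print(subscribe_all(['task.output.root', 'task.log'], 'SOME_DATADISK',
                    lambda name, location: True))   # -> True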
Example #53
0
 def do_preassign(self):
     tmp_log = MsgWrapper(logger, 'do_preassign')
     # refresh
     self.refresh()
     # list of resource type
     resource_type_list = [
         rt.resource_name for rt in self.taskBufferIF.load_resource_types()
     ]
     # threshold in seconds: how long a queue must stay empty before preassignment is triggered
     empty_duration_threshold = 1800
     # return map
     ret_map = {
         'to_reassign': {},
     }
     # loop
     for prod_source_label in self.prodSourceLabelList:
         # site-rse map
         site_rse_map = self.get_site_rse_map(prod_source_label)
         # parameters from GDP config
         max_preassigned_tasks = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if max_preassigned_tasks is None:
             max_preassigned_tasks = 3
         min_files_ready = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_ready is None:
             min_files_ready = 50
         min_files_remaining = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_remaining is None:
             min_files_remaining = 100
         # load site empty-since map from cache
         site_empty_since_map_orig = self._get_from_ses_cache()
         # available sites
         available_sites_list = self.get_available_sites_list()
         # now timestamp
         now_time = datetime.datetime.utcnow()
         now_time_ts = int(now_time.timestamp())
         # update site empty-since map
         site_empty_since_map = copy.deepcopy(site_empty_since_map_orig)
         available_site_name_list = [x[0] for x in available_sites_list]
         for site in site_empty_since_map_orig:
             # remove sites that are no longer empty
             if site not in available_site_name_list:
                 del site_empty_since_map[site]
         for site in available_site_name_list:
             # add newly found empty sites
             if site not in site_empty_since_map_orig:
                 site_empty_since_map[site] = now_time_ts
         self._update_to_ses_cache(site_empty_since_map)
         # evaluate sites to preassign according to cache
         # get blacklisted_tasks_map from cache
         blacklisted_tasks_map = self._get_from_bt_cache()
         blacklisted_tasks_set = set()
         for bt_list in blacklisted_tasks_map.values():
             blacklisted_tasks_set |= set(bt_list)
         # loop over available sites to preassign
         for (site, tmpSiteSpec, n_jobs_to_fill) in available_sites_list:
             # rses of the available site
             available_rses = set()
             try:
                 available_rses.update(set(site_rse_map[site]))
             except KeyError:
                 tmp_log.debug(
                     'skipped {site} since no good RSE'.format(site=site))
                 continue
             # do not consider TAPE rses
             for rse in set(available_rses):
                 if 'TAPE' in str(rse):
                     available_rses.remove(rse)
             # skip if no rse for available site
             if not available_rses:
                 tmp_log.debug(
                     'skipped {site} since no available RSE'.format(
                         site=site))
                 continue
             # skip if no coreCount set
             if not tmpSiteSpec.coreCount or tmpSiteSpec.coreCount <= 0:
                 tmp_log.debug(
                     'skipped {site} since coreCount is not set'.format(
                         site=site))
                 continue
             # now timestamp
             now_time = datetime.datetime.utcnow()
             now_time_ts = int(now_time.timestamp())
             # skip if not empty for long enough
             if site not in site_empty_since_map:
                 tmp_log.error(
                     'skipped {site} since not in empty-since map (should not happen)'
                     .format(site=site))
                 continue
             empty_duration = now_time_ts - site_empty_since_map[site]
             tmp_num_slots = tmpSiteSpec.getNumStandby(None, None)
             if empty_duration < empty_duration_threshold and not tmp_num_slots:
                 tmp_log.debug(
                     'skipped {site} since not empty for enough time ({ed}s < {edt}s)'
                     .format(site=site,
                             ed=empty_duration,
                             edt=empty_duration_threshold))
                 continue
             # only simul tasks if site has fairsharePolicy setup
             processing_type_constraint = ''
             if tmpSiteSpec.fairsharePolicy not in ('NULL', None):
                 if 'type=simul:0%' in tmpSiteSpec.fairsharePolicy:
                     # skip if zero share of simul
                     tmp_log.debug(
                         'skipped {site} since with fairshare but zero for simul'
                         .format(site=site))
                     continue
                 else:
                     processing_type_constraint = "AND t.processingType='simul' "
             # site attributes
             site_maxrss = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss not in (
                 0, None) else 999999
             site_corecount = tmpSiteSpec.coreCount
             site_capability = str(tmpSiteSpec.capability).lower()
             # make sql parameters of rses
             available_rses = list(available_rses)
             rse_params_list = []
             rse_params_map = {}
             for j, rse in enumerate(available_rses):
                 rse_param = ':rse_{0}'.format(j + 1)
                 rse_params_list.append(rse_param)
                 rse_params_map[rse_param] = rse
             rse_params_str = ','.join(rse_params_list)
             # sql
             sql_query = (
                 "SELECT t.jediTaskID, t.workQueue_ID "
                 "FROM {jedi_schema}.JEDI_Tasks t "
                 "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL "
                 "AND t.prodSourceLabel=:prodSourceLabel "
                 "AND t.resource_type=:resource_type "
                 "AND site IS NULL "
                 "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss "
                 "AND t.eventService=0 "
                 "AND EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                 "WHERE dl.jediTaskID=t.jediTaskID "
                 "AND dl.rse IN ({rse_params_str}) "
                 ") "
                 "{processing_type_constraint} "
                 "AND EXISTS ( "
                 "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                 "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                 "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready "
                 "AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                 ") "
                 "ORDER BY t.currentPriority DESC "
                 "FOR UPDATE ").format(
                     jedi_schema=jedi_config.db.schemaJEDI,
                     rse_params_str=rse_params_str,
                     processing_type_constraint=processing_type_constraint)
             # loop over resource type
             for resource_type in resource_type_list:
                 # key name for preassigned_tasks_map = site + resource_type
                 key_name = '{0}|{1}'.format(site, resource_type)
                 # skip if not match with site capability
                 if site_capability == 'score' and not resource_type.startswith(
                         'SCORE'):
                     continue
                 elif site_capability == 'mcore' and not resource_type.startswith(
                         'MCORE'):
                     continue
                 # params map
                 params_map = {
                     ':prodSourceLabel': prod_source_label,
                     ':resource_type': resource_type,
                     ':site_maxrss': site_maxrss,
                     ':site_corecount': site_corecount,
                     ':min_files_ready': min_files_ready,
                     ':min_files_remaining': min_files_remaining,
                 }
                 params_map.update(rse_params_map)
                 # get preassigned_tasks_map from cache
                 preassigned_tasks_map = self._get_from_pt_cache()
                 preassigned_tasks_cached = preassigned_tasks_map.get(
                     key_name, [])
                 # get task_orig_attr_map from cache
                 task_orig_attr_map = self._get_from_attr_cache()
                 # number of tasks already preassigned
                 n_preassigned_tasks = len(preassigned_tasks_cached)
                 # number of tasks to preassign
                 n_tasks_to_preassign = max(
                     max_preassigned_tasks - n_preassigned_tasks, 0)
                 # preassign
                 if n_tasks_to_preassign <= 0:
                     tmp_log.debug(
                         '{key_name:<64} already has enough preassigned tasks ({n_tasks:>3}) ; skipped '
                         .format(key_name=key_name,
                                 n_tasks=n_preassigned_tasks))
                 elif DRY_RUN:
                     dry_sql_query = (
                         "SELECT t.jediTaskID, t.workQueue_ID "
                         "FROM {jedi_schema}.JEDI_Tasks t "
                         "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL "
                         "AND t.prodSourceLabel=:prodSourceLabel "
                         "AND t.resource_type=:resource_type "
                         "AND site IS NULL "
                         "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss "
                         "AND t.eventService=0 "
                         "AND EXISTS ( "
                         "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                         "WHERE dl.jediTaskID=t.jediTaskID "
                         "AND dl.rse IN ({rse_params_str}) "
                         ") "
                         "{processing_type_constraint} "
                         "AND EXISTS ( "
                         "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                         "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                         "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready "
                         "AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                         ") "
                         "ORDER BY t.currentPriority DESC ").format(
                             jedi_schema=jedi_config.db.schemaJEDI,
                             rse_params_str=rse_params_str,
                             processing_type_constraint=
                             processing_type_constraint)
                     # tmp_log.debug('[dry run] {} {}'.format(dry_sql_query, params_map))
                     res = self.taskBufferIF.querySQL(
                         dry_sql_query, params_map)
                     n_tasks = 0 if res is None else len(res)
                     if n_tasks > 0:
                         result = [
                             x[0] for x in res
                             if x[0] not in preassigned_tasks_cached
                         ]
                         updated_tasks = result[:n_tasks_to_preassign]
                         tmp_log.debug(
                             '[dry run] {key_name:<64} {n_tasks:>3} tasks would be preassigned '
                             .format(key_name=key_name,
                                     n_tasks=len(updated_tasks)))
                         # update preassigned_tasks_map into cache
                         preassigned_tasks_map[key_name] = list(
                             set(updated_tasks)
                             | set(preassigned_tasks_cached))
                         tmp_log.debug('{} ; {}'.format(
                             str(updated_tasks),
                             str(preassigned_tasks_map[key_name])))
                         self._update_to_pt_cache(preassigned_tasks_map)
                 else:
                     updated_tasks_orig_attr = self.taskBufferIF.queryTasksToPreassign_JEDI(
                         sql_query,
                         params_map,
                         site,
                         blacklist=blacklisted_tasks_set,
                         limit=n_tasks_to_preassign)
                     if updated_tasks_orig_attr is None:
                         # dbproxy method failed
                         tmp_log.error(
                             '{key_name:<64} failed to preassign tasks '.
                             format(key_name=key_name))
                     else:
                         n_tasks = len(updated_tasks_orig_attr)
                         if n_tasks > 0:
                             updated_tasks = [
                                 x[0] for x in updated_tasks_orig_attr
                             ]
                             tmp_log.info(
                                 '{key_name:<64} {n_tasks:>3} tasks preassigned : {updated_tasks}'
                                 .format(key_name=key_name,
                                         n_tasks=str(n_tasks),
                                         updated_tasks=updated_tasks))
                             # update preassigned_tasks_map into cache
                             preassigned_tasks_map[key_name] = list(
                                 set(updated_tasks)
                                 | set(preassigned_tasks_cached))
                             self._update_to_pt_cache(preassigned_tasks_map)
                             # update task_orig_attr_map into cache and return map
                             for taskid, orig_attr in updated_tasks_orig_attr:
                                 taskid_str = str(taskid)
                                 task_orig_attr_map[taskid_str] = orig_attr
                                 ret_map['to_reassign'][taskid] = {
                                     'site': site,
                                     'n_jobs_to_fill': n_jobs_to_fill,
                                 }
                             self._update_to_attr_cache(task_orig_attr_map)
                             # Kibana log
                             for taskid in updated_tasks:
                                 tmp_log.debug(
                                     '#ATM #KV jediTaskID={taskid} action=do_preassign site={site} rtype={rtype} preassigned '
                                     .format(taskid=taskid,
                                             site=site,
                                             rtype=resource_type))
                         else:
                             tmp_log.debug(
                                 '{key_name:<64} found no proper task to preassign'
                                 .format(key_name=key_name))
     # total preassigned tasks
     preassigned_tasks_map = self._get_from_pt_cache()
     n_pt_tot = sum(
         [len(pt_list) for pt_list in preassigned_tasks_map.values()])
     tmp_log.debug('now {n_pt_tot} tasks preassigned in total'.format(
         n_pt_tot=n_pt_tot))
     # return
     return ret_map
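
The site empty-since bookkeeping in do_preassign above is a small state update: sites that are no longer empty lose their timestamp, newly empty sites are stamped with the current time, and everything else keeps its existing stamp. A minimal standalone sketch of that step, with the cache helpers replaced by plain dict arguments (an assumption made for illustration only):

import copy
import datetime

def update_site_empty_since_map(prev_map, available_site_names):
    # prev_map: site -> epoch seconds when the site was first seen empty
    # available_site_names: sites that currently have free slots
    now_ts = int(datetime.datetime.utcnow().timestamp())
    new_map = copy.deepcopy(prev_map)
    available = set(available_site_names)
    # remove sites that are no longer empty
    for site in list(prev_map):
        if site not in available:
            del new_map[site]
    # add newly found empty sites with the current timestamp
    for site in available:
        if site not in prev_map:
            new_map[site] = now_ts
    return new_map
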
Example #54
0
 def doBrokerage(self,inputList,vo,prodSourceLabel,workQueue):
     # list with a lock
     inputListWorld = ListWithLock([])
     # variables for submission
     maxBunchTask = 100
     # make logger
     tmpLog = MsgWrapper(logger)
     tmpLog.debug('start doBrokerage')
     # return for failure
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     tmpLog.debug('vo={0} label={1} queue={2} nTasks={3}'.format(vo,prodSourceLabel,
                                                                 workQueue.queue_name,
                                                                 len(inputList)))
     # loop over all tasks
     allRwMap    = {}
     prioMap     = {}
     tt2Map      = {}
     expRWs      = {}
     jobSpecList = []
     for tmpJediTaskID,tmpInputList in inputList:
         for taskSpec,cloudName,inputChunk in tmpInputList:
             # collect tasks for WORLD
             if taskSpec.useWorldCloud():
                 inputListWorld.append((taskSpec,inputChunk))
                 continue
             # make JobSpec to be submitted for TaskAssigner
             jobSpec = JobSpec()
             jobSpec.taskID     = taskSpec.jediTaskID
             jobSpec.jediTaskID = taskSpec.jediTaskID
             # set managed to trigger TA
             jobSpec.prodSourceLabel  = 'managed'
             jobSpec.processingType   = taskSpec.processingType
             jobSpec.workingGroup     = taskSpec.workingGroup
             jobSpec.metadata         = taskSpec.processingType
             jobSpec.assignedPriority = taskSpec.taskPriority
             jobSpec.currentPriority  = taskSpec.currentPriority
             jobSpec.maxDiskCount     = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) // 1024 // 1024
             if taskSpec.useWorldCloud():
                 # use destinationSE to trigger task brokerage in WORLD cloud
                 jobSpec.destinationSE = taskSpec.cloud
             prodDBlock = None
             setProdDBlock = False
             for datasetSpec in inputChunk.getDatasets():
                 prodDBlock = datasetSpec.datasetName
                 if datasetSpec.isMaster():
                     jobSpec.prodDBlock = datasetSpec.datasetName
                     setProdDBlock = True
                 for fileSpec in datasetSpec.Files:
                     tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                     jobSpec.addFile(tmpInFileSpec)
             # use secondary dataset name as prodDBlock
             if not setProdDBlock and prodDBlock is not None:
                 jobSpec.prodDBlock = prodDBlock
             # append
             jobSpecList.append(jobSpec)
             prioMap[jobSpec.taskID] = jobSpec.currentPriority
             tt2Map[jobSpec.taskID]  = jobSpec.processingType
             # get RW for a priority
             if jobSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                    jobSpec.currentPriority) 
                 if tmpRW is None:
                     tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority))
                     return retTmpError
                 allRwMap[jobSpec.currentPriority] = tmpRW
             # get expected RW
             expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
             if expRW is None:
                 tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID))
                 return retTmpError
             expRWs[jobSpec.taskID] = expRW
     # for old clouds
     if jobSpecList:
         # get fullRWs
         fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
         if fullRWs is None:
             tmpLog.error('failed to calculate full RW')
             return retTmpError
         # set metadata
         for jobSpec in jobSpecList:
             rwValues = allRwMap[jobSpec.currentPriority]
             jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata,
                                                       str(rwValues),str(expRWs),
                                                       str(prioMap),str(fullRWs),
                                                       str(tt2Map))
         tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
         nBunchTask = 0
         while nBunchTask < len(jobSpecList):
             # get a bunch
             jobsBunch = jobSpecList[nBunchTask:nBunchTask+maxBunchTask]
             strIDs = 'jediTaskID='
             for tmpJobSpec in jobsBunch:
                 strIDs += '{0},'.format(tmpJobSpec.taskID)
             strIDs = strIDs[:-1]
             tmpLog.debug(strIDs)
             # increment index
             nBunchTask += maxBunchTask
             # run task brokerage
             stS,outSs = PandaClient.runTaskAssignment(jobsBunch)
             tmpLog.debug('{0}:{1}'.format(stS,str(outSs)))
     # for WORLD
     if len(inputListWorld) > 0:
         # thread pool
         threadPool = ThreadPool()
         # get full RW for WORLD
         fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
         if fullRWs is None:
             tmpLog.error('failed to calculate full WORLD RW')
             return retTmpError
         # get RW per priority
         for taskSpec,inputChunk in inputListWorld:
             if taskSpec.currentPriority not in allRwMap:
                 tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                         taskSpec.currentPriority)
                 if tmpRW is None:
                     tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority))
                     return retTmpError
                 allRwMap[taskSpec.currentPriority] = tmpRW
         # live counter for RWs
         liveCounter = MapWithLock(allRwMap)
         # make workers
         ddmIF = self.ddmIF.getInterface(vo)
         for iWorker in range(4):
             thr = AtlasProdTaskBrokerThread(inputListWorld,threadPool,
                                             self.taskBufferIF,ddmIF,
                                             fullRWs,liveCounter)
             thr.start()
         threadPool.join(60*10)
     # return
     tmpLog.debug('doBrokerage done')
     return self.SC_SUCCEEDED
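
The submission loop in doBrokerage walks jobSpecList in slices of maxBunchTask before each call to PandaClient.runTaskAssignment. The same bunching can be written as a small generator; this is only an illustrative refactoring, not part of the original code:

def iter_bunches(items, bunch_size=100):
    # yield consecutive slices of at most bunch_size items
    for start in range(0, len(items), bunch_size):
        yield items[start:start + bunch_size]

# usage sketch: each bunch would be handed to the task assigner
# for jobsBunch in iter_bunches(jobSpecList, maxBunchTask):
#     stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
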
Example #55
0
 def doUpdateDataLocality(self):
     tmpLog = MsgWrapper(logger, ' #ATM #KV doUpdateDataLocality')
     tmpLog.debug('start')
     try:
         # lock
         got_lock = self.taskBufferIF.lockProcess_JEDI(  vo=self.vo, prodSourceLabel='default',
                                                         cloud=None, workqueue_id=None, resource_name=None,
                                                         component='AtlasDataLocalityUpdaterWatchDog.doUpdateDataLocality',
                                                         pid=self.pid, timeLimit=240)
         if not got_lock:
             tmpLog.debug('locked by another process. Skipped')
             return
         tmpLog.debug('got lock')
         # get list of datasets
         datasets_list = self.get_datasets_list()
         tmpLog.debug('got {0} datasets to update'.format(len(datasets_list)))
         # make thread pool
         thread_pool = ThreadPool()
         # make workers
         n_workers = 4
         for _ in range(n_workers):
             thr = DataLocalityUpdaterThread(taskDsList=datasets_list,
                                             threadPool=thread_pool,
                                             taskbufferIF=self.taskBufferIF,
                                             ddmIF=self.ddmIF,
                                             pid=self.pid,
                                             loggerObj=tmpLog)
             thr.start()
         tmpLog.debug('started {0} updater workers'.format(n_workers))
         # join
         thread_pool.join()
         # done
         tmpLog.debug('done')
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))
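
doUpdateDataLocality fans the dataset list out to a fixed number of updater threads and then joins them. With the codebase's own ThreadPool and DataLocalityUpdaterThread classes left aside, the shape of that fan-out can be sketched with the standard library (assumption: update_one is any callable taking a single dataset entry):

from concurrent.futures import ThreadPoolExecutor

def update_all(datasets, update_one, n_workers=4):
    # run update_one(dataset) over all datasets with n_workers threads and wait for completion
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        list(pool.map(update_one, datasets))
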
Example #56
0
    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.debug('start TaskBroker')
                # get work queue mapper
                workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                resource_types = self.taskBufferIF.load_resource_types()

                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # loop over all work queues
                        for workQueue in workQueueMapper.getAlignedQueueList(
                                vo, prodSourceLabel):
                            for resource_type in resource_types:
                                wq_name = '_'.join(
                                    workQueue.queue_name.split(' '))
                                msgLabel = 'vo={0} label={1} queue={2} resource_type={3}: '.\
                                    format(vo, prodSourceLabel, wq_name, resource_type.resource_name)
                                tmpLog.debug(msgLabel + 'start')
                                # get the list of tasks to check
                                tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI(
                                    vo, prodSourceLabel, workQueue,
                                    resource_type.resource_name)
                                if tmpList is None:
                                    # failed
                                    tmpLog.error(
                                        msgLabel +
                                        'failed to get the list of tasks to check'
                                    )
                                else:
                                    tmpLog.debug(msgLabel +
                                                 'got tasks_to_check={0}'.
                                                 format(len(tmpList)))
                                    # put to a locked list
                                    taskList = ListWithLock(tmpList)
                                    # make thread pool
                                    threadPool = ThreadPool()
                                    # make workers
                                    nWorker = jedi_config.taskbroker.nWorkers
                                    for iWorker in range(nWorker):
                                        thr = TaskCheckerThread(
                                            taskList, threadPool,
                                            self.taskBufferIF, self.ddmIF,
                                            self, vo, prodSourceLabel)
                                        thr.start()
                                    # join
                                    threadPool.join()
                                # get the list of tasks to assign
                                tmpList = self.taskBufferIF.getTasksToAssign_JEDI(
                                    vo, prodSourceLabel, workQueue,
                                    resource_type.resource_name)
                                if tmpList is None:
                                    # failed
                                    tmpLog.error(
                                        msgLabel +
                                        'failed to get the list of tasks to assign'
                                    )
                                else:
                                    tmpLog.debug(msgLabel +
                                                 'got tasks_to_assign={0}'.
                                                 format(len(tmpList)))
                                    # put to a locked list
                                    taskList = ListWithLock(tmpList)
                                    # make thread pool
                                    threadPool = ThreadPool()
                                    # make workers
                                    nWorker = jedi_config.taskbroker.nWorkers
                                    for iWorker in range(nWorker):
                                        thr = TaskBrokerThread(
                                            taskList, threadPool,
                                            self.taskBufferIF, self.ddmIF,
                                            self, vo, prodSourceLabel,
                                            workQueue,
                                            resource_type.resource_name)
                                        thr.start()
                                    # join
                                    threadPool.join()
                                tmpLog.debug(msgLabel + 'done')
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
            tmpLog.debug('done')
            # sleep if needed
            loopCycle = jedi_config.taskbroker.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep(max_val=loopCycle)
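
The pacing at the end of start() subtracts the elapsed time of one iteration from the configured loop cycle and sleeps for whatever remains. As a standalone helper (illustrative only; the original reads timedelta.seconds, while total_seconds() is used here to stay correct for long iterations):

import datetime
import time

def sleep_until_next_cycle(start_time, loop_cycle_seconds):
    # start_time: UTC datetime taken at the beginning of the iteration
    elapsed = (datetime.datetime.utcnow() - start_time).total_seconds()
    remaining = loop_cycle_seconds - elapsed
    if remaining > 0:
        time.sleep(remaining)
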
Example #57
0
 def findMissingFiles(self, jediTaskID, cloudName):
     tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
     tmpLog.debug('start findMissingFiles')
     # return for failure
     retError = self.SC_FAILED
     # get datasets
     tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
         jediTaskID, ['input'], True)
     if not tmpSt:
         tmpLog.error('failed to get the list of datasets')
         return retError
     # loop over all datasets
     for datasetSpec in datasetSpecList:
         # check only master dataset
         if not datasetSpec.isMaster():
             continue
         tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
         # get ddmIF
         ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
         if ddmIF is None:
             tmpLog.error('failed to get DDM I/F for vo={0}'.format(
                 datasetSpec.vo))
             return retError
         # get the list of sites where data is available
         tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(
             self.siteMapper, ddmIF, datasetSpec.datasetName)
         if tmpSt != self.SC_SUCCEEDED:
             tmpLog.error(
                 'failed to get the list of sites where {0} is available, since {1}'
                 .format(datasetSpec.datasetName, tmpRet))
             return retError
         dataSiteMap = tmpRet
         # data is unavailable in cloud
         if cloudName not in dataSiteMap:
             tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(
                 datasetSpec.datasetName, cloudName, str(dataSiteMap)))
             return retError
         # mapping between sites and storage endpoints
         checkedSites = [self.siteMapper.getCloud(cloudName)['source']
                         ] + dataSiteMap[cloudName]['t2']
         siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
             checkedSites, self.siteMapper)
         # get available files per site/endpoint
         tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                                siteStorageEP,
                                                self.siteMapper,
                                                ngGroup=[1],
                                                checkLFC=True)
         if tmpAvFileMap is None:
             tmpLog.error(
                 'failed to get available file list for {0}'.format(
                     datasetSpec.datasetName))
             return retError
         # check availability
         missingFiles = []
         for fileSpec in datasetSpec.Files:
             fileFound = False
             for tmpSiteName, availableFilesMap in tmpAvFileMap.items():
                 for tmpStorageType, availableFiles in availableFilesMap.items():
                     for availableFile in availableFiles:
                         if fileSpec.lfn == availableFile.lfn:
                             fileFound = True
                             break
                     if fileFound:
                         break
                 if fileFound:
                     break
             # missing
             if not fileFound:
                 missingFiles.append(fileSpec.fileID)
                 tmpLog.debug('{0} missing'.format(fileSpec.lfn))
         # update contents
         if missingFiles:
             tmpSt = self.taskBufferIF.setMissingFiles_JEDI(
                 jediTaskID, datasetSpec.datasetID, missingFiles)
             if not tmpSt:
                 tmpLog.error('failed to set missing files in {0}'.format(
                     datasetSpec.datasetName))
                 return retError
     tmpLog.debug('done findMissingFiles')
     return self.SC_SUCCEEDED
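
The triple-nested loop in findMissingFiles only asks whether each LFN appears anywhere in the per-site, per-storage availability map. One possible simplification, sketched with plain containers and assuming the available-file objects expose an lfn attribute as above, is to collect the available LFNs into a set first:

def find_missing_file_ids(file_specs, available_file_map):
    # available_file_map: {site: {storage_type: [objects with .lfn]}}
    available_lfns = set()
    for files_by_storage in available_file_map.values():
        for available_files in files_by_storage.values():
            available_lfns.update(f.lfn for f in available_files)
    # return the fileIDs whose LFN is nowhere available
    return [f.fileID for f in file_specs if f.lfn not in available_lfns]
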
Example #58
0
 def do_for_data_locality(self):
     tmp_log = MsgWrapper(logger)
     # refresh
     self.refresh()
     # list of resource types
     # resource_type_list = [ rt.resource_name for rt in self.taskBufferIF.load_resource_types() ]
     # loop
     for prod_source_label in self.prodSourceLabelList:
         # site-rse map and blacklisted rses
         site_rse_map, blacklisted_rse_set = self.get_site_rse_map_and_blacklisted_rse_set(
             prod_source_label)
         tmp_log.debug('Found {0} blacklisted RSEs : {1}'.format(
             len(blacklisted_rse_set), ','.join(list(blacklisted_rse_set))))
         # parameter from GDP config
         upplimit_ioIntensity = self.taskBufferIF.getConfigValue(
             'task_withholder',
             'LIMIT_IOINTENSITY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         lowlimit_currentPriority = self.taskBufferIF.getConfigValue(
             'task_withholder',
             'LIMIT_PRIORITY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if upplimit_ioIntensity is None:
             upplimit_ioIntensity = 999999
         if lowlimit_currentPriority is None:
             lowlimit_currentPriority = -999999
         upplimit_ioIntensity = max(upplimit_ioIntensity, 100)
         # get work queue for gshare
         work_queue_list = self.workQueueMapper.getAlignedQueueList(
             self.vo, prod_source_label)
         # loop over work queue
         for work_queue in work_queue_list:
             gshare = work_queue.queue_name
             # get cutoff
             cutoff = self.taskBufferIF.getConfigValue(
                 'jobbroker', 'NQUEUELIMITSITE_{}'.format(gshare), 'jedi',
                 self.vo)
             if not cutoff:
                 cutoff = 20
             # busy sites
             busy_sites_list = self.get_busy_sites(gshare, cutoff)
             # rses of busy sites
             busy_rses = set()
             for site in busy_sites_list:
                 try:
                     busy_rses.update(set(site_rse_map[site]))
                 except KeyError:
                     continue
             # make sql parameters of rses
             to_exclude_rses = list(busy_rses | blacklisted_rse_set)
             rse_params_list = []
             rse_params_map = {}
             for j, rse in enumerate(to_exclude_rses):
                 rse_param = ':rse_{0}'.format(j + 1)
                 rse_params_list.append(rse_param)
                 rse_params_map[rse_param] = rse
             rse_params_str = ','.join(rse_params_list)
             # sql
             sql_query = (
                 "SELECT t.jediTaskID "
                 "FROM {jedi_schema}.JEDI_Tasks t "
                 "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL "
                 "AND t.gshare=:gshare "
                 "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority "
                 "AND EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Datasets d "
                 "WHERE d.jediTaskID=t.jediTaskID "
                 "AND d.type='input' "
                 ") "
                 "AND NOT EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                 "WHERE dl.jediTaskID=t.jediTaskID "
                 "AND dl.rse NOT IN ({rse_params_str}) "
                 ") "
                 "FOR UPDATE ").format(
                     jedi_schema=jedi_config.db.schemaJEDI,
                     rse_params_str=rse_params_str)
             # params map
             params_map = {
                 ':gshare': gshare,
                 ':ioIntensity': upplimit_ioIntensity,
                 ':currentPriority': lowlimit_currentPriority,
             }
             params_map.update(rse_params_map)
             # pending reason
             reason = 'no local input data, ioIntensity>={ioIntensity}, currentPriority<{currentPriority},'\
                      'nQueue>max({cutOff},nRunning*2) at all sites where the task can run'.format(
                 ioIntensity=upplimit_ioIntensity,currentPriority=lowlimit_currentPriority,
                 cutOff=cutoff)
             # set pending
             dry_run = False
             if dry_run:
                 dry_sql_query = (
                     "SELECT t.jediTaskID "
                     "FROM {jedi_schema}.JEDI_Tasks t "
                     "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL "
                     "AND t.gshare=:gshare "
                     "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority "
                     "AND EXISTS ( "
                     "SELECT * FROM {jedi_schema}.JEDI_Datasets d "
                     "WHERE d.jediTaskID=t.jediTaskID "
                     "AND d.type='input' "
                     ") "
                     "AND NOT EXISTS ( "
                     "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                     "WHERE dl.jediTaskID=t.jediTaskID "
                     "AND dl.rse NOT IN ({rse_params_str}) "
                     ") ").format(jedi_schema=jedi_config.db.schemaJEDI,
                                  rse_params_str=rse_params_str)
                 res = self.taskBufferIF.querySQL(dry_sql_query, params_map)
                 n_tasks = 0 if res is None else len(res)
                 if n_tasks > 0:
                     result = [x[0] for x in res]
                     tmp_log.debug(
                         '[dry run] gshare: {gshare:<16} {n_tasks:>5} tasks would be pending : {result} ; reason="{reason}" '
                         .format(gshare=gshare,
                                 n_tasks=n_tasks,
                                 result=result,
                                 reason=reason))
             else:
                 n_tasks = self.taskBufferIF.queryTasksToBePending_JEDI(
                     sql_query, params_map, reason)
                 if n_tasks is not None and n_tasks > 0:
                     tmp_log.info(
                         'gshare: {gshare:<16} {n_tasks:>5} tasks got pending ; reason="{reason}" '
                         .format(gshare=gshare,
                                 n_tasks=str(n_tasks),
                                 reason=reason))
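
Both do_preassign and do_for_data_locality build a list of named bind variables (:rse_1, :rse_2, ...) so that a variable-length RSE list can be used safely inside an SQL IN (...) clause. The pattern in isolation, as a hedged sketch with a hypothetical helper name:

def make_in_clause_params(values, prefix='rse'):
    # build (placeholder_string, params_map) for an SQL IN (...) clause with named binds
    params_map = {}
    placeholders = []
    for i, value in enumerate(values, start=1):
        name = ':{0}_{1}'.format(prefix, i)
        placeholders.append(name)
        params_map[name] = value
    return ','.join(placeholders), params_map

# usage sketch
# rse_params_str, rse_params_map = make_in_clause_params(['RSE_A', 'RSE_B'])
# -> (':rse_1,:rse_2', {':rse_1': 'RSE_A', ':rse_2': 'RSE_B'})
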
Example #59
0
 def undo_preassign(self):
     tmp_log = MsgWrapper(logger, 'undo_preassign')
     # refresh
     self.refresh()
     # busy sites
     busy_sites_dict = self.get_busy_sites()
     # loop to undo preassignment
     for prod_source_label in self.prodSourceLabelList:
         # parameter from GDP config
         max_preassigned_tasks = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if max_preassigned_tasks is None:
             max_preassigned_tasks = 3
         min_files_ready = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_ready is None:
             min_files_ready = 50
         min_files_remaining = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_remaining is None:
             min_files_remaining = 100
         # clean up outdated blacklist
         blacklist_duration_hours = 12
         blacklisted_tasks_map_orig = self._get_from_bt_cache()
         blacklisted_tasks_map = copy.deepcopy(blacklisted_tasks_map_orig)
         now_time = datetime.datetime.utcnow()
         min_allowed_time = now_time - datetime.timedelta(
             hours=blacklist_duration_hours)
         min_allowed_ts = int(min_allowed_time.timestamp())
         for ts_str in blacklisted_tasks_map_orig:
             ts = int(ts_str)
             if ts < min_allowed_ts:
                 del blacklisted_tasks_map[ts_str]
         self._update_to_bt_cache(blacklisted_tasks_map)
         n_bt_old = sum([
             len(bt_list)
             for bt_list in blacklisted_tasks_map_orig.values()
         ])
         n_bt = sum(
             [len(bt_list) for bt_list in blacklisted_tasks_map.values()])
         tmp_log.debug(
             'done cleanup blacklist; before {n_bt_old} , now {n_bt} tasks in blacklist'
             .format(n_bt_old=n_bt_old, n_bt=n_bt))
         # get a copy of preassigned_tasks_map from cache
         preassigned_tasks_map_orig = self._get_from_pt_cache()
         preassigned_tasks_map = copy.deepcopy(preassigned_tasks_map_orig)
         # clean up task_orig_attr_map in cache
         task_orig_attr_map_orig = self._get_from_attr_cache()
         task_orig_attr_map = copy.deepcopy(task_orig_attr_map_orig)
         all_preassiged_taskids = set()
         for taskid_list in preassigned_tasks_map_orig.values():
             all_preassiged_taskids |= set(taskid_list)
         for taskid_str in task_orig_attr_map_orig:
             taskid = int(taskid_str)
             if taskid not in all_preassiged_taskids:
                 del task_orig_attr_map[taskid_str]
         self._update_to_attr_cache(task_orig_attr_map)
         # loop on preassigned tasks in cache
         for key_name in preassigned_tasks_map_orig:
             # parse key name = site + resource_type
             site, resource_type = key_name.split('|')
             # preassigned tasks in cache
             preassigned_tasks_cached = preassigned_tasks_map.get(
                 key_name, [])
             # force_undo=True for all tasks at busy sites; force_undo=False for tasks no longer in a status to generate jobs
             force_undo = False
             if site in busy_sites_dict or len(
                     preassigned_tasks_cached) > max_preassigned_tasks:
                 force_undo = True
             reason_str = 'site busy or offline or with too many preassigned tasks' if force_undo \
                             else 'task paused/terminated or without enough files to process'
             # parameters for undo, kinda ugly
             params_map = {
                 ':min_files_ready': min_files_ready,
                 ':min_files_remaining': min_files_remaining,
             }
             # undo preassign
             had_undo = False
             updated_tasks = []
             if DRY_RUN:
                 if force_undo:
                     updated_tasks = list(preassigned_tasks_cached)
                     n_tasks = len(updated_tasks)
                 else:
                     preassigned_tasks_list = []
                     preassigned_tasks_params_map = {}
                     for j, taskid in enumerate(preassigned_tasks_cached):
                         pt_param = ':pt_{0}'.format(j + 1)
                         preassigned_tasks_list.append(pt_param)
                         preassigned_tasks_params_map[pt_param] = taskid
                     if not preassigned_tasks_list:
                         continue
                     preassigned_tasks_params_str = ','.join(
                         preassigned_tasks_list)
                     dry_sql_query = (
                         "SELECT t.jediTaskID "
                         "FROM {jedi_schema}.JEDI_Tasks t "
                         "WHERE t.jediTaskID IN ({preassigned_tasks_params_str}) "
                         "AND t.site IS NOT NULL "
                         "AND NOT ( "
                         "t.status IN ('ready','running') "
                         "AND EXISTS ( "
                         "SELECT d.datasetID FROM {0}.JEDI_Datasets d "
                         "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                         "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                         ") "
                         ") ").format(jedi_schema=jedi_config.db.schemaJEDI,
                                      preassigned_tasks_params_str=
                                      preassigned_tasks_params_str)
                     res = self.taskBufferIF.querySQL(
                         dry_sql_query, preassigned_tasks_params_map)
                     n_tasks = 0 if res is None else len(res)
                     if n_tasks > 0:
                         updated_tasks = [x[0] for x in res]
                 # tmp_log.debug('[dry run] {} {} force={}'.format(key_name, str(updated_tasks), force_undo))
                 had_undo = True
                 if n_tasks > 0:
                     tmp_log.debug(
                         '[dry run] {key_name:<64} {n_tasks:>3} preassigned tasks would be undone ({reason_str}) '
                         .format(key_name=key_name,
                                 n_tasks=n_tasks,
                                 reason_str=reason_str))
             else:
                 updated_tasks = self.taskBufferIF.undoPreassignedTasks_JEDI(
                     preassigned_tasks_cached,
                     task_orig_attr_map=task_orig_attr_map,
                     params_map=params_map,
                     force=force_undo)
                 if updated_tasks is None:
                     # dbproxy method failed
                     tmp_log.error(
                         '{key_name:<64} failed to undo preassigned tasks (force={force_undo})'
                         .format(key_name=key_name, force_undo=force_undo))
                 else:
                     had_undo = True
                     n_tasks = len(updated_tasks)
                     if n_tasks > 0:
                         tmp_log.info(
                             '{key_name:<64} {n_tasks:>3} preassigned tasks undone ({reason_str}) : {updated_tasks} '
                             .format(key_name=key_name,
                                     n_tasks=str(n_tasks),
                                     reason_str=reason_str,
                                     updated_tasks=updated_tasks))
                         # Kibana log
                         for taskid in updated_tasks:
                             tmp_log.debug(
                                 '#ATM #KV jediTaskID={taskid} action=undo_preassign site={site} rtype={rtype} un-preassigned since {reason_str}'
                                 .format(taskid=taskid,
                                         site=site,
                                         rtype=resource_type,
                                         reason_str=reason_str))
             # update preassigned_tasks_map into cache
             if had_undo:
                 if force_undo:
                     del preassigned_tasks_map[key_name]
                 else:
                     tmp_tasks_set = set(preassigned_tasks_cached) - set(
                         updated_tasks)
                     if not tmp_tasks_set:
                         del preassigned_tasks_map[key_name]
                     else:
                         preassigned_tasks_map[key_name] = list(
                             tmp_tasks_set)
                 self._update_to_pt_cache(preassigned_tasks_map)
             # update blacklisted_tasks_map into cache
             if had_undo and not force_undo:
                 blacklisted_tasks_map_orig = self._get_from_bt_cache()
                 blacklisted_tasks_map = copy.deepcopy(
                     blacklisted_tasks_map_orig)
                 now_time = datetime.datetime.utcnow()
                 now_rounded_ts = int(
                     now_time.replace(minute=0, second=0,
                                      microsecond=0).timestamp())
                 ts_str = str(now_rounded_ts)
                 if ts_str in blacklisted_tasks_map_orig:
                     tmp_bt_list = blacklisted_tasks_map[ts_str]
                     blacklisted_tasks_map[ts_str] = list(
                         set(tmp_bt_list) | set(updated_tasks))
                 else:
                     blacklisted_tasks_map[ts_str] = list(updated_tasks)
                 self._update_to_bt_cache(blacklisted_tasks_map)
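
The blacklist maintained by undo_preassign is keyed by an hour-rounded epoch timestamp, so expiring old entries only means dropping buckets older than the retention window. A minimal sketch of that cleanup, with the cache access replaced by a plain dict argument (an assumption for illustration):

import copy
import datetime

def prune_blacklist(blacklist_map, retention_hours=12):
    # keys are str(epoch_seconds) rounded to the hour; values are lists of jediTaskIDs
    min_allowed_ts = int(
        (datetime.datetime.utcnow()
         - datetime.timedelta(hours=retention_hours)).timestamp())
    pruned = copy.deepcopy(blacklist_map)
    for ts_str in blacklist_map:
        if int(ts_str) < min_allowed_ts:
            del pruned[ts_str]
    return pruned
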