Python MsgWrapper.info примеры использования

Язык программирования: Python

Пространство имен/Пакет: pandajedi.jedicore.MsgWrapper

Класс/Тип: MsgWrapper

Метод/Функция: info

Примеров на hotexamples.com: 51

Python MsgWrapper.info - 51 примеров найдено. Это лучшие примеры Python кода для pandajedi.jedicore.MsgWrapper.MsgWrapper.info, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

MsgWrapper(30)

debug(30)

error(30)

info(23)

sendMsg(8)

uploadLog(7)

warning(3)

Пример #1

Показать файл

 def doCleanDataLocality(self):
     tmpLog = MsgWrapper(logger, ' #ATM #KV doCleanDataLocality')
     tmpLog.debug('start')
     try:
         # lock
         got_lock = self.taskBufferIF.lockProcess_JEDI(  vo=self.vo, prodSourceLabel='default',
                                                         cloud=None, workqueue_id=None, resource_name=None,
                                                         component='AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality',
                                                         pid=self.pid, timeLimit=1440)
         if not got_lock:
             tmpLog.debug('locked by another process. Skipped')
             return
         tmpLog.debug('got lock')
         # lifetime of records
         record_lifetime_hours = 24
         # run
         now_timestamp = datetime.datetime.utcnow()
         before_timestamp = now_timestamp - datetime.timedelta(hours=record_lifetime_hours)
         n_rows = self.taskBufferIF.deleteOutdatedDatasetLocality_JEDI(before_timestamp)
         tmpLog.info('cleaned up {0} records'.format(n_rows))
         # done
         tmpLog.debug('done')
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))

Пример #2

Показать файл

 def registerDatasetSubscription(self,datasetName,location,activity=None,ignoreUnknown=False):
     methodName = 'registerDatasetSubscription'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     isOK = True
     try:
         # get DQ2 API            
         dq2 = DQ2()
         # call
         dq2.registerDatasetSubscription(datasetName,location,activity=activity)
     except DQSubscriptionExistsException:
         pass
     except DQUnknownDatasetException:
         if ignoreUnknown:
             pass
         else:
             isOK = False
     except:
         isOK = False
     if not isOK:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True

Пример #3

Показать файл

 def freezeDataset(self,datasetName,ignoreUnknown=False):
     methodName = 'freezeDataset'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     isOK = True
     try:
         # get DQ2 API            
         dq2=DQ2()
         # freeze
         dq2.freezeDataset(datasetName)
     except DQFrozenDatasetException:
         pass
     except DQUnknownDatasetException:
         if ignoreUnknown:
             pass
         else:
             isOK = False
     except:
         isOK = False
     if isOK:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,True
     else:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)

Пример #4

Показать файл

Файл: PostProcessor.py Проект: wguanicedew/panda-jedi

 def start(self):
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # prepare tasks to be finished
                     tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(vo,prodSourceLabel))
                     tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(vo,prodSourceLabel,
                                                                              jedi_config.postprocessor.nTasks,
                                                                              pid=self.pid)
                     if tmpRet == None:
                         # failed
                         tmpLog.error('failed to prepare tasks')
                     # get tasks to be finished
                     tmpLog.info('getting tasks to be finished') 
                     tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(vo,prodSourceLabel,self.pid,
                                                                           jedi_config.postprocessor.nTasks)
                     if tmpList == None: 
                         # failed
                         tmpLog.error('failed to get tasks to be finished')
                     else:
                         tmpLog.info('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # make workers
                         nWorker = jedi_config.postprocessor.nWorkers
                         for iWorker in range(nWorker):
                             thr = PostProcessorThread(taskList,threadPool,
                                                       self.taskBufferIF,
                                                       self.ddmIF,
                                                       self)
                             thr.start()
                         # join
                         threadPool.join()
             tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
         # sleep if needed
         loopCycle = 60
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)

Пример #5

Показать файл

Файл: WatchDog.py Проект: PanDAWMS/panda-jedi

 def start(self):
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # vo/prodSourceLabel specific action
                     impl = self.getImpl(vo,
                                         prodSourceLabel,
                                         subType=self.subStr)
                     if impl is not None:
                         plugin_name = impl.__class__.__name__
                         tmpLog.info(
                             'pre-action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         impl.pre_action(tmpLog, vo, prodSourceLabel,
                                         self.pid)
                         tmpLog.info(
                             'do action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         tmpStat = impl.doAction()
                         if tmpStat != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to run special action for vo={} label={} cls={}'
                                 .format(vo, prodSourceLabel, plugin_name))
                         else:
                             tmpLog.info(
                                 'done for vo={} label={} cls={}'.format(
                                     vo, prodSourceLabel, plugin_name))
             tmpLog.info('done')
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep if needed
         loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep(max_val=loopCycle)

Пример #6

Показать файл

Файл: PostProcessor.py Проект: wguanicedew/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for taskSpec in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 # get impl
                 impl = self.implFactory.instantiateImpl(taskSpec.vo,taskSpec.prodSourceLabel,None,
                                                         self.taskBufferIF,self.ddmIF)
                 if impl == None:
                     # post processor is undefined
                     tmpLog.error('post-processor is undefined for vo={0} sourceLabel={1}'.format(taskSpec.vo,taskSpec.prodSourceLabel))
                     tmpStat = Interaction.SC_FATAL
                 # execute    
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('post-process with {0}'.format(impl.__class__.__name__))
                     try:
                         impl.doPostProcess(taskSpec,tmpLog)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('doPostProcess failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FATAL
                 # done
                 if tmpStat == Interaction.SC_FATAL:
                     # task is broken
                     tmpErrStr = 'post-process failed'
                     tmpLog.error(tmpErrStr)
                     taskSpec.status = 'broken'
                     taskSpec.setErrDiag(tmpErrStr)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})    
                 elif tmpStat == Interaction.SC_FAILED:
                     tmpErrStr = 'post processing failed'
                     taskSpec.setOnHold()
                     taskSpec.setErrDiag(tmpErrStr,True)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
                     tmpLog.info('set task_status={0} since {1}'.format(taskSpec.status,taskSpec.errorDialog))
                     continue
                 # final procedure
                 try:
                     impl.doFinalProcedure(taskSpec,tmpLog)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('doFinalProcedure failed with {0}:{1}'.format(errtype.__name__,errvalue))
                 # done
                 tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #7

Показать файл

Файл: TaskBroker.py Проект: RRCKI/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks,idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(idxTasks,totalTasks,taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             tmpListToAssign = []
             for tmpTaskItem in taskList:
                 tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(None,None,None,None,None,
                                                                            simTasks=[tmpTaskItem],
                                                                            readMinFiles=True)
                 if tmpListItem == None:
                     # failed
                     tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(tmpTaskItem))
                     tmpStat = Interaction.SC_FAILED
                     break
                 tmpListToAssign += tmpListItem
             # get impl                    
             if tmpStat == Interaction.SC_SUCCEEDED:                    
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(self.vo,self.prodSourceLabel)
                     if impl == None:
                         # task refiner is undefined
                         tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo,self.prodSourceLabel))
                         tmpStat = Interaction.SC_FAILED
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     tmpStat = Interaction.SC_FAILED
             # brokerage
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('brokerage with {0} for {1} tasks '.format(impl.__class__.__name__,len(tmpListToAssign)))
                 try:
                     tmpStat = impl.doBrokerage(tmpListToAssign,self.vo,
                                                self.prodSourceLabel,self.workQueue)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('doBrokerage failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     tmpStat = Interaction.SC_FAILED
             # register
             if tmpStat != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed')
             else:
                 tmpLog.info('done')                    
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #8

Показать файл

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks,idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info('start TaskCheckerThread {0}/{1} for jediTaskID={2}'.format(idxTasks,totalTasks,taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             taskSpecList = []
             for jediTaskID in taskList:
                 tmpRet,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False)
                 if tmpRet and taskSpec != None:
                     taskSpecList.append(taskSpec)
                 else:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
             if taskSpecList != []:
                 # get impl                    
                 if tmpStat == Interaction.SC_SUCCEEDED:                    
                     tmpLog.info('getting Impl')
                     try:
                         impl = self.implFactory.getImpl(self.vo,self.prodSourceLabel)
                         if impl == None:
                             # task brokerage is undefined
                             tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo,self.prodSourceLabel))
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # check
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('brokerage with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat,taskCloudMap = impl.doCheck(taskSpecList)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('doCheck failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # update
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to check assignment')
                 else:
                     tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
                     tmpLog.info('done with {0} for {1}'.format(tmpRet,str(taskCloudMap)))
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #9

Показать файл

 def finger(self,userName):
     methodName = 'finger'
     methodName = '{0} userName={1}'.format(methodName,userName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # cleanup DN
         userName = parse_dn(userName)
         # exec
         tmpRet = infoClient().finger(userName)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0}:{1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,tmpRet

Пример #10

Показать файл

 def setDatasetOwner(self,datasetName,userName):
     methodName = 'setDatasetOwner'
     methodName = '{0} datasetName={1} userName={2}'.format(methodName,datasetName,userName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # cleanup DN
         userName = parse_dn(userName)
         # get DQ2 API            
         dq2=DQ2()
         # set
         dq2.setMetaDataAttribute(datasetName,'owner',userName)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True

Пример #11

Показать файл

 def registerDatasetLocation(self,datasetName,location,lifetime=None,owner=None):
     methodName = 'registerDatasetLocation'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # cleanup DN
         owner = parse_dn(owner)
         # get DQ2 API            
         dq2 = DQ2()
         # set
         dq2.registerDatasetLocation(datasetName,location,lifetime=lifetime)
         dq2.setReplicaMetaDataAttribute(datasetName,location,'owner',owner)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True

Пример #12

Показать файл

 def setDatasetMetadata(self,datasetName,metadataName,metadaValue):
     methodName = 'setDatasetMetadata'
     methodName = '{0} datasetName={1} metadataName={2} metadaValue={3}'.format(methodName,datasetName,
                                                                                metadataName,metadaValue)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # get DQ2 API            
         dq2 = DQ2()
         # set
         dq2.setMetaDataAttribute(datasetName,metadataName,metadaValue)
     except DQUnknownDatasetException:
         pass
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True

Пример #13

Показать файл

 def expandContainer(self,containerName):
     methodName = 'expandContainer'
     methodName = '{0} contName={1}'.format(methodName,containerName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         dsList = []
         # get real names
         tmpS,tmpRealNameList = self.listDatasets(containerName)
         if tmpS != self.SC_SUCCEEDED:
             tmpLog.error('failed to get real names')
             return tmpS,tmpRealNameList
         # loop over all names
         for tmpRealName in tmpRealNameList:
             # container
             if tmpRealName.endswith('/'):
                 # get contents
                 tmpS,tmpO = self.listDatasetsInContainer(tmpRealName)
                 if tmpS != self.SC_SUCCEEDED:
                     tmpLog.error('failed to get datasets in {0}'.format(tmpRealName))
                     return tmpS,tmpO
             else:
                 tmpO = [tmpRealName]
             # collect dataset names
             for tmpStr in tmpO:
                 if not tmpStr in dsList:
                     dsList.append(tmpStr)
         dsList.sort()        
         # return
         tmpLog.info('got {0}'.format(str(dsList)))
         return self.SC_SUCCEEDED,dsList
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error('failed with {0}'.format(errMsg))
         return errCode,'{0} : {1}'.format(methodName,errMsg)

Пример #14

Показать файл

 def deleteDataset(self,datasetName,emptyOnly,ignoreUnknown=False):
     methodName = 'deleteDataset'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     isOK = True
     retStr = ''
     nFiles = -1
     try:
         # get DQ2 API            
         dq2=DQ2()
         # get the number of files
         if emptyOnly:
             nFiles = dq2.getNumberOfFiles(datasetName)
         # erase
         if not emptyOnly or nFiles == 0:
             dq2.eraseDataset(datasetName)
             retStr = 'deleted {0}'.format(datasetName)
         else:
             retStr = 'keep {0} where {1} files are available'.format(datasetName,nFiles)
     except DQUnknownDatasetException:
         if ignoreUnknown:
             pass
         else:
             isOK = False
     except:
         isOK = False
     if isOK:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,retStr
     else:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)

Пример #15

Показать файл

Файл: TaskRefiner.py Проект: pavlo-svirin/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     continue
                     tmpStat = Interaction.SC_FAILED
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initalize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                         # set parent tid
                         if not parent_tid in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if not parent_tid in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                             # appending for incremetnal execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #16

Показать файл

 def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
     # make logger
     tmpLog = MsgWrapper(logger,
                         "<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(
         taskSpec.prodSourceLabel, taskSpec.taskType))
     # returns
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output', 'log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(
                                         tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(
                 str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info(
                     'getting datasetSpec with datasetID={0}'.format(
                         datasetID))
                 tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                     taskSpec.jediTaskID, datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                 # check if dataset and container are available in DDM
                 for targetName in [
                         datasetSpec.datasetName, datasetSpec.containerName
                 ]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         if targetName.startswith('panda'):
                             if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                 lifetime = 365
                             else:
                                 lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['', None]:
                                     if DataServiceUtils.getDistributedDestination(
                                             datasetSpec.storageToken
                                     ) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken
                                     ) != None:
                                         location = DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(
                                             taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(
                                             tmpT1Name,
                                             datasetSpec.storageToken)
                                 else:
                                     tmpLog.info('site={0} token='.format(
                                         datasetSpec.site,
                                         datasetSpec.storageToken))
                                     location = siteMapper.getDdmEndpoint(
                                         datasetSpec.site,
                                         datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             if taskSpec.prodSourceLabel in [
                                     'managed', 'test'
                             ] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None, '']:
                                     metaData[
                                         'campaign'] = taskSpec.campaign
                                 if datasetSpec.getTransient() != None:
                                     metaData[
                                         'transient'] = datasetSpec.getTransient(
                                         )
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info(
                                 'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                 .format(targetName, location, ddmBackEnd,
                                         lifetime, str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(
                                 targetName,
                                 backEnd=ddmBackEnd,
                                 location=location,
                                 lifetime=lifetime,
                                 metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error(
                                     'failed to register {0}'.format(
                                         targetName))
                                 return retFatal
                             # procedures for user
                             if userSetup or DataServiceUtils.getDistributedDestination(
                                     datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in [
                                         '', None
                                 ]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(
                                         datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(
                                         taskSpec.prodSourceLabel)
                                     tmpLog.info(
                                         'registering location={0} lifetime={1}days activity={2} grouping={3}'
                                         .format(locForRule, lifetime,
                                                 activity, grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(
                                         targetName,
                                         locForRule,
                                         owner=userName,
                                         lifetime=lifetime,
                                         backEnd=ddmBackEnd,
                                         activity=activity,
                                         grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error(
                                             'failed to register location {0} with {2} for {1}'
                                             .format(
                                                 locForRule, targetName,
                                                 ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(
                                 targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[
                             datasetSpec.
                             containerName] = ddmIF.listDatasetsInContainer(
                                 datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[
                             datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                         tmpStat = ddmIF.addDatasetsToContainer(
                             datasetSpec.containerName,
                             [datasetSpec.datasetName],
                             backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(
                                 datasetSpec.datasetName,
                                 datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(
                             datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(
                     datasetSpec, {
                         'jediTaskID': taskSpec.jediTaskID,
                         'datasetID': datasetID
                     })
                 # register ES datasets
                 if False:  # FIXME taskSpec.useEventService() and not taskSpec.useJobCloning() and datasetSpec.type == 'output':
                     targetName = datasetSpec.datasetName + EventServiceUtils.esSuffixDDM
                     location = None
                     metaData = {}
                     metaData['task_id'] = taskSpec.jediTaskID
                     metaData['hidden'] = True
                     tmpLog.info(
                         'registering ES dataset {0} with location={1} meta={2}'
                         .format(targetName, location, str(metaData)))
                     tmpStat = ddmIF.registerNewDataset(targetName,
                                                        location=location,
                                                        metaData=metaData)
                     if not tmpStat:
                         tmpLog.error(
                             'failed to register ES dataset {0}'.format(
                                 targetName))
                         return retFatal
                     # register rule
                     location = 'type=ES'
                     activity = DataServiceUtils.getActivityForOut(
                         taskSpec.prodSourceLabel)
                     grouping = 'NONE'
                     tmpLog.info(
                         'registering location={0} activity={1} grouping={2}'
                         .format(location, activity, grouping))
                     tmpStat = ddmIF.registerDatasetLocation(
                         targetName,
                         location,
                         activity=activity,
                         grouping=grouping)
                     if not tmpStat:
                         tmpLog.error(
                             'failed to register location {0} with {2} for {1}'
                             .format(location, targetName, activity))
                         return retFatal
         # open datasets
         if taskSpec.prodSourceLabel in ['managed', 'test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output', 'log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(
                                 tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset, 'lifetime', None)
         # return
         tmpLog.info('done')
         return retOK
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(
             errtype.__name__, errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal

Пример #17

Показать файл

Файл: ContentsFeeder.py Проект: lukewayne123/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID,dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 # get task
                 tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     taskBroken = True
                 # renaming of parameters
                 if taskParamMap.has_key('nEventsPerInputFile'):
                     taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key('nFilesPerJob'):
                     nFilesPerJob = taskParamMap['nFilesPerJob']
                 # the number of chunks used by scout 
                 nChunksForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     xmlConfig = taskParamMap['loadXML']
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == 'running':
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 checkedMaster = False
                 setFrozenTime = True
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                     origNumFiles = None
                     if taskParamMap.has_key('nFiles'):
                         origNumFiles = taskParamMap['nFiles']
                     for datasetSpec in dsList:
                         tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                         # get dataset metadata
                         tmpLog.info('get metadata')
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()                    
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {'state':'closed'}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata['state'] == 'open':
                                 # dummy metadata when parent is running
                                 tmpMetadata = {'state':'mutable'}
                             gotMetadata = True
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                         errtype.__name__,errvalue))
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = 'broken'
                                 taskBroken = True
                                 # update dataset status    
                                 self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                             # get the number of events in metadata
                             if taskParamMap.has_key('getNumEventsInMetadata'):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info('get files')
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                             not datasetSpec.containerName in ['',None]:
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                      getNumEvents=getNumEvents,
                                                                      skipDuplicate=skipDuplicate
                                                                      )
                                     tmpLog.info('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                         for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info('removed {0}'.format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                      'scope':None,
                                                                      'filesize':0,
                                                                      'checksum':None,
                                                                      }
                                                   }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key('nFiles'):
                                             nPFN = taskParamMap['nFiles']
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                          'scope':None,
                                                                          'filesize':0,
                                                                          'checksum':None,
                                                                          }
                             except:
                                 errtype,errvalue = sys.exc_info()[:2]
                                 tmpLog.error('failed to get files due to {0}:{1}'.format(self.__class__.__name__,
                                                                                              errtype.__name__,errvalue))
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = 'broken'
                                     taskBroken = True
                                     # update dataset status    
                                     self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile  = None
                                 nEventsPerJob   = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key('nEventsPerFile')) or \
                                         (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents')):
                                     if taskParamMap.has_key('nEventsPerFile'):
                                         nEventsPerFile = taskParamMap['nEventsPerFile']
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap['nEvents']
                                     if taskParamMap.has_key('nEventsPerJob'):
                                         nEventsPerJob = taskParamMap['nEventsPerJob']
                                     elif taskParamMap.has_key('nEventsPerRange'):
                                         nEventsPerRange = taskParamMap['nEventsPerRange']
                                 # max attempts
                                 maxAttempt = None
                                 if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                     # max attempts 
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry 
                                         maxAttempt = 1
                                     elif taskParamMap.has_key('maxAttempt'):
                                         maxAttempt = taskParamMap['maxAttempt']
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                 # first event number
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None 
                                 if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                     nMaxEvents = taskParamMap['nEvents']
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if taskParamMap.has_key('nFiles'):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap['nFiles']
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multipled by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                             if taskParamMap.has_key('nEventsPerJob'):
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key('nEventsPerRange'):
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False    
                                 if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                     useScout = True
                                 # use files with new attempt numbers    
                                 useFilesWithNewAttemptNr = False
                                 if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info('update contents')
                                 retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                           tmpMetadata['state'],
                                                                                                                           stateUpdateTime,
                                                                                                                           nEventsPerFile,
                                                                                                                           nEventsPerJob,
                                                                                                                           maxAttempt,
                                                                                                                           firstEventNumber,
                                                                                                                           nMaxFiles,
                                                                                                                           nMaxEvents,
                                                                                                                           useScout,
                                                                                                                           fileList,
                                                                                                                           useFilesWithNewAttemptNr,
                                                                                                                           nFilesPerJob,
                                                                                                                           nEventsPerRange,
                                                                                                                           nChunksForScout,
                                                                                                                           includePatt,
                                                                                                                           excludePatt,
                                                                                                                           xmlConfig,
                                                                                                                           noWaitParent,
                                                                                                                           taskSpec.parent_tid,
                                                                                                                           self.pid)
                                 if retDB == False:
                                     taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                      diagMap['errMsg']))
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                     tmpLog.info('escape since task or dataset is locked')
                                     break
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                            'missingFiles':missingFileList} 
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key('nFiles'):
                                         if datasetSpec.isMaster():
                                             taskParamMap['nFiles'] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nChunksForScout = diagMap['nChunksForScout']
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         checkedMaster = True
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap['isRunningTask']:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0):
                                     tmpErrStr = 'insufficient inputs are ready'
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                                     setFrozenTime = False
                                     break
                         tmpLog.info('end loop')
                 # no mater input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                     tmpErrStr = 'no master input files. input dataset is empty'
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr,None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = 'tobroken'
                     tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo,taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                    self.ddmIF.getInterface(taskSpec.vo))
                             if not tmpStat:
                                 tmpErrStr = 'failed to initialize TaskGenerator'
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = 'tobroken'
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info('make parent tasks with {0} (if necessary)'.format(taskGenerator.getClassName(taskSpec.vo,
                                                                                                                           taskSpec.prodSourceLabel)))
                                 tmpStat = taskGenerator.doGenerate(taskSpec,taskParamMap,missingFilesMap=missingMap)
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = 'tobroken'
                                     tmpLog.error('failed to make parent tasks')
                         # go to pending state
                         if not taskSpec.status in ['broken','tobroken']:
                             taskSpec.setOnHold()
                         tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                     elif allUpdated:
                         # all OK
                         allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,
                                                                                                    pid=self.pid)
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                 tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #18

Показать файл

    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        workQueueName = workQueue.queue_name

        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit +
                                                  nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs,
                   nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt,
                    nRunning_rt, totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and  nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nNotRun_rt + nDefine_rt, nRunning_rt, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(
                    nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                    nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                    nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                    nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader + " not enough jobs queued")
                if not workQueue.queue_name in non_rt_wqs:
                    self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled

Пример #19

Показать файл

 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                     'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei(
                         )
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(
                                     tmpDatasetSpec.storageToken
                             ) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP[
                                 'space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(
                                     self.fullRW[tmpNucleus] / 10 / 24 /
                                     3600 * 0.25)
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check {1} TB'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName
                         ) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                         if taskParamMap.get(
                                 'taskBrokerOnMaster'
                         ) is True and not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                         if datasetSpec.isMaster(
                         ) and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if taskSpec.inputPreStaging(
                             ) and availableData[tmpNucleus][
                                     'ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_size_any'],
                                     availableData[tmpNucleus]['tot_size'],
                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_num_any'],
                                     availableData[tmpNucleus]['tot_num'],
                                     thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         totInputSize = list(availableData.values(
                         ))[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[
                                 tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(
                                 nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(
                                 nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]
                                                 ['ava_size_any'])
                                 weight /= float(
                                     availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                     availableData[tmpNucleus]
                                     ['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(
                                     availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus][
                                         'ava_size_any'] > availableData[
                                             tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(
                                         negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)

Пример #20

Показать файл

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.error(errStr)
                     tmpStat = Interaction.SC_FAILED
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')                    
                     try:
                         # initalize impl
                         impl.initializeRefiner(tmpLog)
                         # extarct common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if not parent_tid in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # no wait for parent
                         if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             errStr = 'pending until parent produces input'
                             tmpLog.info(errStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             continue
                         else:
                             errStr = 'failed to refine task'
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID})
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = 'tobroken'
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:        
                             # appending for incremetnal execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #21

Показать файл

 def undo_preassign(self):
     tmp_log = MsgWrapper(logger, 'undo_preassign')
     # refresh
     self.refresh()
     # busy sites
     busy_sites_dict = self.get_busy_sites()
     # loop to undo preassignment
     for prod_source_label in self.prodSourceLabelList:
         # parameter from GDP config
         max_preassigned_tasks = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if max_preassigned_tasks is None:
             max_preassigned_tasks = 3
         min_files_ready = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_ready is None:
             min_files_ready = 50
         min_files_remaining = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_remaining is None:
             min_files_remaining = 100
         # clean up outdated blacklist
         blacklist_duration_hours = 12
         blacklisted_tasks_map_orig = self._get_from_bt_cache()
         blacklisted_tasks_map = copy.deepcopy(blacklisted_tasks_map_orig)
         now_time = datetime.datetime.utcnow()
         min_allowed_time = now_time - datetime.timedelta(
             hours=blacklist_duration_hours)
         min_allowed_ts = int(min_allowed_time.timestamp())
         for ts_str in blacklisted_tasks_map_orig:
             ts = int(ts_str)
             if ts < min_allowed_ts:
                 del blacklisted_tasks_map[ts_str]
         self._update_to_bt_cache(blacklisted_tasks_map)
         n_bt_old = sum([
             len(bt_list)
             for bt_list in blacklisted_tasks_map_orig.values()
         ])
         n_bt = sum(
             [len(bt_list) for bt_list in blacklisted_tasks_map.values()])
         tmp_log.debug(
             'done cleanup blacklist; before {n_bt_old} , now {n_bt} tasks in blacklist'
             .format(n_bt_old=n_bt_old, n_bt=n_bt))
         # get a copy of preassigned_tasks_map from cache
         preassigned_tasks_map_orig = self._get_from_pt_cache()
         preassigned_tasks_map = copy.deepcopy(preassigned_tasks_map_orig)
         # clean up task_orig_attr_map in cache
         task_orig_attr_map_orig = self._get_from_attr_cache()
         task_orig_attr_map = copy.deepcopy(task_orig_attr_map_orig)
         all_preassiged_taskids = set()
         for taskid_list in preassigned_tasks_map_orig.values():
             all_preassiged_taskids |= set(taskid_list)
         for taskid_str in task_orig_attr_map_orig:
             taskid = int(taskid_str)
             if taskid not in all_preassiged_taskids:
                 del task_orig_attr_map[taskid_str]
         self._update_to_attr_cache(task_orig_attr_map)
         # loop on preassigned tasks in cache
         for key_name in preassigned_tasks_map_orig:
             # parse key name = site + resource_type
             site, resource_type = key_name.split('|')
             # preassigned tasks in cache
             preassigned_tasks_cached = preassigned_tasks_map.get(
                 key_name, [])
             # force_undo=True for all tasks in busy sites, and force_undo=False for tasks not in status to generate jobs
             force_undo = False
             if site in busy_sites_dict or len(
                     preassigned_tasks_cached) > max_preassigned_tasks:
                 force_undo = True
             reason_str = 'site busy or offline or with too many preassigned tasks' if force_undo \
                             else 'task paused/terminated or without enough files to process'
             # parameters for undo, kinda ugly
             params_map = {
                 ':min_files_ready': min_files_ready,
                 ':min_files_remaining': min_files_remaining,
             }
             # undo preassign
             had_undo = False
             updated_tasks = []
             if DRY_RUN:
                 if force_undo:
                     updated_tasks = list(preassigned_tasks_cached)
                     n_tasks = len(updated_tasks)
                 else:
                     preassigned_tasks_list = []
                     preassigned_tasks_params_map = {}
                     for j, taskid in enumerate(preassigned_tasks_cached):
                         pt_param = ':pt_{0}'.format(j + 1)
                         preassigned_tasks_list.append(pt_param)
                         preassigned_tasks_params_map[pt_param] = taskid
                     if not preassigned_tasks_list:
                         continue
                     preassigned_tasks_params_str = ','.join(
                         preassigned_tasks_list)
                     dry_sql_query = (
                         "SELECT t.jediTaskID "
                         "FROM {jedi_schema}.JEDI_Tasks t "
                         "WHERE t.jediTaskID IN ({preassigned_tasks_params_str}) "
                         "AND t.site IS NOT NULL "
                         "AND NOT ( "
                         "t.status IN ('ready','running') "
                         "AND EXISTS ( "
                         "SELECT d.datasetID FROM {0}.JEDI_Datasets d "
                         "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                         "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                         ") "
                         ") ").format(jedi_schema=jedi_config.db.schemaJEDI,
                                      preassigned_tasks_params_str=
                                      preassigned_tasks_params_str)
                     res = self.taskBufferIF.querySQL(
                         dry_sql_query, preassigned_tasks_params_map)
                     n_tasks = 0 if res is None else len(res)
                     if n_tasks > 0:
                         updated_tasks = [x[0] for x in res]
                 # tmp_log.debug('[dry run] {} {} force={}'.format(key_name, str(updated_tasks), force_undo))
                 had_undo = True
                 if n_tasks > 0:
                     tmp_log.debug(
                         '[dry run] {key_name:<64} {n_tasks:>3} preassigned tasks would be undone ({reason_str}) '
                         .format(key_name=key_name,
                                 n_tasks=n_tasks,
                                 reason_str=reason_str))
             else:
                 updated_tasks = self.taskBufferIF.undoPreassignedTasks_JEDI(
                     preassigned_tasks_cached,
                     task_orig_attr_map=task_orig_attr_map,
                     params_map=params_map,
                     force=force_undo)
                 if updated_tasks is None:
                     # dbproxy method failed
                     tmp_log.error(
                         '{key_name:<64} failed to undo preassigned tasks (force={force_undo})'
                         .format(key_name=key_name, force_undo=force_undo))
                 else:
                     had_undo = True
                     n_tasks = len(updated_tasks)
                     if n_tasks > 0:
                         tmp_log.info(
                             '{key_name:<64} {n_tasks:>3} preassigned tasks undone ({reason_str}) : {updated_tasks} '
                             .format(key_name=key_name,
                                     n_tasks=str(n_tasks),
                                     reason_str=reason_str,
                                     updated_tasks=updated_tasks))
                         # Kibana log
                         for taskid in updated_tasks:
                             tmp_log.debug(
                                 '#ATM #KV jediTaskID={taskid} action=undo_preassign site={site} rtype={rtype} un-preassinged since {reason_str}'
                                 .format(taskid=taskid,
                                         site=site,
                                         rtype=resource_type,
                                         reason_str=reason_str))
             # update preassigned_tasks_map into cache
             if had_undo:
                 if force_undo:
                     del preassigned_tasks_map[key_name]
                 else:
                     tmp_tasks_set = set(preassigned_tasks_cached) - set(
                         updated_tasks)
                     if not tmp_tasks_set:
                         del preassigned_tasks_map[key_name]
                     else:
                         preassigned_tasks_map[key_name] = list(
                             tmp_tasks_set)
                 self._update_to_pt_cache(preassigned_tasks_map)
             # update blacklisted_tasks_map into cache
             if had_undo and not force_undo:
                 blacklisted_tasks_map_orig = self._get_from_bt_cache()
                 blacklisted_tasks_map = copy.deepcopy(
                     blacklisted_tasks_map_orig)
                 now_time = datetime.datetime.utcnow()
                 now_rounded_ts = int(
                     now_time.replace(minute=0, second=0,
                                      microsecond=0).timestamp())
                 ts_str = str(now_rounded_ts)
                 if ts_str in blacklisted_tasks_map_orig:
                     tmp_bt_list = blacklisted_tasks_map[ts_str]
                     blacklisted_tasks_map[ts_str] = list(
                         set(tmp_bt_list) | set(updated_tasks))
                 else:
                     blacklisted_tasks_map[ts_str] = list(updated_tasks)
                 self._update_to_bt_cache(blacklisted_tasks_map)

Пример #22

Показать файл

 def doSetup(self,taskSpec,datasetToRegister):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         if datasetToRegister != []:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
             # get DDM I/F
             ddmIF = self.ddmIF.getInterface(taskSpec.vo)
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # register dataset/container
                             tmpLog.info('registering {0}'.format(targetName))
                             tmpStat = ddmIF.registerNewDataset(targetName)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             if userSetup:
                                 # set owner
                                 tmpLog.info('setting owner={0}'.format(taskSpec.userName))
                                 tmpStat = ddmIF.setDatasetOwner(targetName,taskSpec.userName)
                                 if not tmpStat:
                                     tmpLog.error('failed to set ownership {0} with {1}'.format(targetName,
                                                                                                taskSpec.userName))
                                     return retFatal
                                 # register location
                                 if targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]: 
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                                     tmpLog.info('registring location={0}'.format(location))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,location,owner=taskSpec.userName)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} for {1}'.format(location,
                                                                                                       targetName))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName])
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # return
         tmpLog.info('done')        
         return retOK
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal

Пример #23

Показать файл

 def getAvailableFiles(self,datasetSpec,siteEndPointMap,siteMapper,ngGroup=[],checkLFC=False):
     # make logger
     methodName = 'getAvailableFiles'
     methodName += ' <datasetID={0}>'.format(datasetSpec.datasetID)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start datasetName={0}'.format(datasetSpec.datasetName))
     try:
         # list of NG endpoints
         ngEndPoints = []
         if 1 in ngGroup:
             ngEndPoints += ['_SCRATCHDISK$','_LOCALGROUPDISK$','_LOCALGROUPTAPE$','_USERDISK$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
         if 2 in ngGroup:
             ngEndPoints += ['_LOCALGROUPTAPE$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
         # get all associated endpoints
         siteAllEndPointsMap = {}
         for siteName,endPointPattList in siteEndPointMap.iteritems():
             # get all endpoints matching with patterns 
             allEndPointList = []
             for endPointPatt in endPointPattList:
                 if '*' in endPointPatt:
                     # wildcard
                     endPointPatt = endPointPatt.replace('*','.*')
                     for endPointToA in TiersOfATLAS.getAllDestinationSites():
                         if re.search('^'+endPointPatt+'$',endPointToA) != None:
                             if not endPointToA in allEndPointList:
                                 allEndPointList.append(endPointToA)
                 else:
                     # normal endpoint
                     if endPointPatt in TiersOfATLAS.getAllDestinationSites() and \
                            not endPointPatt in allEndPointList:
                         allEndPointList.append(endPointPatt)
             # get associated endpoints
             siteAllEndPointsMap[siteName] = []
             for endPoint in allEndPointList:
                 # append
                 if not self.checkNGEndPoint(endPoint,ngEndPoints) and \
                         not endPoint in siteAllEndPointsMap[siteName]:
                     siteAllEndPointsMap[siteName].append(endPoint)
                 else:
                     # already checked
                     continue
                 # get alternate name
                 altName = TiersOfATLAS.getSiteProperty(endPoint,'alternateName')
                 if altName != None and altName != ['']:
                     for assEndPoint in TiersOfATLAS.resolveGOC({altName[0]:None})[altName[0]]:
                         if not assEndPoint in siteAllEndPointsMap[siteName] and \
                                not self.checkNGEndPoint(assEndPoint,ngEndPoints):
                             siteAllEndPointsMap[siteName].append(assEndPoint)
         # get replica map
         tmpStat,tmpOut = self.listDatasetReplicas(datasetSpec.datasetName)
         if tmpStat != self.SC_SUCCEEDED:
             tmpLog.error('faild to get dataset replicas with {0}'.format(tmpOut))
             raise tmpStat,tmpOut
         datasetReplicaMap = tmpOut
         # collect SE, LFC hosts, storage path, storage type
         lfcSeMap = {}
         storagePathMap = {}
         completeReplicaMap = {}
         siteHasCompleteReplica = False
         for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
             tmpLfcSeMap = {}
             tmpStoragePathMap = {}
             tmpSiteSpec = siteMapper.getSite(siteName)
             for tmpEndPoint in allEndPointList:
                 # storage type
                 if TiersOfATLAS.isTapeSite(tmpEndPoint):
                     storageType = 'localtape'
                 else:
                     storageType = 'localdisk'
                 # no scan when site has complete replicas
                 if datasetReplicaMap.has_key(tmpEndPoint) and datasetReplicaMap[tmpEndPoint][-1]['found'] != None \
                    and datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                     completeReplicaMap[tmpEndPoint] = storageType
                     siteHasCompleteReplica = True
                 # no LFC scan for many-time datasets
                 if datasetSpec.isManyTime():
                     continue
                 # get LFC
                 lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
                 # add map
                 if not tmpLfcSeMap.has_key(lfc):
                     tmpLfcSeMap[lfc] = []
                 # get SE
                 seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                 tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
                 if tmpMatch != None:
                     se = tmpMatch.group(1)
                     if not se in tmpLfcSeMap[lfc]:
                         tmpLfcSeMap[lfc].append(se)
                 else:
                     tmpLog.error('faild to extract SE from %s for %s:%s' % \
                                  (seStr,siteName,tmpEndPoint))
                 # get SE + path
                 seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                 tmpMatch = re.search('(srm://.+)$',seStr)
                 if tmpMatch == None:
                     tmpLog.error('faild to extract SE+PATH from %s for %s:%s' % \
                                  (seStr,siteName,tmpEndPoint))
                     continue
                 # add full path to storage map
                 tmpSePath = tmpMatch.group(1)
                 tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
                 # add compact path
                 tmpSePath = re.sub('(:\d+)*/srm/[^\?]+\?SFN=','',tmpSePath)
                 tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
             # add to map to trigger LFC scan if complete replica is missing at the site
             if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                 pass
             elif not siteHasCompleteReplica or checkLFC:
                 for tmpKey,tmpVal in tmpLfcSeMap.iteritems():
                     if not lfcSeMap.has_key(tmpKey):
                         lfcSeMap[tmpKey] = []
                     lfcSeMap[tmpKey] += tmpVal
                 for tmpKey,tmpVal in tmpStoragePathMap.iteritems():
                     storagePathMap[tmpKey] = tmpVal
         # collect GUIDs and LFNs
         fileMap        = {}
         lfnMap         = {}
         lfnFileSepcMap = {}
         scopeMap       = {}
         for tmpFile in datasetSpec.Files:
             fileMap[tmpFile.GUID] = tmpFile.lfn
             lfnMap[tmpFile.lfn] = tmpFile
             lfnFileSepcMap[tmpFile.lfn] = tmpFile
             scopeMap[tmpFile.lfn] = tmpFile.scope
         # get SURLs
         surlMap = {}
         for lfcHost,seList in lfcSeMap.iteritems():
             tmpLog.debug('lookup in LFC:{0} for {1}'.format(lfcHost,str(seList)))               
             tmpStat,tmpRetMap = self.getSURLsFromLFC(fileMap,lfcHost,seList,scopes=scopeMap)
             tmpLog.debug(str(tmpStat))
             if tmpStat != self.SC_SUCCEEDED:
                 raise RuntimeError,tmpRetMap
             for lfn,surls in tmpRetMap.iteritems():
                 if not surlMap.has_key(lfn):
                     surlMap[lfn] = surls
                 else:
                     surlMap[lfn] += surls
         # make return
         returnMap = {}
         for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
             # set default return values
             if not returnMap.has_key(siteName):
                 returnMap[siteName] = {'localdisk':[],'localtape':[],'cache':[],'remote':[]}
             # loop over all files    
             tmpSiteSpec = siteMapper.getSite(siteName)                
             # check if the file is cached
             if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                 for tmpFileSpec in datasetSpec.Files:
                     # add to cached file list
                     returnMap[siteName]['cache'].append(tmpFileSpec)
             # complete replicas
             if not checkLFC:        
                 for tmpEndPoint in allEndPointList:
                     if completeReplicaMap.has_key(tmpEndPoint):
                         storageType = completeReplicaMap[tmpEndPoint]
                         returnMap[siteName][storageType] += datasetSpec.Files
         # loop over all available LFNs
         avaLFNs = surlMap.keys()
         avaLFNs.sort()
         for tmpLFN in avaLFNs:
             tmpFileSpec = lfnFileSepcMap[tmpLFN]                
             # loop over all SURLs
             for tmpSURL in surlMap[tmpLFN]:
                 for tmpSePath in storagePathMap.keys():
                     # check SURL
                     if tmpSURL.startswith(tmpSePath):
                         # add
                         siteName = storagePathMap[tmpSePath]['siteName']
                         storageType = storagePathMap[tmpSePath]['storageType']
                         if not tmpFileSpec in returnMap[siteName][storageType]:
                             returnMap[siteName][storageType].append(tmpFileSpec)
                         break
         # dump
         dumpStr = ''
         for siteName,storageTypeFile in returnMap.iteritems():
             dumpStr += '{0}:('.format(siteName)
             for storageType,fileList in storageTypeFile.iteritems():
                 dumpStr += '{0}:{1},'.format(storageType,len(fileList))
             dumpStr = dumpStr[:-1]
             dumpStr += ') '
         dumpStr= dumpStr[:-1]
         tmpLog.debug(dumpStr)
         # return
         tmpLog.info('done')            
         return self.SC_SUCCEEDED,returnMap
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return self.SC_FAILED,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)

Пример #24

Показать файл

Файл: TaskCommando.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID, commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill', 'finish', 'reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                 jediTaskID)
                         elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(
                                 jediTaskID, True)
                         if pandaIDs is None:
                             tmpLog.error(
                                 'failed to get PandaIDs for jediTaskID={0}'
                                 .format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr is not None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[
                                                 1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(
                                             tmpItems[0], tmpItems[1])
                                         tmpLog.sendMsg(
                                             tmpMsg, self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate(
                                                 'oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info(
                                         'updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(
                                         jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info(
                                             'wait until datasets are updated to finish'
                                         )
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                                         jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap(
                                     )[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(
                                     tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(
                                     tmpTaskSpec,
                                     {'jediTaskID': jediTaskID},
                                     setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(
                                     str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 if commentStr and 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                         jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(
                                         len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(
                                         queuedPandaIDs, commentStr, '52',
                                         True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg, self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '52',
                                             True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '51',
                                             True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '50',
                                             True)
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                 elif commandStr in ['retry', 'incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                                 jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(
                                 taskParam)
                             # remove some params
                             for newKey in ['nFiles', 'fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except Exception:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(
                                 commentStr)
                             # change params
                             for newKey, newVal in iteritems(newParamMap):
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap[
                                         'jobParameters']:
                                     if tmpParam[
                                             'type'] == 'constant' and re.search(
                                                 '^-a [^ ]+$',
                                                 tmpParam['value']
                                             ) is not None:
                                         tmpParam['value'] = '-a {0}'.format(
                                             taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec'][
                                         'archiveName'] = taskParamMap[
                                             'fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(
                                 taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(
                                 jediTaskID, strTaskParams)
                             if tmpRet is not True:
                                 tmpLog.error(
                                     'failed to update task params')
                                 continue
                         except Exception as e:
                             tmpLog.error(
                                 'failed to change task params with {} {}'.
                                 format(str(e), traceback.format_exc()))
                             continue
                     # retry child tasks
                     if 'sole ' in commentStr:
                         retryChildTasks = False
                     else:
                         retryChildTasks = True
                     # discard events
                     if 'discard ' in commentStr:
                         discardEvents = True
                     else:
                         discardEvents = False
                     # release un-staged files
                     if 'staged ' in commentStr:
                         releaseUnstaged = True
                     else:
                         releaseUnstaged = False
                     tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(
                         jediTaskID,
                         commandStr,
                         retryChildTasks=retryChildTasks,
                         discardEvents=discardEvents,
                         release_unstaged=releaseUnstaged)
                     if tmpRet is True:
                         tmpMsg = 'set task_status={0}'.format(
                             newTaskStatus)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception as e:
             errStr = '{} failed in runImpl() with {} {} '.format(
                 self.__class__.__name__, str(e), traceback.format_exc())
             logger.error(errStr)

Пример #25

Показать файл

Файл: AtlasProdTaskBroker.py Проект: RRCKI/panda-jedi

 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                      'jedi', 'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024*1024*1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize,
                                                                                                                 thrInputNum,
                                                                                                                 thrInputSizeFrac,
                                                                                                                 thrInputNumFrac))
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.info('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info('skip transfer backlog check due to negative T1Weight')
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                         for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info('  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                              format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.info('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                             if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                 tmpLog.info('  skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                                  tmpSpaceSize,
                                                                                                                                                                  tmpSpaceToUse,
                                                                                                                                                                  diskThreshold,
                                                                                                                                                                  tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if not tmpNucleus in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                             except:
                                 tmpNew = None
                             if tmpNew != None and (tmpOld == None or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                  'free':tmpSpaceSize-tmpSpaceToUse}
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # use deep scan for primary dataset
                         if datasetSpec.isMaster():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys(),
                                                                           deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if len(nucleusList) == 1:
                                 tmpLog.info('  disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                     availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                     availableData[tmpNucleus]['tot_size'],
                                                                                                                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                       availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                       availableData[tmpNucleus]['tot_num'],
                                                                                                                                       thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         else:
                             tmpLog.info('  disable data locality check since no nucleus has input data')
                         tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                 weight /= float(availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                             # fraction of free space
                             if tmpNucleus in fractionFreeSpace:
                                 try:
                                     tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                         float(fractionFreeSpace[tmpNucleus]['total'])
                                     weight *= tmpFrac
                                     wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                          fractionFreeSpace[tmpNucleus]['total'])
                                 except:
                                     pass
                         tmpLog.info('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task.status=ready'
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)

Пример #26

Показать файл

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug("%s terminating since no more items" % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID, dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID))
                 # get task
                 tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error(
                         "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue)
                     )
                     taskBroken = True
                 # renaming of parameters
                 if taskParamMap.has_key("nEventsPerInputFile"):
                     taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"]
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key("nFilesPerJob"):
                     nFilesPerJob = taskParamMap["nFilesPerJob"]
                 # the number of files used by scout
                 nFilesForScout = 0
                 if nFilesPerJob != None:
                     nFilesForScout = 10 * nFilesPerJob
                 else:
                     nFilesForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     try:
                         loadXML = taskParamMap["loadXML"]
                         xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML)
                     except:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue))
                         taskBroken = True
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == "running":
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                     origNumFiles = None
                     if taskParamMap.has_key("nFiles"):
                         origNumFiles = taskParamMap["nFiles"]
                     for datasetSpec in dsList:
                         tmpLog.info(
                             "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID)
                         )
                         # get dataset metadata
                         tmpLog.info("get metadata")
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {"state": "closed"}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata["state"] == "open":
                                 # dummy metadata when parent is running
                                 tmpMetadata = {"state": "mutable"}
                             gotMetadata = True
                         except:
                             errtype, errvalue = sys.exc_info()[:2]
                             tmpLog.error(
                                 "{0} failed to get metadata to {1}:{2}".format(
                                     self.__class__.__name__, errtype.__name__, errvalue
                                 )
                             )
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = "broken"
                                 taskBroken = True
                                 # update dataset status
                                 self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                 taskParamMap, datasetSpec.datasetName
                             )
                             # get the number of events in metadata
                             if taskParamMap.has_key("getNumEventsInMetadata"):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info("get files")
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if (
                                         fileList != []
                                         and taskParamMap.has_key("useInFilesInContainer")
                                         and not datasetSpec.containerName in ["", None]
                                     ):
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(
                                         tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate
                                     )
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info(
                                             "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName)
                                         )
                                         for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info("removed {0}".format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {
                                             str(uuid.uuid4()): {
                                                 "lfn": "pseudo_lfn",
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                                         }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key("nFiles"):
                                             nPFN = taskParamMap["nFiles"]
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {
                                                 "lfn": "{0:06d}:{1}".format(
                                                     iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1]
                                                 ),
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                             except:
                                 errtype, errvalue = sys.exc_info()[:2]
                                 tmpLog.error(
                                     "failed to get files due to {0}:{1}".format(
                                         self.__class__.__name__, errtype.__name__, errvalue
                                     )
                                 )
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = "broken"
                                     taskBroken = True
                                     # update dataset status
                                     self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile = None
                                 nEventsPerJob = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key("nEventsPerFile")) or (
                                     datasetSpec.isPseudo() and taskParamMap.has_key("nEvents")
                                 ):
                                     if taskParamMap.has_key("nEventsPerFile"):
                                         nEventsPerFile = taskParamMap["nEventsPerFile"]
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key("nEvents"):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap["nEvents"]
                                     if taskParamMap.has_key("nEventsPerJob"):
                                         nEventsPerJob = taskParamMap["nEventsPerJob"]
                                     elif taskParamMap.has_key("nEventsPerRange"):
                                         nEventsPerRange = taskParamMap["nEventsPerRange"]
                                 # max attempts and first event number
                                 maxAttempt = None
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # max attempts
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry
                                         maxAttempt = 1
                                     elif taskParamMap.has_key("maxAttempt"):
                                         maxAttempt = taskParamMap["maxAttempt"]
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None
                                 if datasetSpec.isMaster() and taskParamMap.has_key("nEvents"):
                                     nMaxEvents = taskParamMap["nEvents"]
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if taskParamMap.has_key("nFiles"):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap["nFiles"]
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multipled by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key("nEventsPerFile"):
                                             if taskParamMap.has_key("nEventsPerJob"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerJob"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerJob"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key("nEventsPerRange"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerRange"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False
                                 if datasetSpec.isMaster() and taskSpec.useScout():
                                     useScout = True
                                 # use files with new attempt numbers
                                 useFilesWithNewAttemptNr = False
                                 if (
                                     not datasetSpec.isPseudo()
                                     and fileList != []
                                     and taskParamMap.has_key("useInFilesWithNewAttemptNr")
                                 ):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info("update contents")
                                 retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                     datasetSpec,
                                     tmpRet,
                                     tmpMetadata["state"],
                                     stateUpdateTime,
                                     nEventsPerFile,
                                     nEventsPerJob,
                                     maxAttempt,
                                     firstEventNumber,
                                     nMaxFiles,
                                     nMaxEvents,
                                     useScout,
                                     fileList,
                                     useFilesWithNewAttemptNr,
                                     nFilesPerJob,
                                     nEventsPerRange,
                                     nFilesForScout,
                                     includePatt,
                                     excludePatt,
                                     xmlConfig,
                                     noWaitParent,
                                     taskSpec.parent_tid,
                                 )
                                 if retDB == False:
                                     taskSpec.setErrDiag(
                                         "failed to insert files for {0}. {1}".format(
                                             datasetSpec.datasetName, diagMap["errMsg"]
                                         )
                                     )
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = "{0} files missing in {1}".format(
                                         len(missingFileList), datasetSpec.datasetName
                                     )
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {
                                         "datasetSpec": datasetSpec,
                                         "missingFiles": missingFileList,
                                     }
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key("nFiles"):
                                         if datasetSpec.isMaster():
                                             taskParamMap["nFiles"] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nFilesForScout = diagMap["nFilesForScout"]
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap["isRunningTask"]:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap["nActivatedPending"] == 0:
                                     tmpErrStr = "insufficient inputs are ready"
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                         tmpLog.info("end loop")
                 # no mater input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0:
                     tmpErrStr = "no master input files. input dataset is empty"
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr, None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = "tobroken"
                     tmpMsg = "set task.status={0}".format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(
                                 self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo)
                             )
                             if not tmpStat:
                                 tmpErrStr = "failed to initialize TaskGenerator"
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = "tobroken"
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info(
                                     "make parent tasks with {0} (if necessary)".format(
                                         taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)
                                     )
                                 )
                                 tmpStat = taskGenerator.doGenerate(
                                     taskSpec, taskParamMap, missingFilesMap=missingMap
                                 )
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = "tobroken"
                                     tmpLog.error("failed to make parent tasks")
                         # go to pending state
                         if not taskSpec.status in ["broken", "tobroken"]:
                             taskSpec.setOnHold()
                         tmpMsg = "set task.status={0}".format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                     elif allUpdated:
                         # all OK
                         allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                             jediTaskID, getTaskStatus=True
                         )
                         tmpMsg = "set task.status={0}".format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                 tmpLog.info("done")
         except:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error(
                 "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue)
             )

Пример #27

Показать файл

Файл: TaskRefiner.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     continue
                     tmpStat = Interaction.SC_FAILED
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initalize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                         # set parent tid
                         if not parent_tid in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # staging
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if 'toStaging' in taskParamMap and taskStatus <> 'staged':
                         errStr = 'wait until staging is done'
                         impl.taskSpec.status = 'staging'
                         impl.taskSpec.oldStatus = taskStatus
                         impl.taskSpec.setErrDiag(errStr)
                         # not to update some task attributes
                         impl.taskSpec.resetRefinedAttrs()
                         tmpLog.info(errStr)
                         self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID},
                                                           oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                         continue
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if parent_tid not in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     # not to update some task attributes
                                     impl.taskSpec.resetRefinedAttrs()
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             # not to update some task attributes
                             impl.taskSpec.resetRefinedAttrs()
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus in ['registered', 'staged']:
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # disallow to reset some attributes
                             for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                 impl.taskSpec.resetChangedAttr(attName)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                             # appending for incremetnal execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #28

Показать файл

 def findLostFiles(self,datasetName,fileMap):
     methodName = 'findLostFiles'
     methodName += ' <datasetName={0}>'.format(datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # get replicas
         tmpStat,tmpOut = self.listDatasetReplicas(datasetName)
         if tmpStat != self.SC_SUCCEEDED:
             tmpLog.error('faild to get dataset replicas with {0}'.format(tmpOut))
             raise tmpStat,tmpOut
         # check if complete replica is available
         hasCompReplica = False
         datasetReplicaMap = tmpOut
         for tmpEndPoint in datasetReplicaMap.keys():
             if datasetReplicaMap[tmpEndPoint][-1]['found'] != None and \
                     datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                 hasCompReplica = True
                 break
         # no lost files
         if hasCompReplica:
             tmpLog.info('done with no lost files')
             self.SC_SUCCEEDED,{}
         # get LFNs and scopes
         lfnMap = {}
         scopeMap = {}
         for tmpGUID in fileMap.keys():
             tmpLFN = fileMap[tmpGUID]['lfn']
             lfnMap[tmpGUID] = tmpLFN
             scopeMap[tmpLFN] = fileMap[tmpGUID]['scope']
         # get LFC and SE
         lfcSeMap = {}
         for tmpEndPoint in datasetReplicaMap.keys():
             # get LFC
             lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
             # add map
             if not lfcSeMap.has_key(lfc):
                 lfcSeMap[lfc] = []
             # get SE
             seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
             tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
             if tmpMatch != None:
                 se = tmpMatch.group(1)
                 if not se in lfcSeMap[lfc]:
                     lfcSeMap[lfc].append(se)
         # get SURLs
         for lfcHost,seList in lfcSeMap.iteritems():
             tmpStat,tmpRetMap = self.getSURLsFromLFC(lfnMap,lfcHost,seList,scopes=scopeMap)
             if tmpStat != self.SC_SUCCEEDED:
                 tmpLog.error('faild to get SURLs with {0}'.format(tmpRetMap))
                 raise tmpStat,tmpRetMap
             # look for missing files
             newLfnMap = {}
             for tmpGUID,tmpLFN in lfnMap.iteritems():
                 if not tmpLFN in tmpRetMap:
                     newLfnMap[tmpGUID] = tmpLFN
             lfnMap = newLfnMap
         tmpLog.info('done with lost '+','.join(str(tmpLFN) for tmpLFN in lfnMap.values()))
         return self.SC_SUCCEEDED,lfnMap
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)

Пример #29

Показать файл

 def getLatestDBRelease(self):
     methodName = 'getLatestDBRelease'
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('trying to get the latest version number of DBR')
     # get ddo datasets
     tmpStat,ddoDatasets = self.listDatasets('ddo.*')
     if tmpStat != self.SC_SUCCEEDED or ddoDatasets == {}:
         tmpLog.error('failed to get a list of DBRelease datasets from DQ2')
         return self.SC_FAILED,None
     # reverse sort to avoid redundant lookup   
     ddoDatasets.sort()
     ddoDatasets.reverse()
     # extract version number
     latestVerMajor = 0
     latestVerMinor = 0
     latestVerBuild = 0
     latestVerRev   = 0
     latestDBR = ''
     for tmpName in ddoDatasets:
         # ignore CDRelease
         if ".CDRelease." in tmpName:
             continue
         # ignore user
         if tmpName.startswith('ddo.user'):
             continue
         # use Atlas.Ideal
         if not ".Atlas.Ideal." in tmpName:
             continue
         match = re.search('\.v(\d+)(_*[^\.]*)$',tmpName)
         if match == None:
             tmpLog.warning('cannot extract version number from %s' % tmpName)
             continue
         # ignore special DBRs
         if match.group(2) != '':
             continue
         # get major,minor,build,revision numbers
         tmpVerStr = match.group(1)
         tmpVerMajor = 0
         tmpVerMinor = 0
         tmpVerBuild = 0
         tmpVerRev   = 0
         try:
             tmpVerMajor = int(tmpVerStr[0:2])
         except:
             pass
         try:
             tmpVerMinor = int(tmpVerStr[2:4])
         except:
             pass
         try:
             tmpVerBuild = int(tmpVerStr[4:6])
         except:
             pass
         try:
             tmpVerRev = int(tmpVerStr[6:])
             # use only three digit DBR
             continue
         except:
             pass
         # compare
         if latestVerMajor > tmpVerMajor:
             continue
         elif latestVerMajor == tmpVerMajor:
             if latestVerMinor > tmpVerMinor:
                 continue
             elif latestVerMinor == tmpVerMinor:
                 if latestVerBuild > tmpVerBuild:
                     continue
                 elif latestVerBuild == tmpVerBuild:
                     if latestVerRev > tmpVerRev:
                         continue
         # check if well replicated
         tmpStat,ddoReplicas = self.listDatasetReplicas(tmpName)
         if len(ddoReplicas) < 10:
             continue
         # higher or equal version
         latestVerMajor = tmpVerMajor
         latestVerMinor = tmpVerMinor
         latestVerBuild = tmpVerBuild
         latestVerRev   = tmpVerRev
         latestDBR = tmpName
     # failed
     if latestDBR == '':
         tmpLog.error('failed to get the latest version of DBRelease dataset from DQ2')
         return self.SC_FAILED,None
     tmpLog.info('use {0}'.format(latestDBR))
     return self.SC_SUCCEEDED,latestDBR

Пример #30

Показать файл

Файл: WatchDog.py Проект: tertychnyy/panda-jedi

    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info(
                            'rescue tasks with picked files for vo={0} label={1}'
                            .format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForPicked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info(
                            'reactivate pending tasks for vo={0} label={1}'.
                            format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForPending,
                            jedi_config.watchdog.timeoutForPending)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # unlock tasks
                        tmpLog.info('unlock tasks for vo={0} label={1}'.format(
                            vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.unlockTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForLocked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to unlock')
                        else:
                            tmpLog.info('unlock {0} tasks'.format(tmpRet))
                        # restart contents update
                        tmpLog.info(
                            'restart contents update for vo={0} label={1}'.
                            format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.restartTasksForContentsUpdate_JEDI(
                            vo, prodSourceLabel)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to restart')
                        else:
                            tmpLog.info('restarted {0} tasks'.format(tmpRet))
                        # kick exhausted tasks
                        tmpLog.info(
                            'kick exhausted tasks for vo={0} label={1}'.format(
                                vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.kickExhaustedTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForExhausted)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to kick')
                        else:
                            tmpLog.info('kicked {0} tasks'.format(tmpRet))
                        # finish tasks when goal is reached
                        tmpLog.info(
                            'finish achieved tasks for vo={0} label={1}'.
                            format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.getAchievedTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForAchieved)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to finish')
                        else:
                            for jediTaskID in tmpRet:
                                self.taskBufferIF.sendCommandTaskPanda(
                                    jediTaskID,
                                    'JEDI. Goal reached',
                                    True,
                                    'finish',
                                    comQualifier='soft')
                            tmpLog.info('finished {0} tasks'.format(tmpRet))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo, prodSourceLabel)
                        if impl != None:
                            tmpLog.info(
                                'special action for vo={0} label={1} with {2}'.
                                format(vo, prodSourceLabel,
                                       impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat != Interaction.SC_SUCCEEDED:
                                tmpLog.error(
                                    'failed to run special acction for vo={0} label={1}'
                                    .format(vo, prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(
                                    vo, prodSourceLabel))
                tmpLog.info('done')
            except:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()

Пример #31

Показать файл

 def doGenerate(self, taskSpec, taskParamMap, **varMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         "<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start taskType={0}'.format(taskSpec.taskType))
     tmpLog.info(str(varMap))
     # returns
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK = self.SC_SUCCEEDED
     try:
         # check prodSourceLabel
         if taskSpec.prodSourceLabel in ['managed', 'test']:
             # check taskType
             if taskSpec.taskType == 'recov':
                 # generate parent tasks for lost file recovery if it is not yet generated
                 if 'parentGenerated' in taskParamMap:
                     tmpLog.info(
                         'skip since already generated parent tasks')
                 else:
                     tmpLog.info(
                         'generating parent tasks for lost file recovery')
                     # missing files are undefined
                     if 'missingFilesMap' not in varMap:
                         tmpLog.error('missing files are undefined')
                         return retFatal
                     missingFilesMap = varMap['missingFilesMap']
                     # check datasets
                     for datasetName, datasetValMap in iteritems(
                             missingFilesMap):
                         # dataset needs specify container
                         datasetSpec = datasetValMap['datasetSpec']
                         if datasetSpec.containerName in ['', None]:
                             errStr = 'cannot make parent tasks due to undefined container for datasetID={0}:{1}'.format(
                                 datasetSpec.datasetID, datasetName)
                             taskSpec.setErrDiag(errStr)
                             tmpLog.error(errStr)
                             return retFatal
                     # make parameters for new task
                     newJsonStrList = []
                     for datasetName, datasetValMap in iteritems(
                             missingFilesMap):
                         datasetSpec = datasetValMap['datasetSpec']
                         newTaskParamMap = {}
                         newTaskParamMap['oldDatasetName'] = datasetName
                         newTaskParamMap['lostFiles'] = datasetValMap[
                             'missingFiles']
                         newTaskParamMap['vo'] = taskSpec.vo
                         newTaskParamMap['cloud'] = taskSpec.cloud
                         newTaskParamMap[
                             'taskPriority'] = taskSpec.taskPriority
                         newTaskParamMap['taskType'] = taskSpec.taskType
                         newTaskParamMap[
                             'prodSourceLabel'] = taskSpec.prodSourceLabel
                         logDatasetName = 'panda.jedi{0}.log.{1}'.format(
                             taskSpec.taskType, uuid.uuid4())
                         newTaskParamMap['log'] = {
                             'dataset':
                             logDatasetName,
                             'type':
                             'template',
                             'param_type':
                             'log',
                             'token':
                             'ATLASDATADISK',
                             'value':
                             '{0}.${{SN}}.log.tgz'.format(logDatasetName)
                         }
                         # make new datasetname
                         outDatasetName = datasetName
                         # remove /
                         outDatasetName = re.sub('/$', '', outDatasetName)
                         # remove extension
                         outDatasetName = re.sub(
                             '\.{0}\d+$'.format(taskSpec.taskType), '',
                             outDatasetName)
                         # add extension
                         outDatasetName = outDatasetName + '.{0}{1}'.format(
                             taskSpec.taskType, taskSpec.jediTaskID)
                         newTaskParamMap['output'] = {
                             'dataset': outDatasetName
                         }
                         if datasetSpec.containerName not in ['', None]:
                             newTaskParamMap['output'][
                                 'container'] = datasetSpec.containerName
                         # make json
                         jsonStr = json.dumps(newTaskParamMap)
                         newJsonStrList.append(jsonStr)
                     # change original task parameters to not repeat the same procedure and to use newly produced files
                     taskParamMap['parentGenerated'] = True
                     taskParamMap['useInFilesInContainer'] = True
                     taskParamMap['useInFilesWithNewAttemptNr'] = True
                     jsonStr = json.dumps(taskParamMap)
                     # insert and update task parameters
                     sTmp, newJediTaskIDs = self.taskBufferIF.insertUpdateTaskParams_JEDI(
                         taskSpec.jediTaskID, taskSpec.vo,
                         taskSpec.prodSourceLabel, jsonStr, newJsonStrList)
                     if sTmp:
                         tmpLog.info(
                             'inserted/updated tasks in DB : new jediTaskIDs={0}'
                             .format(str(newJediTaskIDs)))
                     else:
                         tmpLog.error('failed to insert/update tasks in DB')
                         return retFatal
         # return
         tmpLog.info('done')
         return retOK
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('doGenerate failed with {0}:{1}'.format(
             errtype.__name__, errvalue))
         return retFatal

Пример #32

Показать файл

 def do_preassign(self):
     tmp_log = MsgWrapper(logger, 'do_preassign')
     # refresh
     self.refresh()
     # list of resource type
     resource_type_list = [
         rt.resource_name for rt in self.taskBufferIF.load_resource_types()
     ]
     # threshold of time duration in second that the queue keeps empty to trigger preassignment
     empty_duration_threshold = 1800
     # return map
     ret_map = {
         'to_reassign': {},
     }
     # loop
     for prod_source_label in self.prodSourceLabelList:
         # site-rse map
         site_rse_map = self.get_site_rse_map(prod_source_label)
         # parameters from GDP config
         max_preassigned_tasks = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if max_preassigned_tasks is None:
             max_preassigned_tasks = 3
         min_files_ready = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_ready is None:
             min_files_ready = 50
         min_files_remaining = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_remaining is None:
             min_files_remaining = 100
         # load site empty-since map from cache
         site_empty_since_map_orig = self._get_from_ses_cache()
         # available sites
         available_sites_list = self.get_available_sites_list()
         # now timestamp
         now_time = datetime.datetime.utcnow()
         now_time_ts = int(now_time.timestamp())
         # update site empty-since map
         site_empty_since_map = copy.deepcopy(site_empty_since_map_orig)
         available_site_name_list = [x[0] for x in available_sites_list]
         for site in site_empty_since_map_orig:
             # remove sites that are no longer empty
             if site not in available_site_name_list:
                 del site_empty_since_map[site]
         for site in available_site_name_list:
             # add newly found empty sites
             if site not in site_empty_since_map_orig:
                 site_empty_since_map[site] = now_time_ts
         self._update_to_ses_cache(site_empty_since_map)
         # evaluate sites to preaassign according to cache
         # get blacklisted_tasks_map from cache
         blacklisted_tasks_map = self._get_from_bt_cache()
         blacklisted_tasks_set = set()
         for bt_list in blacklisted_tasks_map.values():
             blacklisted_tasks_set |= set(bt_list)
         # loop over available sites to preassign
         for (site, tmpSiteSpec, n_jobs_to_fill) in available_sites_list:
             # rses of the available site
             available_rses = set()
             try:
                 available_rses.update(set(site_rse_map[site]))
             except KeyError:
                 tmp_log.debug(
                     'skipped {site} since no good RSE'.format(site=site))
                 continue
             # do not consider TAPE rses
             for rse in set(available_rses):
                 if 'TAPE' in str(rse):
                     available_rses.remove(rse)
             # skip if no rse for available site
             if not available_rses:
                 tmp_log.debug(
                     'skipped {site} since no available RSE'.format(
                         site=site))
                 continue
             # skip if no coreCount set
             if not tmpSiteSpec.coreCount or not tmpSiteSpec.coreCount > 0:
                 tmp_log.debug(
                     'skipped {site} since coreCount is not set'.format(
                         site=site))
                 continue
             # now timestamp
             now_time = datetime.datetime.utcnow()
             now_time_ts = int(now_time.timestamp())
             # skip if not empty for long enough
             if site not in site_empty_since_map:
                 tmp_log.error(
                     'skipped {site} since not in empty-since map (should not happen)'
                     .format(site=site))
                 continue
             empty_duration = now_time_ts - site_empty_since_map[site]
             tmp_num_slots = tmpSiteSpec.getNumStandby(None, None)
             if empty_duration < empty_duration_threshold and not tmp_num_slots:
                 tmp_log.debug(
                     'skipped {site} since not empty for enough time ({ed}s < {edt}s)'
                     .format(site=site,
                             ed=empty_duration,
                             edt=empty_duration_threshold))
                 continue
             # only simul tasks if site has fairsharePolicy setup
             processing_type_constraint = ''
             if tmpSiteSpec.fairsharePolicy not in ('NULL', None):
                 if 'type=simul:0%' in tmpSiteSpec.fairsharePolicy:
                     # skip if zero share of simul
                     tmp_log.debug(
                         'skipped {site} since with fairshare but zero for simul'
                         .format(site=site))
                     continue
                 else:
                     processing_type_constraint = "AND t.processingType='simul' "
             # site attributes
             site_maxrss = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss not in (
                 0, None) else 999999
             site_corecount = tmpSiteSpec.coreCount
             site_capability = str(tmpSiteSpec.capability).lower()
             # make sql parameters of rses
             available_rses = list(available_rses)
             rse_params_list = []
             rse_params_map = {}
             for j, rse in enumerate(available_rses):
                 rse_param = ':rse_{0}'.format(j + 1)
                 rse_params_list.append(rse_param)
                 rse_params_map[rse_param] = rse
             rse_params_str = ','.join(rse_params_list)
             # sql
             sql_query = (
                 "SELECT t.jediTaskID, t.workQueue_ID "
                 "FROM {jedi_schema}.JEDI_Tasks t "
                 "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL "
                 "AND t.prodSourceLabel=:prodSourceLabel "
                 "AND t.resource_type=:resource_type "
                 "AND site IS NULL "
                 "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss "
                 "AND t.eventService=0 "
                 "AND EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                 "WHERE dl.jediTaskID=t.jediTaskID "
                 "AND dl.rse IN ({rse_params_str}) "
                 ") "
                 "{processing_type_constraint} "
                 "AND EXISTS ( "
                 "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                 "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                 "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready "
                 "AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                 ") "
                 "ORDER BY t.currentPriority DESC "
                 "FOR UPDATE ").format(
                     jedi_schema=jedi_config.db.schemaJEDI,
                     rse_params_str=rse_params_str,
                     processing_type_constraint=processing_type_constraint)
             # loop over resource type
             for resource_type in resource_type_list:
                 # key name for preassigned_tasks_map = site + resource_type
                 key_name = '{0}|{1}'.format(site, resource_type)
                 # skip if not match with site capability
                 if site_capability == 'score' and not resource_type.startswith(
                         'SCORE'):
                     continue
                 elif site_capability == 'mcore' and not resource_type.startswith(
                         'MCORE'):
                     continue
                 # params map
                 params_map = {
                     ':prodSourceLabel': prod_source_label,
                     ':resource_type': resource_type,
                     ':site_maxrss': site_maxrss,
                     ':site_corecount': site_corecount,
                     ':min_files_ready': min_files_ready,
                     ':min_files_remaining': min_files_remaining,
                 }
                 params_map.update(rse_params_map)
                 # get preassigned_tasks_map from cache
                 preassigned_tasks_map = self._get_from_pt_cache()
                 preassigned_tasks_cached = preassigned_tasks_map.get(
                     key_name, [])
                 # get task_orig_attr_map from cache
                 task_orig_attr_map = self._get_from_attr_cache()
                 # number of tasks already preassigned
                 n_preassigned_tasks = len(preassigned_tasks_cached)
                 # nuber of tasks to preassign
                 n_tasks_to_preassign = max(
                     max_preassigned_tasks - n_preassigned_tasks, 0)
                 # preassign
                 if n_tasks_to_preassign <= 0:
                     tmp_log.debug(
                         '{key_name:<64} already has enough preassigned tasks ({n_tasks:>3}) ; skipped '
                         .format(key_name=key_name,
                                 n_tasks=n_preassigned_tasks))
                 elif DRY_RUN:
                     dry_sql_query = (
                         "SELECT t.jediTaskID, t.workQueue_ID "
                         "FROM {jedi_schema}.JEDI_Tasks t "
                         "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL "
                         "AND t.prodSourceLabel=:prodSourceLabel "
                         "AND t.resource_type=:resource_type "
                         "AND site IS NULL "
                         "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss "
                         "AND t.eventService=0 "
                         "AND EXISTS ( "
                         "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                         "WHERE dl.jediTaskID=t.jediTaskID "
                         "AND dl.rse IN ({rse_params_str}) "
                         ") "
                         "{processing_type_constraint} "
                         "AND EXISTS ( "
                         "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                         "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                         "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready "
                         "AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                         ") "
                         "ORDER BY t.currentPriority DESC ").format(
                             jedi_schema=jedi_config.db.schemaJEDI,
                             rse_params_str=rse_params_str,
                             processing_type_constraint=
                             processing_type_constraint)
                     # tmp_log.debug('[dry run] {} {}'.format(dry_sql_query, params_map))
                     res = self.taskBufferIF.querySQL(
                         dry_sql_query, params_map)
                     n_tasks = 0 if res is None else len(res)
                     if n_tasks > 0:
                         result = [
                             x[0] for x in res
                             if x[0] not in preassigned_tasks_cached
                         ]
                         updated_tasks = result[:n_tasks_to_preassign]
                         tmp_log.debug(
                             '[dry run] {key_name:<64} {n_tasks:>3} tasks would be preassigned '
                             .format(key_name=key_name,
                                     n_tasks=n_tasks_to_preassign))
                         # update preassigned_tasks_map into cache
                         preassigned_tasks_map[key_name] = list(
                             set(updated_tasks)
                             | set(preassigned_tasks_cached))
                         tmp_log.debug('{} ; {}'.format(
                             str(updated_tasks),
                             str(preassigned_tasks_map[key_name])))
                         self._update_to_pt_cache(preassigned_tasks_map)
                 else:
                     updated_tasks_orig_attr = self.taskBufferIF.queryTasksToPreassign_JEDI(
                         sql_query,
                         params_map,
                         site,
                         blacklist=blacklisted_tasks_set,
                         limit=n_tasks_to_preassign)
                     if updated_tasks_orig_attr is None:
                         # dbproxy method failed
                         tmp_log.error(
                             '{key_name:<64} failed to preassign tasks '.
                             format(key_name=key_name))
                     else:
                         n_tasks = len(updated_tasks_orig_attr)
                         if n_tasks > 0:
                             updated_tasks = [
                                 x[0] for x in updated_tasks_orig_attr
                             ]
                             tmp_log.info(
                                 '{key_name:<64} {n_tasks:>3} tasks preassigned : {updated_tasks}'
                                 .format(key_name=key_name,
                                         n_tasks=str(n_tasks),
                                         updated_tasks=updated_tasks))
                             # update preassigned_tasks_map into cache
                             preassigned_tasks_map[key_name] = list(
                                 set(updated_tasks)
                                 | set(preassigned_tasks_cached))
                             self._update_to_pt_cache(preassigned_tasks_map)
                             # update task_orig_attr_map into cache and return map
                             for taskid, orig_attr in updated_tasks_orig_attr:
                                 taskid_str = str(taskid)
                                 task_orig_attr_map[taskid_str] = orig_attr
                                 ret_map['to_reassign'][taskid] = {
                                     'site': site,
                                     'n_jobs_to_fill': n_jobs_to_fill,
                                 }
                             self._update_to_attr_cache(task_orig_attr_map)
                             # Kibana log
                             for taskid in updated_tasks:
                                 tmp_log.debug(
                                     '#ATM #KV jediTaskID={taskid} action=do_preassign site={site} rtype={rtype} preassigned '
                                     .format(taskid=taskid,
                                             site=site,
                                             rtype=resource_type))
                         else:
                             tmp_log.debug(
                                 '{key_name:<64} found no proper task to preassign'
                                 .format(key_name=key_name))
     # total preassigned tasks
     preassigned_tasks_map = self._get_from_pt_cache()
     n_pt_tot = sum(
         [len(pt_list) for pt_list in preassigned_tasks_map.values()])
     tmp_log.debug('now {n_pt_tot} tasks preassigned in total'.format(
         n_pt_tot=n_pt_tot))
     # return
     return ret_map

Пример #33

Показать файл

Файл: SimpleTaskSetupper.py Проект: PanDAWMS/panda-jedi

    def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,
                            "< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(
            taskSpec.prodSourceLabel, taskSpec.taskType))
        # returns
        retFatal = self.SC_FATAL
        retOK = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo, taskSpec.cloud)
            # skip if DDM I/F is inactive
            if not ddmIF:
                tmpLog.info('skip due to inactive DDM I/F')
                return retOK
            # collect datasetID to register datasets/containers just in case
            for tmpPandaJob in pandaJobs:
                if not tmpPandaJob.produceUnMerge():
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output', 'log']:
                            if tmpFileSpec.datasetID not in datasetToRegister:
                                datasetToRegister.append(tmpFileSpec.datasetID)
            # register datasets
            if datasetToRegister:
                tmpLog.info('datasetToRegister={0}'.format(
                    str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap = {}
                ddmBackEnd = 'rucio'
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info(
                        'getting datasetSpec with datasetID={0}'.format(
                            datasetID))
                    tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                        taskSpec.jediTaskID, datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue

                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [
                            datasetSpec.datasetName, datasetSpec.containerName
                    ]:
                        if not targetName:
                            continue
                        if targetName in avDatasetList:
                            tmpLog.info(
                                '{0} already registered'.format(targetName))
                            continue
                        # set lifetime
                        lifetime = None
                        # check dataset/container in DDM
                        tmpList = ddmIF.listDatasets(targetName)
                        if not tmpList:
                            # get location
                            location = None
                            locForRule = None
                            if targetName == datasetSpec.datasetName:
                                # dataset
                                tmpLog.info('dest={0}'.format(
                                    datasetSpec.destination))
                                if datasetSpec.destination:
                                    if siteMapper.checkSite(
                                            datasetSpec.destination):
                                        location = siteMapper.getSite(
                                            'BNL_OSG_SPHENIX'
                                        ).ddm_output['default']
                                    else:
                                        location = datasetSpec.destination
                            if locForRule is None:
                                locForRule = location
                            # set metadata
                            if targetName == datasetSpec.datasetName:
                                metaData = {}
                                metaData['task_id'] = taskSpec.jediTaskID
                                if taskSpec.campaign:
                                    metaData['campaign'] = taskSpec.campaign
                            else:
                                metaData = None
                            # register dataset/container
                            tmpLog.info(
                                'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                .format(targetName, location, ddmBackEnd,
                                        lifetime, str(metaData)))
                            tmpStat = ddmIF.registerNewDataset(
                                targetName,
                                backEnd=ddmBackEnd,
                                location=location,
                                lifetime=lifetime,
                                metaData=metaData)
                            if not tmpStat:
                                tmpLog.error('failed to register {0}'.format(
                                    targetName))
                                return retFatal
                            # register location
                            if locForRule:
                                """
                                if taskSpec.workingGroup:
                                    userName = taskSpec.workingGroup
                                else:
                                    userName = taskSpec.userName
                                """
                                userName = None
                                activity = None
                                grouping = None
                                tmpLog.info(
                                    'registering location={} lifetime={} days activity={} grouping={} '
                                    'owner={}'.format(locForRule, lifetime,
                                                      activity, grouping,
                                                      userName))
                                tmpStat = ddmIF.registerDatasetLocation(
                                    targetName,
                                    locForRule,
                                    owner=userName,
                                    lifetime=lifetime,
                                    backEnd=ddmBackEnd,
                                    activity=activity,
                                    grouping=grouping)
                                if not tmpStat:
                                    tmpLog.error(
                                        'failed to register location {0} for {1}'
                                        .format(locForRule, targetName))
                                    return retFatal
                            avDatasetList.append(targetName)

                    # check if dataset is in the container
                    if datasetSpec.containerName and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[
                                datasetSpec.
                                containerName] = ddmIF.listDatasetsInContainer(
                                    datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[
                                datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(
                                datasetSpec.datasetName,
                                datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(
                                datasetSpec.containerName,
                                [datasetSpec.datasetName],
                                backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(
                                    datasetSpec.datasetName,
                                    datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(
                                datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(
                                datasetSpec.datasetName,
                                datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(
                        datasetSpec, {
                            'jediTaskID': taskSpec.jediTaskID,
                            'datasetID': datasetID
                        })
            # return
            tmpLog.info('done')
            return retOK
        except Exception as e:
            errStr = 'doSetup failed with {}'.format(str(e))
            tmpLog.error(errStr + traceback.format_exc())
            taskSpec.setErrDiag(errStr)
            return retFatal

Пример #34

Показать файл

Файл: TaskCommando.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         elif commandStr == 'reassign' and commentStr != None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs == None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr != None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary 
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info('updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info('wait until datasets are updated to finish')
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID},
                                                                            setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs 
                                 if 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs,commentStr,'52',True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'51',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.taskParamMap['fixedSandbox']
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry child tasks
                     if 'sole ' in commentStr:
                         retryChildTasks = False
                     else:
                         retryChildTasks = True
                     # discard events
                     if 'discard ' in commentStr:
                         discardEvents = True
                     else:
                         discardEvents = False
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents)
                     if tmpRet == True:
                         tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)

Пример #35

Показать файл

Файл: ContentsFeeder.py Проект: PanDAWMS/panda-jedi

    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID,dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    datasetsIdxConsistency = []

                    # get task
                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                    if not tmpStat or taskSpec == None:
                        self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                        continue

                    # make logger
                    try:
                        gshare = '_'.join(taskSpec.gshare.split(' '))
                    except:
                        gshare = 'Undefined'
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))

                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = taskSpec.getNumFilesPerJob()
                    # the number of chunks used by scout 
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                                  ['output','log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                            # index consistency
                            if datasetSpec.indexConsistent():
                                datasetsIdxConsistency.append(datasetSpec.datasetID)
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()                    
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state':'closed'}
                                # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed 
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state':'mutable'}
                                gotMetadata = True
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                            errtype.__name__,errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status    
                                    self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing 
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                         getNumEvents=getNumEvents,
                                                                         skipDuplicate=skipDuplicate,
                                                                         longFormat=longFormat
                                                                         )
                                        tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                            for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskSpec.getNumFilesPerJob() is not None:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy 
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                          {'datasetName':datasetSpec.datasetName},
                                                                                                                          ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                except:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                                 errtype.__name__,errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status    
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile  = None
                                    nEventsPerJob   = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts 
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry 
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None 
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                            # multipled by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False    
                                    if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers    
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0
                                    # skip short input
                                    if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                            and nEventsPerFile is not None and nEventsPerJob is not None \
                                            and nEventsPerFile >= nEventsPerJob \
                                            and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                        skipShortInput = True
                                    else:
                                        skipShortInput = False
                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                              tmpMetadata['state'],
                                                                                                                              stateUpdateTime,
                                                                                                                              nEventsPerFile,
                                                                                                                              nEventsPerJob,
                                                                                                                              maxAttempt,
                                                                                                                              firstEventNumber,
                                                                                                                              nMaxFiles,
                                                                                                                              nMaxEvents,
                                                                                                                              useScout,
                                                                                                                              fileList,
                                                                                                                              useFilesWithNewAttemptNr,
                                                                                                                              nFilesPerJob,
                                                                                                                              nEventsPerRange,
                                                                                                                              nChunksForScout,
                                                                                                                              includePatt,
                                                                                                                              excludePatt,
                                                                                                                              xmlConfig,
                                                                                                                              noWaitParent,
                                                                                                                              taskSpec.parent_tid,
                                                                                                                              self.pid,
                                                                                                                              maxFailure,
                                                                                                                              useRealNumEvents,
                                                                                                                              respectLB,
                                                                                                                              tgtNumEventsPerJob,
                                                                                                                              skipFilesUsedBy,
                                                                                                                              ramCount,
                                                                                                                              taskSpec,
                                                                                                                              skipShortInput)
                                    if retDB == False:
                                        taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                         diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug('escape since task or dataset is locked')
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                               'missingFiles':missingFileList} 
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no mater input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr,None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # index consistency
                    if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                        self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency)
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken','tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid,
                                                                                                       useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #36

Показать файл

 def do_for_data_locality(self):
     tmp_log = MsgWrapper(logger)
     # refresh
     self.refresh()
     # list of resource type
     # resource_type_list = [ rt.resource_name for rt in self.taskBufferIF.load_resource_types() ]
     # loop
     for prod_source_label in self.prodSourceLabelList:
         # site-rse map and blacklisted rses
         site_rse_map, blacklisted_rse_set = self.get_site_rse_map_and_blacklisted_rse_set(
             prod_source_label)
         tmp_log.debug('Found {0} blacklisted RSEs : {1}'.format(
             len(blacklisted_rse_set), ','.join(list(blacklisted_rse_set))))
         # parameter from GDP config
         upplimit_ioIntensity = self.taskBufferIF.getConfigValue(
             'task_withholder',
             'LIMIT_IOINTENSITY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         lowlimit_currentPriority = self.taskBufferIF.getConfigValue(
             'task_withholder',
             'LIMIT_PRIORITY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if upplimit_ioIntensity is None:
             upplimit_ioIntensity = 999999
         if lowlimit_currentPriority is None:
             lowlimit_currentPriority = -999999
         upplimit_ioIntensity = max(upplimit_ioIntensity, 100)
         # get work queue for gshare
         work_queue_list = self.workQueueMapper.getAlignedQueueList(
             self.vo, prod_source_label)
         # loop over work queue
         for work_queue in work_queue_list:
             gshare = work_queue.queue_name
             # get cutoff
             cutoff = self.taskBufferIF.getConfigValue(
                 'jobbroker', 'NQUEUELIMITSITE_{}'.format(gshare), 'jedi',
                 self.vo)
             if not cutoff:
                 cutoff = 20
             # busy sites
             busy_sites_list = self.get_busy_sites(gshare, cutoff)
             # rses of busy sites
             busy_rses = set()
             for site in busy_sites_list:
                 try:
                     busy_rses.update(set(site_rse_map[site]))
                 except KeyError:
                     continue
             # make sql parameters of rses
             to_exclude_rses = list(busy_rses | blacklisted_rse_set)
             rse_params_list = []
             rse_params_map = {}
             for j, rse in enumerate(to_exclude_rses):
                 rse_param = ':rse_{0}'.format(j + 1)
                 rse_params_list.append(rse_param)
                 rse_params_map[rse_param] = rse
             rse_params_str = ','.join(rse_params_list)
             # sql
             sql_query = (
                 "SELECT t.jediTaskID "
                 "FROM {jedi_schema}.JEDI_Tasks t "
                 "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL "
                 "AND t.gshare=:gshare "
                 "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority "
                 "AND EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Datasets d "
                 "WHERE d.jediTaskID=t.jediTaskID "
                 "AND d.type='input' "
                 ") "
                 "AND NOT EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                 "WHERE dl.jediTaskID=t.jediTaskID "
                 "AND dl.rse NOT IN ({rse_params_str}) "
                 ") "
                 "FOR UPDATE ").format(
                     jedi_schema=jedi_config.db.schemaJEDI,
                     rse_params_str=rse_params_str)
             # params map
             params_map = {
                 ':gshare': gshare,
                 ':ioIntensity': upplimit_ioIntensity,
                 ':currentPriority': lowlimit_currentPriority,
             }
             params_map.update(rse_params_map)
             # pending reason
             reason = 'no local input data, ioIntensity>={ioIntensity}, currentPriority<{currentPriority},'\
                      'nQueue>max({cutOff},nRunning*2) at all sites where the task can run'.format(
                 ioIntensity=upplimit_ioIntensity,currentPriority=lowlimit_currentPriority,
                 cutOff=cutoff)
             # set pending
             dry_run = False
             if dry_run:
                 dry_sql_query = (
                     "SELECT t.jediTaskID "
                     "FROM {jedi_schema}.JEDI_Tasks t "
                     "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL "
                     "AND t.gshare=:gshare "
                     "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority "
                     "AND EXISTS ( "
                     "SELECT * FROM {jedi_schema}.JEDI_Datasets d "
                     "WHERE d.jediTaskID=t.jediTaskID "
                     "AND d.type='input' "
                     ") "
                     "AND NOT EXISTS ( "
                     "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                     "WHERE dl.jediTaskID=t.jediTaskID "
                     "AND dl.rse NOT IN ({rse_params_str}) "
                     ") ").format(jedi_schema=jedi_config.db.schemaJEDI,
                                  rse_params_str=rse_params_str)
                 res = self.taskBufferIF.querySQL(dry_sql_query, params_map)
                 n_tasks = 0 if res is None else len(res)
                 if n_tasks > 0:
                     result = [x[0] for x in res]
                     tmp_log.debug(
                         '[dry run] gshare: {gshare:<16} {n_tasks:>5} tasks would be pending : {result} ; reason="{reason}" '
                         .format(gshare=gshare,
                                 n_tasks=n_tasks,
                                 result=result,
                                 reason=reason))
             else:
                 n_tasks = self.taskBufferIF.queryTasksToBePending_JEDI(
                     sql_query, params_map, reason)
                 if n_tasks is not None and n_tasks > 0:
                     tmp_log.info(
                         'gshare: {gshare:<16} {n_tasks:>5} tasks got pending ; reason="{reason}" '
                         .format(gshare=gshare,
                                 n_tasks=str(n_tasks),
                                 reason=reason))

Пример #37

Показать файл

Файл: AtlasTaskSetupper.py Проект: PanDAWMS/panda-jedi

    def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        # returns
        retFatal    = self.SC_FATAL
        retTmpError = self.SC_FAILED
        retOK       = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo)
            # register datasets
            if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                    # collect datasetID to register datasets/containers just in case
                    for tmpPandaJob in pandaJobs:
                        if not tmpPandaJob.produceUnMerge():
                            for tmpFileSpec in tmpPandaJob.Files:
                                if tmpFileSpec.type in ['output','log']:
                                    if tmpFileSpec.datasetID not in datasetToRegister:
                                        datasetToRegister.append(tmpFileSpec.datasetID)
                tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap  = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue
                    # DDM backend
                    ddmBackEnd = taskSpec.getDdmBackEnd()
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName is None:
                            continue
                        if targetName not in avDatasetList:
                            # set lifetime
                            if targetName.startswith('panda'):
                                if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                    lifetime = 365
                                else:
                                    lifetime = 14
                            else:
                                lifetime = None
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            if tmpList == []:
                                # get location
                                location = None
                                locForRule = None
                                if targetName == datasetSpec.datasetName:
                                    # dataset
                                    if datasetSpec.site in ['',None]:
                                        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                            locForRule = datasetSpec.destination
                                        elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                            location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                        elif taskSpec.cloud is not None:
                                            # use T1 SE
                                            tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                            location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                    else:
                                        tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken,
                                                                             taskSpec.prodSourceLabel,
                                                                             JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                if locForRule is None:
                                    locForRule = location
                                # set metadata
                                if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                    metaData = {}
                                    metaData['task_id'] = taskSpec.jediTaskID
                                    if taskSpec.campaign not in [None,'']:
                                        metaData['campaign'] = taskSpec.campaign
                                    if datasetSpec.getTransient() is not None:
                                        metaData['transient'] = datasetSpec.getTransient()
                                else:
                                    metaData = None
                                # register dataset/container
                                tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                         location,
                                                                                                                         ddmBackEnd,
                                                                                                                         lifetime,
                                                                                                                         str(metaData)))
                                tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                   lifetime=lifetime,metaData=metaData)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    # register location
                                    tmpToRegister = False
                                    if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                                        if taskSpec.workingGroup:
                                            userName = taskSpec.workingGroup
                                        else:
                                            userName = taskSpec.userName
                                        grouping = None
                                        tmpToRegister = True
                                    elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        userName = None
                                        grouping = 'NONE'
                                        tmpToRegister = True
                                    if tmpToRegister:
                                        activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                        tmpLog.info('registering location={} lifetime={} days activity={} grouping={} '
                                                    'owner={}'.format(locForRule, lifetime, activity, grouping,
                                                                      userName))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                                lifetime=lifetime,backEnd=ddmBackEnd,
                                                                                activity=activity,grouping=grouping)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                                          targetName))
                                            return retFatal
                                        # double copy
                                        if userSetup and datasetSpec.type == 'output':
                                            if datasetSpec.destination != datasetSpec.site:
                                                tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                                datasetSpec.site))
                                            else:

                                                second_copy = True
                                                try:
                                                    if taskSpec.site:
                                                        panda_site = siteMapper.getSite(taskSpec.site)
                                                        if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall:
                                                            tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site))
                                                            second_copy = False
                                                except Exception:
                                                    second_copy = True

                                                if second_copy:
                                                    locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True'
                                                    tmpMsg  = 'registering double copy '
                                                    tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                                    activity,targetName)
                                                    tmpLog.info(tmpMsg)
                                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                                            lifetime=lifetime,activity=activity,
                                                                                            grouping='NONE',weight='freespace',
                                                                                            ignore_availability=False)
                                                    if not tmpStat:
                                                        tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble,
                                                                                                                               targetName))
                                                        return retFatal
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                   backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # register ES datasets
            if taskSpec.registerEsFiles():
                targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
                location = None
                metaData = {}
                metaData['task_id'] = taskSpec.jediTaskID
                metaData['hidden']  = True
                tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                           location,
                                                                                           str(metaData)))
                tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                   resurrect=True)
                if not tmpStat:
                    tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                    return retFatal
                # register rule
                location = 'type=DATADISK'
                activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                grouping = 'NONE'
                tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                        activity,
                                                                                        grouping))
                tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                        grouping=grouping)
                if not tmpStat:
                    tmpLog.error('failed to register location {0} with {2} for {1}'.format(location,
                                                                                           targetName,
                                                                                           activity))
                    return retFatal
            # open datasets
            if taskSpec.prodSourceLabel in ['managed','test']:
                # get the list of output/log datasets
                outDatasetList = []
                for tmpPandaJob in pandaJobs:
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if tmpFileSpec.destinationDBlock not in outDatasetList:
                                outDatasetList.append(tmpFileSpec.destinationDBlock)
                # open datasets
                for outDataset in outDatasetList:
                    tmpLog.info('open {0}'.format(outDataset))
                    ddmIF.openDataset(outDataset)
                    # unset lifetime
                    ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
            # return
            tmpLog.info('done')
            return retOK
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal

Пример #38

Показать файл

    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' %
                                      self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID, dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    # make logger
                    tmpLog = MsgWrapper(
                        self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                    # get task
                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                        jediTaskID, False, True, self.pid, 10)
                    if not tmpStat or taskSpec == None:
                        tmpLog.error(
                            'failed to get taskSpec for jediTaskID={0}'.format(
                                jediTaskID))
                        continue
                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                            jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error(
                            'task param conversion from json failed with {0}:{1}'
                            .format(errtype.__name__, errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap[
                            'nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = None
                    if taskParamMap.has_key('nFilesPerJob'):
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # the number of chunks used by scout
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [
                            None, taskSpec.jediTaskID
                    ]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(
                            taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(
                                    tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(
                                datasetSpec.datasetName,
                                datasetSpec.datasetID))
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(
                                        datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state': 'closed'}
                                # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state': 'mutable'}
                                gotMetadata = True
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error(
                                    '{0} failed to get metadata to {1}:{2}'.
                                    format(self.__class__.__name__,
                                           errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(
                                        datasetSpec, datasetStatus, tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                taskSpec.setErrDiag(
                                    'failed to get metadata for {0}'.format(
                                        datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                    taskParamMap, datasetSpec.datasetName)
                                # get the number of events in metadata
                                if taskParamMap.has_key(
                                        'getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles(
                                    )
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(
                                            tmpDatasetName,
                                            getNumEvents=getNumEvents,
                                            skipDuplicate=skipDuplicate,
                                            longFormat=longFormat)
                                        tmpLog.debug(
                                            'got {0} files in {1}'.format(
                                                len(tmpRet), tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(
                                            tmpDatasetName, tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug(
                                                'found {0} lost files in {1}'.
                                                format(len(tmpLostFiles),
                                                       tmpDatasetName))
                                            for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems(
                                            ):
                                                tmpLog.debug(
                                                    'removed {0}'.format(
                                                        tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords(
                                            ) != None:
                                                nPFN = datasetSpec.getNumRecords(
                                                )
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerJob']
                                                elif taskParamMap.has_key(
                                                        'nEventsPerFile'
                                                ) and taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and 'nFilesPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nFilesPerJob']
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                    datasetSpec.jediTaskID,
                                                    datasetSpec.masterID,
                                                    ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt[
                                                        'nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt[
                                                        'nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(
                                                            skipFilesUsedBy
                                                    ).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID, {
                                                                'datasetName':
                                                                datasetSpec.
                                                                datasetName
                                                            }, ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt[
                                                                'nFiles']:
                                                            nPFN += tmpParentAtt[
                                                                'nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn': iPFN + tmpOffset,
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {
                                                str(uuid.uuid4()): {
                                                    'lfn': 'pseudo_lfn',
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                            }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn':
                                                    '{0:06d}:{1}'.format(
                                                        iPFN,
                                                        taskParamMap['pfnList']
                                                        [iPFN].split('/')[-1]),
                                                    'scope':
                                                    None,
                                                    'filesize':
                                                    0,
                                                    'checksum':
                                                    None,
                                                }
                                except:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    tmpLog.error(
                                        'failed to get files due to {0}:{1} {2}'
                                        .format(self.__class__.__name__,
                                                errtype.__name__, errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag(
                                        'failed to get files for {0}'.format(
                                            datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents(
                                        )
                                    # the number of events per file
                                    nEventsPerFile = None
                                    nEventsPerJob = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key(
                                                'nEventsPerFile'):
                                            nEventsPerFile = taskParamMap[
                                                'nEventsPerFile']
                                        elif datasetSpec.isMaster(
                                        ) and datasetSpec.isPseudo(
                                        ) and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap[
                                                'nEvents']
                                        if taskParamMap.has_key(
                                                'nEventsPerJob'):
                                            nEventsPerJob = taskParamMap[
                                                'nEventsPerJob']
                                        elif taskParamMap.has_key(
                                                'nEventsPerRange'):
                                            nEventsPerRange = taskParamMap[
                                                'nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap[
                                                'tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster(
                                    ) or datasetSpec.toKeepTrack():
                                        # max attempts
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry
                                            maxAttempt = 1
                                        elif taskParamMap.has_key(
                                                'maxAttempt'):
                                            maxAttempt = taskParamMap[
                                                'maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap[
                                                'maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset(
                                        )
                                    # nMaxEvents
                                    nMaxEvents = None
                                    if datasetSpec.isMaster(
                                    ) and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(
                                                origNumFiles)
                                            # multipled by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key(
                                                    'nEventsPerFile'):
                                                if taskParamMap.has_key(
                                                        'nEventsPerJob'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerJob']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerJob'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                                elif taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerRange']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerRange'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                    # use scout
                                    useScout = False
                                    if datasetSpec.isMaster(
                                    ) and taskSpec.useScout() and (
                                            datasetSpec.status != 'toupdate'
                                            or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo(
                                    ) and fileList != [] and taskParamMap.has_key(
                                            'useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    #ramCount
                                    ramCount = 0

                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet,
                                        tmpMetadata['state'], stateUpdateTime,
                                        nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber,
                                        nMaxFiles, nMaxEvents, useScout,
                                        fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange,
                                        nChunksForScout, includePatt,
                                        excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid,
                                        maxFailure, useRealNumEvents,
                                        respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
                                    if retDB == False:
                                        taskSpec.setErrDiag(
                                            'failed to insert files for {0}. {1}'
                                            .format(datasetSpec.datasetName,
                                                    diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug(
                                            'escape since task or dataset is locked'
                                        )
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(
                                            len(missingFileList),
                                            datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {
                                            'datasetSpec': datasetSpec,
                                            'missingFiles': missingFileList
                                        }
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap[
                                                    'nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap[
                                                'nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no mater input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr, None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken', 'tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task.status={0}'.format(
                                taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                taskSpec,
                                pid=self.pid,
                                setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                getTaskStatus=True,
                                pid=self.pid,
                                useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task.status={0}'.format(
                                newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(
                            retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype, errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))

Пример #39

Показать файл

Файл: AtlasProdTaskBroker.py Проект: ruslan33/panda-jedi

 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = 5 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     thrInputSize = 1024*1024*1024
     thrInputNum = 100
     thrInputSizeFrac = 0.1
     thrInputNumFrac = 0.1
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check endpoint
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.debug('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             if tmpSpaceSize < diskThreshold:
                                 tmpLog.debug('  skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                        tmpSpaceSize,
                                                                                                                                        diskThreshold,
                                                                                                                                        tmpEP['state']))
                                 toSkip = True
                                 break
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                         availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                         availableData[tmpNucleus]['tot_size'],
                                                                                                                                         thrInputSizeFrac))
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpLog.debug('  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                           availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                           availableData[tmpNucleus]['tot_num'],
                                                                                                                                           thrInputNumFrac))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                         # use any nuclei where input is available if no sites can run jobs
                         tmpRet = tmpSiteList
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # RW
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             weight *= float(availableData[tmpNucleus]['ava_size_any'])
                             weight /= float(availableData[tmpNucleus]['tot_size'])
                             wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                             wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                             # negative weight for tape
                             if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                 weight *= negWeightTape
                                 wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                         tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)

Пример #40

Показать файл

Файл: AtlasTaskGenerator.py Проект: PanDAWMS/panda-jedi

 def doGenerate(self,taskSpec,taskParamMap,**varMap):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start taskType={0}'.format(taskSpec.taskType))
     tmpLog.info(str(varMap))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         # check prodSourceLabel
         if taskSpec.prodSourceLabel in ['managed','test']:
             # check taskType
             if taskSpec.taskType == 'recov':
                 # generate parent tasks for lost file recovery if it is not yet generated
                 if taskParamMap.has_key('parentGenerated'):
                     tmpLog.info('skip since already generated parent tasks')
                 else:
                     tmpLog.info('generating parent tasks for lost file recovery')
                     # missing files are undefined
                     if not varMap.has_key('missingFilesMap'):
                         tmpLog.error('missing files are undefined')
                         return retFatal
                     missingFilesMap = varMap['missingFilesMap']
                     # check datasets
                     for datasetName,datasetValMap in missingFilesMap.iteritems():
                         # dataset needs specify container
                         datasetSpec = datasetValMap['datasetSpec']
                         if datasetSpec.containerName in ['',None]:
                             errStr = 'cannot make parent tasks due to undefined container for datasetID={0}:{1}'.format(datasetSpec.datasetID,
                                                                                                                         datasetName)
                             taskSpec.setErrDiag(errStr)
                             tmpLog.error(errStr)
                             return retFatal
                     # make parameters for new task
                     newJsonStrList = []    
                     for datasetName,datasetValMap in missingFilesMap.iteritems():
                         datasetSpec = datasetValMap['datasetSpec']
                         newTaskParamMap = {}
                         newTaskParamMap['oldDatasetName']  = datasetName
                         newTaskParamMap['lostFiles']       = datasetValMap['missingFiles']
                         newTaskParamMap['vo']              = taskSpec.vo
                         newTaskParamMap['cloud']           = taskSpec.cloud
                         newTaskParamMap['taskPriority']    = taskSpec.taskPriority
                         newTaskParamMap['taskType']        = taskSpec.taskType
                         newTaskParamMap['prodSourceLabel'] = taskSpec.prodSourceLabel
                         logDatasetName = 'panda.jedi{0}.log.{1}'.format(taskSpec.taskType,uuid.uuid4())
                         newTaskParamMap['log'] = {'dataset': logDatasetName,
                                                   'type':'template',
                                                   'param_type':'log',
                                                   'token':'ATLASDATADISK',
                                                   'value':'{0}.${{SN}}.log.tgz'.format(logDatasetName)}
                         # make new datasetname
                         outDatasetName = datasetName
                         # remove /
                         outDatasetName = re.sub('/$','',outDatasetName)
                         # remove extension
                         outDatasetName = re.sub('\.{0}\d+$'.format(taskSpec.taskType),'',outDatasetName)
                         # add extension
                         outDatasetName = outDatasetName + '.{0}{1}'.format(taskSpec.taskType,taskSpec.jediTaskID)
                         newTaskParamMap['output'] = {'dataset': outDatasetName}
                         if not datasetSpec.containerName in ['',None]:
                             newTaskParamMap['output']['container'] = datasetSpec.containerName
                         # make json
                         jsonStr = json.dumps(newTaskParamMap)
                         newJsonStrList.append(jsonStr)
                     # change original task parameters to not repeat the same procedure and to use newly produced files 
                     taskParamMap['parentGenerated']         = True
                     taskParamMap['useInFilesInContainer']   = True
                     taskParamMap['useInFilesWithNewAttemptNr'] = True
                     jsonStr = json.dumps(taskParamMap)
                     # insert and update task parameters
                     sTmp,newJediTaskIDs = self.taskBufferIF.insertUpdateTaskParams_JEDI(taskSpec.jediTaskID,
                                                                                         taskSpec.vo,
                                                                                         taskSpec.prodSourceLabel,
                                                                                         jsonStr,newJsonStrList)
                     if sTmp:
                         tmpLog.info('inserted/updated tasks in DB : new jediTaskIDs={0}'.format(str(newJediTaskIDs)))
                     else:
                         tmpLog.error('failed to insert/update tasks in DB')
                         return retFatal
         # return
         tmpLog.info('done')        
         return retOK
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doGenerate failed with {0}:{1}'.format(errtype.__name__,errvalue))
         return retFatal

Пример #41

Показать файл

Файл: TaskBroker.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks, idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info(
                 'start TaskCheckerThread {0}/{1} for jediTaskID={2}'.
                 format(idxTasks, totalTasks, taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             taskSpecList = []
             for jediTaskID in taskList:
                 tmpRet, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                     jediTaskID, False)
                 if tmpRet and taskSpec is not None:
                     taskSpecList.append(taskSpec)
                 else:
                     tmpLog.error(
                         'failed to get taskSpec for jediTaskID={0}'.format(
                             jediTaskID))
             if taskSpecList != []:
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         impl = self.implFactory.getImpl(
                             self.vo, self.prodSourceLabel)
                         if impl is None:
                             # task brokerage is undefined
                             tmpLog.error(
                                 'task broker is undefined for vo={0} sourceLabel={1}'
                                 .format(self.vo, self.prodSourceLabel))
                             tmpStat = Interaction.SC_FAILED
                     except Exception:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error('getImpl failed with {0}:{1}'.format(
                             errtype.__name__, errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # check
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('check with {0}'.format(
                         impl.__class__.__name__))
                     try:
                         tmpStat, taskCloudMap = impl.doCheck(taskSpecList)
                     except Exception:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error('doCheck failed with {0}:{1}'.format(
                             errtype.__name__, errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # update
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to check assignment')
                 else:
                     tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(
                         taskCloudMap)
                     tmpLog.info('done with {0} for {1}'.format(
                         tmpRet, str(taskCloudMap)))
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))

Пример #42

Показать файл

Файл: TaskCommando.py Проект: tertychnyy/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs == None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr != None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary 
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs 
                                 if 'soft finish' in commentStr:
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['reassign','finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.taskParamMap['fixedSandbox']
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)

Пример #43

Показать файл

 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # set cloud
     try:
         if not taskParamMap:
             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                 taskSpec.jediTaskID)
             taskParamMap = RefinerUtils.decodeJSON(taskParam)
         if not taskSpec.cloud and 'cloud' in taskParamMap:
             taskSpec.cloud = taskParamMap['cloud']
     except Exception:
         pass
     # get sites in the cloud
     site_preassigned = True
     if taskSpec.site not in ['', None]:
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         if self.siteMapper.checkSite(taskSpec.site):
             scanSiteList = [taskSpec.site]
         else:
             scanSiteList = []
             for tmpSite in self.siteMapper.getCloud(
                     taskSpec.cloud)['sites']:
                 if re.search(taskSpec.site, tmpSite):
                     scanSiteList.append(tmpSite)
             if not scanSiteList:
                 tmpLog.error('unknown site={}'.format(taskSpec.site))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     elif inputChunk.getPreassignedSite() is not None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         site_preassigned = False
         scanSiteList = self.siteMapper.getCloud(taskSpec.cloud)['sites']
         # remove NA
         if 'NA' in scanSiteList:
             scanSiteList.remove('NA')
         tmpLog.debug('cloud=%s has %s candidates' %
                      (taskSpec.cloud, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status and PandaSite
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         if tmpSiteSpec.status != 'online' and not site_preassigned:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
             continue
         # check PandaSite
         if 'PandaSite' in taskParamMap and taskParamMap['PandaSite']:
             if tmpSiteSpec.pandasite != taskParamMap['PandaSite']:
                 tmpLog.debug('  skip %s due to wrong PandaSite=%s <> %s' %
                              (tmpSiteName, tmpSiteSpec.pandasite,
                               taskParamMap['PandaSite']))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize(
     ) + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS // 1024 // 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize(
         ) + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR // 1024 // 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir:
             if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec,
                                                    inputChunk):
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if minWalltime not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     origMinRamCount = inputChunk.getMaxRamCount()
     if not site_preassigned and origMinRamCount:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # job memory requirement
             if taskSpec.ramPerCore():
                 minRamCount = origMinRamCount * (
                     tmpSiteSpec.coreCount if tmpSiteSpec.coreCount else 1)
                 minRamCount += (taskSpec.baseRamCount
                                 if taskSpec.baseRamCount else 0)
             else:
                 minRamCount = origMinRamCount
             # site max memory requirement
             site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0
             # check at the site
             if site_maxmemory and minRamCount and minRamCount > site_maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(
                     tmpSiteName, site_maxmemory, minRamCount)
                 tmpLog.debug(tmpMsg)
                 continue
             # site min memory requirement
             site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0
             if site_minmemory and minRamCount and minRamCount < site_minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(
                     tmpSiteName, site_minmemory, minRamCount)
                 tmpLog.info(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if tmpSiteName in nWNmap:
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             tmpLog.debug(
                 'getting the list of available files for {0}'.format(
                     datasetSpec.datasetName))
             fileScanSiteList = []
             for tmpPseudoSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                 tmpSiteName = tmpSiteSpec.get_unified_name()
                 if tmpSiteName in fileScanSiteList:
                     continue
                 fileScanSiteList.append(tmpSiteName)
             # mapping between sites and input storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                 fileScanSiteList, self.siteMapper,
                 taskSpec.prodSourceLabel, None)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             else:
                 useCompleteOnly = False
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 check_completeness=checkCompleteness,
                 file_scan_in_container=False,
                 complete_only=useCompleteOnly)
             if tmpAvFileMap is None:
                 raise Interaction.JEDITemporaryError(
                     'ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except Exception as e:
             tmpLog.error('failed to get available files with {}'.format(e))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
         taskSpec.vo)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # files
         for tmpDatasetName, availableFiles in six.iteritems(
                 availableFileMap):
             if tmpSiteName in availableFiles:
                 siteCandidateSpec.add_local_disk_files(
                     availableFiles[tmpSiteName]['localdisk'])
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if weight not in weightMap:
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
     weightList = list(weightMap.keys())
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {} with weight={} nFiles={}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight,
             len(siteCandidateSpec.localDiskFiles)))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk

Пример #44

Показать файл

Файл: TaskCommando.py Проект: lukewayne123/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     # get active PandaIDs to be killed
                     pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                     if pandaIDs == None:
                         tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                         tmpStat = Interaction.SC_FAILED
                     # kill jobs or update task
                     if tmpStat == Interaction.SC_SUCCEEDED:
                         if pandaIDs == []:
                             # done since no active jobs
                             tmpLog.info('completed the command')
                             tmpTaskSpec = JediTaskSpec()
                             tmpTaskSpec.jediTaskID = jediTaskID
                             updateTaskStatus = True
                             if commandStr != 'reassign':
                                 # keep oldStatus for task reassignment since it is reset when actually reassigned
                                 tmpTaskSpec.forceUpdate('oldStatus')
                             else:
                                 # extract cloud or site
                                 tmpItems = commentStr.split(':')
                                 if tmpItems[0] == 'cloud':
                                     tmpTaskSpec.cloud = tmpItems[1]
                                 else:
                                     tmpTaskSpec.site = tmpItems[1]
                                 # back to oldStatus if necessary 
                                 if tmpItems[2] == 'y':
                                     tmpTaskSpec.status = oldStatus
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                     updateTaskStatus = False
                             if updateTaskStatus:
                                 tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                             tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                         else:
                             tmpLog.info('sending kill command')
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                         tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.taskParamMap['fixedSandbox']
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #45

Показать файл

    def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
        # make logger
        tmpLog = MsgWrapper(logger,
                            '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                            monToken='<jediTaskID={0} {1}>'.format(
                                taskSpec.jediTaskID,
                                datetime.datetime.utcnow().isoformat('/')))
        tmpLog.debug('start')
        # return for failure
        retFatal = self.SC_FATAL, inputChunk
        retTmpError = self.SC_FAILED, inputChunk
        # get primary site candidates
        sitePreAssigned = False
        excludeList = []
        includeList = None
        scanSiteList = []
        # get list of site access
        siteAccessList = self.taskBufferIF.listSiteAccess(
            None, taskSpec.userName)
        siteAccessMap = {}
        for tmpSiteName, tmpAccess in siteAccessList:
            siteAccessMap[tmpSiteName] = tmpAccess
        # site limitation
        if taskSpec.useLimitedSites():
            if 'excludedSite' in taskParamMap:
                excludeList = taskParamMap['excludedSite']
                # str to list for task retry
                try:
                    if type(excludeList) != types.ListType:
                        excludeList = excludeList.split(',')
                except:
                    pass
            if 'includedSite' in taskParamMap:
                includeList = taskParamMap['includedSite']
                # str to list for task retry
                if includeList == '':
                    includeList = None
                try:
                    if type(includeList) != types.ListType:
                        includeList = includeList.split(',')
                except:
                    pass
        # loop over all sites
        for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
            if tmpSiteSpec.type == 'analysis':
                scanSiteList.append(siteName)
        # preassigned
        if not taskSpec.site in ['', None]:
            # site is pre-assigned
            tmpLog.info('site={0} is pre-assigned'.format(taskSpec.site))
            sitePreAssigned = True
            if not taskSpec.site in scanSiteList:
                scanSiteList.append(taskSpec.site)
        tmpLog.info('initial {0} candidates'.format(len(scanSiteList)))
        # allowed remote access protocol
        allowedRemoteProtocol = 'fax'
        # MP
        if taskSpec.coreCount != None and taskSpec.coreCount > 1:
            # use MCORE only
            useMP = 'only'
        elif taskSpec.coreCount == 0:
            # use MCORE and normal
            useMP = 'any'
        else:
            # not use MCORE
            useMP = 'unuse'
        ######################################
        # selection for status
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # skip unified queues
            if tmpSiteSpec.is_unified:
                tmpLog.info(
                    '  skip site=%s due to is_unified=%s criteria=-unified' %
                    (tmpSiteName, tmpSiteSpec.is_unified))
                continue
            # check site status
            skipFlag = False
            if tmpSiteSpec.status in ['offline']:
                skipFlag = True
            elif tmpSiteSpec.status in ['brokeroff', 'test']:
                if not sitePreAssigned:
                    skipFlag = True
                elif tmpSiteName != taskSpec.site:
                    skipFlag = True
            if not skipFlag:
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.info(
                    '  skip site=%s due to status=%s criteria=-status' %
                    (tmpSiteName, tmpSiteSpec.status))
        scanSiteList = newScanSiteList
        tmpLog.info('{0} candidates passed site status check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for MP
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                    (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.info('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                                (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
        scanSiteList = newScanSiteList
        tmpLog.info('{0} candidates passed for useMP={1}'.format(
            len(scanSiteList), useMP))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for release
        if taskSpec.transHome != None:
            unified_site_list = self.get_unified_sites(scanSiteList)
            if taskSpec.transHome.startswith('ROOT'):
                # hack until x86_64-slc6-gcc47-opt is published in installedsw
                if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                    tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
                else:
                    tmpCmtConfig = taskSpec.architecture
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    unified_site_list,
                    cmtConfig=tmpCmtConfig,
                    onlyCmtConfig=True)
            elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None \
                    or 'AnalysisBase' in taskSpec.transHome:
                # AthAnalysis
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    unified_site_list,
                    cmtConfig=taskSpec.architecture,
                    onlyCmtConfig=True)
            else:
                # remove AnalysisTransforms-
                transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
                transHome = re.sub('_', '-', transHome)
                if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                        re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None and \
                        re.search('_\d+\.\d+\.\d+$',taskSpec.transHome) is None:
                    # cache is checked
                    siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list,
                        caches=transHome,
                        cmtConfig=taskSpec.architecture)
                elif (transHome == '' and taskSpec.transUses != None) or \
                        (re.search('_\d+\.\d+\.\d+$',taskSpec.transHome) is not None and \
                             (taskSpec.transUses is None or re.search('-\d+\.\d+$',taskSpec.transUses) is None)):
                    # remove Atlas-
                    transUses = taskSpec.transUses.split('-')[-1]
                    # release is checked
                    siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list,
                        releases=transUses,
                        cmtConfig=taskSpec.architecture)
                    siteListWithSW += self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list,
                        caches=transHome,
                        cmtConfig=taskSpec.architecture)
                else:
                    # nightlies
                    siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list, releases='CVMFS')
            newScanSiteList = []
            for tmpSiteName in unified_site_list:
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # release check is disabled or release is available
                if tmpSiteSpec.releases == ['ANY']:
                    newScanSiteList.append(tmpSiteName)
                elif tmpSiteName in siteListWithSW:
                    newScanSiteList.append(tmpSiteName)
                else:
                    # release is unavailable
                    tmpLog.info('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                                 (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
            scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
            tmpLog.info('{0} candidates passed for SW {1}:{2}:{3}'.format(
                len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
                taskSpec.architecture))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        ######################################
        # selection for memory
        minRamCount = inputChunk.getMaxRamCount()
        minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
        if not minRamCount in [0, None]:
            newScanSiteList = []
            for tmpSiteName in scanSiteList:
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # site max memory requirement
                if not tmpSiteSpec.maxrss in [0, None]:
                    site_maxmemory = tmpSiteSpec.maxrss
                else:
                    site_maxmemory = tmpSiteSpec.maxmemory
                if not site_maxmemory in [
                        0, None
                ] and minRamCount != 0 and minRamCount > site_maxmemory:
                    tmpLog.info(
                        '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                        .format(tmpSiteName, site_maxmemory, minRamCount))
                    continue
                # site min memory requirement
                if not tmpSiteSpec.minrss in [0, None]:
                    site_minmemory = tmpSiteSpec.minrss
                else:
                    site_minmemory = tmpSiteSpec.minmemory
                if not site_minmemory in [
                        0, None
                ] and minRamCount != 0 and minRamCount < site_minmemory:
                    tmpLog.info(
                        '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                        .format(tmpSiteName, site_minmemory, minRamCount))
                    continue
                newScanSiteList.append(tmpSiteName)
            scanSiteList = newScanSiteList
            ramUnit = taskSpec.ramUnit
            if ramUnit is None:
                ramUnit = 'MB'
            tmpLog.info('{0} candidates passed memory check = {1} {2}'.format(
                len(scanSiteList), minRamCount, ramUnit))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        ######################################
        # selection for scratch disk
        tmpMaxAtomSize = inputChunk.getMaxAtomSize()
        tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
        tmpOutDiskSize = taskSpec.getOutDiskSize()
        tmpWorkDiskSize = taskSpec.getWorkDiskSize()
        minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
        minDiskCountS = minDiskCountS / 1024 / 1024
        # size for direct IO sites
        if taskSpec.useLocalIO():
            minDiskCountR = minDiskCountS
        else:
            minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
            minDiskCountR = minDiskCountR / 1024 / 1024
        tmpLog.info(
            'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
            .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                    tmpWorkDiskSize))
        tmpLog.info('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
            minDiskCountS, minDiskCountR))
        newScanSiteList = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if tmpSiteSpec.maxwdir != 0:
                if tmpSiteSpec.isDirectIO():
                    minDiskCount = minDiskCountR
                else:
                    minDiskCount = minDiskCountS
                if minDiskCount > tmpSiteSpec.maxwdir:
                    tmpLog.info(
                        '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                        .format(tmpSiteName, tmpSiteSpec.maxwdir,
                                minDiskCount))
                    continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed scratch disk check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for available space in SE
        newScanSiteList = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            # check endpoint
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            tmpEndPoint = tmpSiteSpec.ddm_endpoints_output.getEndPoint(
                tmpSiteSpec.ddm_output)
            if tmpEndPoint is not None:
                # free space must be >= 200GB
                diskThreshold = 200
                tmpSpaceSize = 0
                if tmpEndPoint['space_expired'] is not None:
                    tmpSpaceSize += tmpEndPoint['space_expired']
                if tmpEndPoint['space_free'] is not None:
                    tmpSpaceSize += tmpEndPoint['space_free']
                if tmpSpaceSize < diskThreshold:
                    tmpLog.info(
                        '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                        .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                    continue
                # check if blacklisted
                if tmpEndPoint['blacklisted'] == 'Y':
                    tmpLog.info(
                        '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                        .format(tmpSiteName, tmpSiteSpec.ddm_output))
                    continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed SE space check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for walltime
        minWalltime = taskSpec.walltime
        if not minWalltime in [0, None] and minWalltime > 0:
            minWalltime *= tmpEffAtomSize
            newScanSiteList = []
            for tmpSiteName in scanSiteList:
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # check at the site
                if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                    tmpLog.info(
                        '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                        .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                    continue
                if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                    tmpLog.info(
                        '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                        .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                    continue
                newScanSiteList.append(tmpSiteName)
            scanSiteList = newScanSiteList
            tmpLog.info('{0} candidates passed walltime check ={1}{2}'.format(
                len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        ######################################
        # selection for nPilot
        nWNmap = self.taskBufferIF.getCurrentSiteData()
        newScanSiteList = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            # check at the site
            nPilot = 0
            if nWNmap.has_key(tmpSiteName):
                nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                    'updateJob']
            if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
                tmpLog.info(
                    '  skip site=%s due to no pilot criteria=-nopilot' %
                    tmpSiteName)
                if not self.testMode:
                    continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed pilot activity check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # check inclusion and exclusion
        newScanSiteList = []
        sitesForANY = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            autoSite = False
            # check exclusion
            if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
                tmpLog.info(
                    '  skip site={0} excluded criteria=-excluded'.format(
                        tmpSiteName))
                continue
            # check inclusion
            if includeList != None and not AtlasBrokerUtils.isMatched(
                    tmpSiteName, includeList):
                if 'AUTO' in includeList:
                    autoSite = True
                else:
                    tmpLog.info(
                        '  skip site={0} not included criteria=-notincluded'.
                        format(tmpSiteName))
                    continue
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # limited access
            if tmpSiteSpec.accesscontrol == 'grouplist':
                if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                        siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                    tmpLog.info(
                        '  skip site={0} limited access criteria=-limitedaccess'
                        .format(tmpSiteName))
                    continue
            # check cloud
            if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
                tmpLog.info(
                    '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                    format(tmpSiteName))
                continue
            if autoSite:
                sitesForANY.append(tmpSiteName)
            else:
                newScanSiteList.append(tmpSiteName)
        # use AUTO sites if no sites are included
        if newScanSiteList == []:
            newScanSiteList = sitesForANY
        else:
            for tmpSiteName in sitesForANY:
                tmpLog.info(
                    '  skip site={0} not included criteria=-notincluded'.
                    format(tmpSiteName))
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed inclusion/exclusion/cloud'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for data availability
        hasDDS = False
        dataWeight = {}
        remoteSourceList = {}
        if inputChunk.getDatasets() != []:
            oldScanSiteList = copy.copy(scanSiteList)
            oldScanUnifiedSiteList = self.get_unified_sites(oldScanSiteList)
            for datasetSpec in inputChunk.getDatasets():
                datasetName = datasetSpec.datasetName
                if not self.dataSiteMap.has_key(datasetName):
                    # get the list of sites where data is available
                    tmpLog.debug(
                        'getting the list of sites where {0} is available'.
                        format(datasetName))
                    tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                        self.get_unified_sites(scanSiteList), self.siteMapper,
                        self.ddmIF, datasetName)
                    if tmpSt in [
                            Interaction.JEDITemporaryError,
                            Interaction.JEDITimeoutError
                    ]:
                        tmpLog.error(
                            'temporary failed to get the list of sites where data is available, since %s'
                            % tmpRet)
                        taskSpec.setErrDiag(
                            tmpLog.uploadLog(taskSpec.jediTaskID))
                        # send info to logger
                        self.sendLogMessage(tmpLog)
                        return retTmpError
                    if tmpSt == Interaction.JEDIFatalError:
                        tmpLog.error(
                            'fatal error when getting the list of sites where data is available, since %s'
                            % tmpRet)
                        taskSpec.setErrDiag(
                            tmpLog.uploadLog(taskSpec.jediTaskID))
                        # send info to logger
                        self.sendLogMessage(tmpLog)
                        return retFatal
                    # append
                    self.dataSiteMap[datasetName] = tmpRet
                    if datasetName.startswith('ddo'):
                        tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                    else:
                        tmpLog.debug(' {0} sites : {1}'.format(
                            len(tmpRet), str(tmpRet)))
                        # check if distributed
                        if tmpRet != {}:
                            isDistributed = True
                            for tmpMap in tmpRet.values():
                                for tmpVal in tmpMap.values():
                                    if tmpVal['state'] == 'complete':
                                        isDistributed = False
                                        break
                                if not isDistributed:
                                    break
                            if isDistributed:
                                # check if really distributed
                                isDistributed = self.ddmIF.isDistributedDataset(
                                    datasetName)
                                if isDistributed:
                                    hasDDS = True
                                    datasetSpec.setDistributed()
                                    tmpLog.debug(' {0} is distributed'.format(
                                        datasetName))
                # check if the data is available at somewhere
                if self.dataSiteMap[datasetName] == {}:
                    for tmpSiteName in scanSiteList:
                        tmpLog.info(
                            '  skip site={0} data is unavailable criteria=-input'
                            .format(tmpSiteName))
                    tmpLog.error(
                        '{0} is unavailable at any site'.format(datasetName))
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    # send info to logger
                    self.sendLogMessage(tmpLog)
                    return retFatal
            # get the list of sites where data is available
            scanSiteList = None
            scanSiteListOnDisk = None
            normFactor = 0
            for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
                normFactor += 1
                # get sites where replica is available
                tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                    tmpDataSite, includeTape=True)
                tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                    tmpDataSite, includeTape=False)
                # get sites which can remotely access source sites
                if inputChunk.isMerging:
                    # disable remote access for merging
                    tmpSatelliteSites = {}
                elif (not sitePreAssigned) or (
                        sitePreAssigned and not taskSpec.site in tmpSiteList):
                    tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                        tmpDiskSiteList,
                        self.taskBufferIF,
                        self.siteMapper,
                        nSites=50,
                        protocol=allowedRemoteProtocol)
                else:
                    tmpSatelliteSites = {}
                # make weight map for local
                for tmpSiteName in tmpSiteList:
                    if not dataWeight.has_key(tmpSiteName):
                        dataWeight[tmpSiteName] = 0
                    # give more weight to disk
                    if tmpSiteName in tmpDiskSiteList:
                        dataWeight[tmpSiteName] += 1
                    else:
                        dataWeight[tmpSiteName] += 0.001
                # make weight map for remote
                for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems(
                ):
                    # skip since local data is available
                    if tmpSiteName in tmpSiteList:
                        continue
                    tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                    # negative weight for remote access
                    wRemote = 50.0
                    if not tmpSiteSpec.wansinklimit in [0, None]:
                        wRemote /= float(tmpSiteSpec.wansinklimit)
                    # sum weight
                    if not dataWeight.has_key(tmpSiteName):
                        dataWeight[tmpSiteName] = float(
                            tmpWeightSrcMap['weight']) / wRemote
                    else:
                        dataWeight[tmpSiteName] += float(
                            tmpWeightSrcMap['weight']) / wRemote
                    # make remote source list
                    if not remoteSourceList.has_key(tmpSiteName):
                        remoteSourceList[tmpSiteName] = {}
                    remoteSourceList[tmpSiteName][
                        datasetName] = tmpWeightSrcMap['source']
                # first list
                if scanSiteList == None:
                    scanSiteList = []
                    for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                        if not tmpSiteName in oldScanUnifiedSiteList:
                            continue
                        if not tmpSiteName in scanSiteList:
                            scanSiteList.append(tmpSiteName)
                    scanSiteListOnDisk = set()
                    for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(
                    ):
                        if not tmpSiteName in oldScanUnifiedSiteList:
                            continue
                        scanSiteListOnDisk.add(tmpSiteName)
                    continue
                # pickup sites which have all data
                newScanList = []
                for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                    if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                        newScanList.append(tmpSiteName)
                scanSiteList = newScanList
                tmpLog.debug('{0} is available at {1} sites'.format(
                    datasetName, len(scanSiteList)))
                # pickup sites which have all data on DISK
                newScanListOnDisk = set()
                for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                    if tmpSiteName in scanSiteListOnDisk:
                        newScanListOnDisk.add(tmpSiteName)
                scanSiteListOnDisk = newScanListOnDisk
                tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                    datasetName, len(scanSiteListOnDisk)))
            # check for preassigned
            if sitePreAssigned and not taskSpec.site in scanSiteList:
                scanSiteList = []
                tmpLog.info(
                    'data is unavailable locally or remotely at preassigned site {0}'
                    .format(taskSpec.site))
            elif len(scanSiteListOnDisk) > 0:
                # use only disk sites
                scanSiteList = list(scanSiteListOnDisk)
            scanSiteList = self.get_pseudo_sites(scanSiteList, oldScanSiteList)
            # dump
            for tmpSiteName in oldScanSiteList:
                if tmpSiteName not in scanSiteList:
                    tmpLog.info(
                        '  skip site={0} data is unavailable criteria=-input'.
                        format(tmpSiteName))
            tmpLog.info('{0} candidates have input data'.format(
                len(scanSiteList)))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retFatal
        ######################################
        # sites already used by task
        tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
            taskSpec.jediTaskID)
        if not tmpSt:
            tmpLog.error('failed to get sites which already used by task')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        sitesUsedByTask = self.get_unified_sites(sitesUsedByTask)
        ######################################
        # calculate weight
        """
        fqans = taskSpec.makeFQANs()
        tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                                  taskSpec.workingGroup,True)
        currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
        currentPriority -= 500
        tmpLog.debug('currentPriority={0}'.format(currentPriority))
        """
        tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
            taskSpec.vo)
        if not tmpSt:
            tmpLog.error('failed to get job statistics with priority')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        # check for preassigned
        if sitePreAssigned and (taskSpec.site not in scanSiteList
                                and taskSpec.site
                                not in self.get_unified_sites(scanSiteList)):
            tmpLog.info("preassigned site {0} did not pass all tests".format(
                taskSpec.site))
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retFatal
        ######################################
        # final procedure
        tmpLog.info('final {0} candidates'.format(len(scanSiteList)))
        weightMap = {}
        candidateSpecList = []
        timeWindowForFC = 6
        preSiteCandidateSpec = None
        failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
            taskSpec.jediTaskID, timeWindowForFC)
        problematicSites = set()
        for tmpPseudoSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
            tmpSiteName = tmpSiteSpec.get_unified_name()
            # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
            nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                   'running', None, None)
            nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                    tmpSiteName, 'defined',
                                                    None, None)
            nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                         AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
            nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                    tmpSiteName, 'starting',
                                                    None, None)
            nFailed = 0
            nClosed = 0
            nFinished = 0
            if tmpSiteName in failureCounts:
                if 'failed' in failureCounts[tmpSiteName]:
                    nFailed = failureCounts[tmpSiteName]['failed']
                if 'closed' in failureCounts[tmpSiteName]:
                    nClosed = failureCounts[tmpSiteName]['closed']
                if 'finished' in failureCounts[tmpSiteName]:
                    nFinished = failureCounts[tmpSiteName]['finished']
            # problematic sites
            if nFailed + nClosed > 2 * nFinished:
                problematicSites.add(tmpSiteName)
            # calculate weight
            weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                                 nStarting + 1)
            nThrottled = 0
            if remoteSourceList.has_key(tmpSiteName):
                nThrottled = AtlasBrokerUtils.getNumJobs(
                    jobStatPrioMap, tmpSiteName, 'throttled', None, None)
                weight /= float(nThrottled + 1)
            # noramize weights by taking data availability into account
            tmpDataWeight = 1
            if dataWeight.has_key(tmpSiteName):
                weight = weight * dataWeight[tmpSiteName]
                tmpDataWeight = dataWeight[tmpSiteName]
            # make candidate
            siteCandidateSpec = SiteCandidate(tmpPseudoSiteName)
            # preassigned
            if sitePreAssigned and tmpSiteName == taskSpec.site:
                preSiteCandidateSpec = siteCandidateSpec
            # set weight
            siteCandidateSpec.weight = weight
            tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
                tmpPseudoSiteName, nRunning, nAssigned, nActivated, nStarting)
            tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
                nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
            tmpLog.info(tmpStr)
            # append
            if tmpSiteName in sitesUsedByTask:
                candidateSpecList.append(siteCandidateSpec)
            else:
                if not weightMap.has_key(weight):
                    weightMap[weight] = []
                weightMap[weight].append(siteCandidateSpec)
        # sort candidates by weights
        weightList = weightMap.keys()
        weightList.sort()
        weightList.reverse()
        for weightVal in weightList:
            sitesWithWeight = weightMap[weightVal]
            random.shuffle(sitesWithWeight)
            candidateSpecList += sitesWithWeight
        # limit the number of sites. use all sites for distributed datasets
        if not hasDDS:
            maxNumSites = 10
            # remove problematic sites
            candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
                candidateSpecList, problematicSites, sitesUsedByTask,
                preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
        # append preassigned
        if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
            candidateSpecList.append(preSiteCandidateSpec)
        # collect site names
        scanSiteList = []
        for siteCandidateSpec in candidateSpecList:
            scanSiteList.append(siteCandidateSpec.siteName)
        # get list of available files
        availableFileMap = {}
        for datasetSpec in inputChunk.getDatasets():
            try:
                # get list of site to be scanned
                fileScanSiteList = []
                for tmpPseudoSiteName in scanSiteList:
                    tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                    tmpSiteName = tmpSiteSpec.get_unified_name()
                    if tmpSiteName in fileScanSiteList:
                        continue
                    fileScanSiteList.append(tmpSiteName)
                    if remoteSourceList.has_key(
                            tmpSiteName
                    ) and remoteSourceList[tmpSiteName].has_key(
                            datasetSpec.datasetName):
                        for tmpRemoteSite in remoteSourceList[tmpSiteName][
                                datasetSpec.datasetName]:
                            if not tmpRemoteSite in fileScanSiteList:
                                fileScanSiteList.append(tmpRemoteSite)
                # mapping between sites and input storage endpoints
                siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                    fileScanSiteList, self.siteMapper)

                # disable file lookup for merge jobs
                if inputChunk.isMerging:
                    checkCompleteness = False
                else:
                    checkCompleteness = True
                # get available files per site/endpoint
                tmpAvFileMap = self.ddmIF.getAvailableFiles(
                    datasetSpec,
                    siteStorageEP,
                    self.siteMapper,
                    check_completeness=checkCompleteness)
                if tmpAvFileMap == None:
                    raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed'
                availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
            except:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed to get available files with %s %s' %
                             (errtype.__name__, errvalue))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        # append candidates
        newScanSiteList = []
        for siteCandidateSpec in candidateSpecList:
            tmpPseudoSiteName = siteCandidateSpec.siteName
            tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
            tmpSiteName = tmpSiteSpec.get_unified_name()
            # preassigned
            if sitePreAssigned and tmpSiteName != taskSpec.site:
                tmpLog.info(
                    '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                    .format(tmpPseudoSiteName))
                continue
            # set available files
            if inputChunk.getDatasets() == []:
                isAvailable = True
            else:
                isAvailable = False
            for tmpDatasetName, availableFiles in availableFileMap.iteritems():
                tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
                # check remote files
                if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[
                        tmpSiteName].has_key(tmpDatasetName):
                    for tmpRemoteSite in remoteSourceList[tmpSiteName][
                            tmpDatasetName]:
                        if availableFiles.has_key(tmpRemoteSite) and \
                                len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                            # use only remote disk files
                            siteCandidateSpec.remoteFiles += availableFiles[
                                tmpRemoteSite]['localdisk']
                            # set remote site and access protocol
                            siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                            siteCandidateSpec.remoteSource = tmpRemoteSite
                            isAvailable = True
                            break
                # local files
                if availableFiles.has_key(tmpSiteName):
                    if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                            (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                        siteCandidateSpec.localDiskFiles += availableFiles[
                            tmpSiteName]['localdisk']
                        # add cached files to local list since cached files go to pending when reassigned
                        siteCandidateSpec.localDiskFiles += availableFiles[
                            tmpSiteName]['cache']
                        siteCandidateSpec.localTapeFiles += availableFiles[
                            tmpSiteName]['localtape']
                        siteCandidateSpec.cacheFiles += availableFiles[
                            tmpSiteName]['cache']
                        siteCandidateSpec.remoteFiles += availableFiles[
                            tmpSiteName]['remote']
                        siteCandidateSpec.addAvailableFiles(
                            availableFiles[tmpSiteName]['all'])
                        isAvailable = True
                    else:
                        tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                        tmpLog.debug(
                            tmpMsg.format(
                                tmpDatasetName,
                                tmpPseudoSiteName,
                                len(tmpDatasetSpec.Files),
                                len(availableFiles[tmpSiteName]['localdisk']),
                                len(availableFiles[tmpSiteName]['cache']),
                                len(availableFiles[tmpSiteName]['localtape']),
                            ))
                if not isAvailable:
                    break
            # append
            if not isAvailable:
                tmpLog.info(
                    '  skip site={0} file unavailable criteria=-fileunavailable'
                    .format(siteCandidateSpec.siteName))
                continue
            inputChunk.addSiteCandidate(siteCandidateSpec)
            newScanSiteList.append(siteCandidateSpec.siteName)
            tmpLog.info(
                '  use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use'
                .format(
                    siteCandidateSpec.siteName,
                    siteCandidateSpec.weight,
                    len(siteCandidateSpec.localDiskFiles),
                    len(siteCandidateSpec.localTapeFiles),
                    len(siteCandidateSpec.cacheFiles),
                    len(siteCandidateSpec.remoteFiles),
                ))
        scanSiteList = newScanSiteList
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        # send info to logger
        self.sendLogMessage(tmpLog)
        # return
        tmpLog.debug('done')
        return self.SC_SUCCEEDED, inputChunk

Пример #46

Показать файл

Файл: WatchDog.py Проект: ruslan33/panda-jedi

    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                          jedi_config.watchdog.waitForPicked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        timeoutForPending = None
                        if hasattr(jedi_config.watchdog,'timeoutForPendingVoLabel'): 
                            timeoutForPending = JediCoreUtils.getConfigParam(jedi_config.watchdog.timeoutForPendingVoLabel,vo,prodSourceLabel)
                        if timeoutForPending == None:
                            timeoutForPending = jedi_config.watchdog.timeoutForPending
                        timeoutForPending = int(timeoutForPending)    
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                               jedi_config.watchdog.waitForPending,
                                                                               timeoutForPending)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # unlock tasks
                        tmpLog.info('unlock tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.unlockTasks_JEDI(vo,prodSourceLabel,
                                                                    jedi_config.watchdog.waitForLocked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to unlock')
                        else:
                            tmpLog.info('unlock {0} tasks'.format(tmpRet))
                        # restart contents update
                        tmpLog.info('restart contents update for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.restartTasksForContentsUpdate_JEDI(vo,prodSourceLabel)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to restart')
                        else:
                            tmpLog.info('restarted {0} tasks'.format(tmpRet))
                        # kick exhausted tasks
                        tmpLog.info('kick exhausted tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.kickExhaustedTasks_JEDI(vo,prodSourceLabel,
                                                                           jedi_config.watchdog.waitForExhausted)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to kick')
                        else:
                            tmpLog.info('kicked {0} tasks'.format(tmpRet))
                        # finish tasks when goal is reached
                        tmpLog.info('finish achieved tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.getAchievedTasks_JEDI(vo,prodSourceLabel,
                                                                         jedi_config.watchdog.waitForAchieved)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to finish')
                        else:
                            for jediTaskID in tmpRet:
                                self.taskBufferIF.sendCommandTaskPanda(jediTaskID,'JEDI. Goal reached',True,'finish',comQualifier='soft')
                            tmpLog.info('finished {0} tasks'.format(tmpRet))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo,prodSourceLabel)
                        if impl != None:
                            tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat !=  Interaction.SC_SUCCEEDED:
                                tmpLog.error('failed to run special acction for vo={0} label={1}'.format(vo,prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
                tmpLog.info('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()

Пример #47

Показать файл

Файл: AtlasTaskSetupper.py Проект: tertychnyy/panda-jedi

 def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output','log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         if targetName.startswith('panda'):
                             lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['',None]:
                                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None:
                                         location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken)
                                 else:
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None,'']:
                                     metaData['campaign'] = taskSpec.campaign 
                                 if datasetSpec.getTransient() != None:
                                     metaData['transient'] = datasetSpec.getTransient()
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                      location,
                                                                                                                      ddmBackEnd,
                                                                                                                      lifetime,
                                                                                                                      str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                lifetime=lifetime,metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                     tmpLog.info('registring location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime,
                                                                                                                             activity,grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                             lifetime=lifetime,backEnd=ddmBackEnd,
                                                                             activity=activity,grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} with {2} for {1}'.format(locForRule,
                                                                                                                targetName,
                                                                                                                ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # open datasets
         if taskSpec.prodSourceLabel in ['managed','test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output','log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
         # return
         tmpLog.info('done')        
         return retOK
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal

Пример #48

Показать файл

    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                          jedi_config.watchdog.waitForPicked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                               jedi_config.watchdog.waitForPending,
                                                                               jedi_config.watchdog.timeoutForPending)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo,prodSourceLabel)
                        if impl != None:
                            tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat !=  Interaction.SC_SUCCEEDED:
                                tmpLog.error('failed to run special acction for vo={0} label={1}'.format(vo,prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
                tmpLog.info('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()

Пример #49

Показать файл

Файл: JobSplitter.py Проект: PanDAWMS/panda-jedi

 def doSplit(self,
             taskSpec,
             inputChunk,
             siteMapper,
             allow_chunk_size_limit=False):
     # return for failure
     retFatal = self.SC_FATAL, []
     retTmpError = self.SC_FAILED, []
     # make logger
     tmpLog = MsgWrapper(
         logger, '< jediTaskID={0} datasetID={1} >'.format(
             taskSpec.jediTaskID, inputChunk.masterIndexName))
     tmpLog.debug(
         'start chunk_size_limit={}'.format(allow_chunk_size_limit))
     if not inputChunk.isMerging:
         # set maxNumFiles using taskSpec if specified
         maxNumFiles = taskSpec.getMaxNumFilesPerJob()
         # set fsize gradients using taskSpec
         sizeGradients = taskSpec.getOutDiskSize()
         # set fsize intercepts using taskSpec
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # walltime
         if not taskSpec.useHS06():
             walltimeGradient = taskSpec.walltime
         else:
             walltimeGradient = taskSpec.getCpuTime()
         # number of events per job if defined
         nEventsPerJob = taskSpec.getNumEventsPerJob()
         # number of files per job if defined
         if not taskSpec.dynamicNumEvents():
             nFilesPerJob = taskSpec.getNumFilesPerJob()
         else:
             nFilesPerJob = None
         if nFilesPerJob is None and nEventsPerJob is None and inputChunk.useScout() \
                 and not taskSpec.useLoadXML() and not taskSpec.respectSplitRule():
             nFilesPerJob = 1
         # grouping with boundaryID
         useBoundary = taskSpec.useGroupWithBoundaryID()
         # fsize intercepts per input size
         sizeGradientsPerInSize = None
         # max primay output size
         maxOutSize = None
         # max size per job
         maxSizePerJob = taskSpec.getMaxSizePerJob()
         if maxSizePerJob is not None:
             maxSizePerJob += InputChunk.defaultOutputSize
         # dynamic number of events
         dynNumEvents = taskSpec.dynamicNumEvents()
         # max number of event ranges
         maxNumEventRanges = None
         # multiplicity of jobs
         if taskSpec.useJobCloning():
             multiplicity = 1
         else:
             multiplicity = taskSpec.getNumEventServiceConsumer()
         # split with fields
         if taskSpec.getFieldNumToLFN(
         ) is not None and taskSpec.useFileAsSourceLFN():
             splitByFields = taskSpec.getFieldNumToLFN()
         else:
             splitByFields = None
     else:
         # set parameters for merging
         maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob()
         sizeGradients = 0
         walltimeGradient = 0
         nFilesPerJob = taskSpec.getNumFilesPerMergeJob()
         nEventsPerJob = taskSpec.getNumEventsPerMergeJob()
         maxSizePerJob = None
         useBoundary = {'inSplit': 3}
         dynNumEvents = False
         maxNumEventRanges = None
         multiplicity = None
         # gradients per input size is 1 + margin
         sizeGradientsPerInSize = self.sizeGradientsPerInSizeForMerge
         # intercepts for libDS
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # mergein of 500MB
         interceptsMergin = self.interceptsMerginForMerge
         if sizeIntercepts < interceptsMergin:
             sizeIntercepts = interceptsMergin
         maxOutSize = taskSpec.getMaxSizePerMergeJob()
         if maxOutSize is None:
             # max output size is 5GB for merging by default
             maxOutSize = 5 * 1024 * 1024 * 1024
         # split with fields
         if taskSpec.getFieldNumToLFN(
         ) is not None and taskSpec.useFileAsSourceLFN():
             splitByFields = list(
                 range(4 + 1, 4 + 1 + len(taskSpec.getFieldNumToLFN())))
         else:
             splitByFields = None
     # LB
     respectLB = taskSpec.respectLumiblock()
     # dump
     tmpLog.debug(
         'maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}'
         .format(maxNumFiles, sizeGradients, sizeIntercepts, useBoundary))
     tmpLog.debug(
         'walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format(
             walltimeGradient, nFilesPerJob, nEventsPerJob))
     tmpLog.debug('useScout={} isMerging={}'.format(inputChunk.useScout(),
                                                    inputChunk.isMerging))
     tmpLog.debug(
         'sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2} dynNumEvents={3}'
         .format(sizeGradientsPerInSize, maxOutSize, respectLB,
                 dynNumEvents))
     tmpLog.debug('multiplicity={0} splitByFields={1} nFiles={2}'.format(
         multiplicity, str(splitByFields),
         inputChunk.getNumFilesInMaster()))
     # split
     returnList = []
     subChunks = []
     iSubChunks = 0
     if inputChunk.useScout() and not inputChunk.isMerging:
         default_nSubChunks = 2
     elif taskSpec.is_hpo_workflow():
         default_nSubChunks = 2
     else:
         default_nSubChunks = 25
     subChunk = None
     nSubChunks = default_nSubChunks
     strict_chunkSize = False
     while True:
         # change site
         if iSubChunks % nSubChunks == 0 or subChunk == []:
             # append to return map
             if subChunks != []:
                 # get site names for parallel execution
                 if taskSpec.getNumSitesPerJob(
                 ) > 1 and not inputChunk.isMerging and inputChunk.useJumbo != 'fake':
                     siteName = inputChunk.getParallelSites(
                         taskSpec.getNumSitesPerJob(), nSubChunks,
                         [siteName])
                 returnList.append({
                     'siteName': siteName,
                     'subChunks': subChunks,
                     'siteCandidate': siteCandidate,
                 })
                 try:
                     gshare = taskSpec.gshare.replace(' ', '_')
                 except Exception:
                     gshare = None
                 tmpLog.info('split to nJobs=%s at site=%s gshare=%s' %
                             (len(subChunks), siteName, gshare))
                 # checkpoint
                 inputChunk.checkpoint_file_usage()
                 # reset
                 subChunks = []
             # skip PQs with chunk size limit
             ngList = []
             if not allow_chunk_size_limit:
                 for siteName in inputChunk.get_candidate_names():
                     siteSpec = siteMapper.getSite(siteName)
                     if siteSpec.get_job_chunk_size() is not None:
                         ngList.append(siteName)
             # new candidate
             siteCandidate, getCandidateMsg = inputChunk.getOneSiteCandidate(
                 nSubChunks, ngSites=ngList, get_msg=True)
             if siteCandidate is None:
                 tmpLog.debug('no candidate: {0}'.format(getCandidateMsg))
                 break
             siteName = siteCandidate.siteName
             siteSpec = siteMapper.getSite(siteName)
             # set chunk size
             nSubChunks = siteSpec.get_job_chunk_size()
             if nSubChunks is None:
                 nSubChunks = default_nSubChunks
                 strict_chunkSize = False
             else:
                 strict_chunkSize = True
             # directIO
             if not JediCoreUtils.use_direct_io_for_job(
                     taskSpec, siteSpec, inputChunk):
                 useDirectIO = False
             else:
                 useDirectIO = True
             # get maxSize if it is set in taskSpec
             maxSize = maxSizePerJob
             if maxSize is None:
                 # use maxwdir as the default maxSize
                 if not useDirectIO:
                     maxSize = siteCandidate.get_overridden_attribute(
                         'maxwdir')
                     if maxSize is None:
                         maxSize = siteSpec.maxwdir
                     if maxSize:
                         maxSize *= 1024 * 1024
                 elif nEventsPerJob is not None or nFilesPerJob is not None:
                     maxSize = None
                 else:
                     maxSize = siteCandidate.get_overridden_attribute(
                         'maxwdir')
                     if maxSize is None:
                         maxSize = siteSpec.maxwdir
                     maxSize = max(50000, maxSize) * 1024 * 1024
             else:
                 # add offset
                 maxSize += sizeIntercepts
             # max disk size
             maxDiskSize = siteCandidate.get_overridden_attribute('maxwdir')
             if maxDiskSize is None:
                 maxDiskSize = siteSpec.maxwdir
             if maxDiskSize:
                 maxDiskSize *= 1024 * 1024
             # max walltime
             maxWalltime = None
             if not inputChunk.isMerging:
                 maxWalltime = taskSpec.getMaxWalltime()
             if maxWalltime is None:
                 maxWalltime = siteSpec.maxtime
             # core count
             if siteSpec.coreCount:
                 coreCount = siteSpec.coreCount
             else:
                 coreCount = 1
             # core power
             corePower = siteSpec.corepower
             # max num of event ranges for dynNumEvents
             if dynNumEvents:
                 maxNumEventRanges = int(siteSpec.get_n_sim_events() //
                                         taskSpec.get_min_granularity())
                 if maxNumEventRanges == 0:
                     maxNumEventRanges = 1
             tmpLog.debug(
                 'chosen {0} : {1} : nQueue={2} nRunCap={3}'.format(
                     siteName, getCandidateMsg, siteCandidate.nQueuedJobs,
                     siteCandidate.nRunningJobsCap))
             tmpLog.debug('new weight {0}'.format(siteCandidate.weight))
             tmpLog.debug(
                 'maxSize={0} maxWalltime={1} coreCount={2} corePower={3} maxNumEventRanges={4} maxDisk={5}'
                 .format(maxSize, maxWalltime, coreCount, corePower,
                         maxNumEventRanges, maxDiskSize))
             tmpLog.debug('useDirectIO={0} label={1}'.format(
                 useDirectIO, taskSpec.prodSourceLabel))
         # get sub chunk
         subChunk = inputChunk.getSubChunk(
             siteName,
             maxSize=maxSize,
             maxNumFiles=maxNumFiles,
             sizeGradients=sizeGradients,
             sizeIntercepts=sizeIntercepts,
             nFilesPerJob=nFilesPerJob,
             walltimeGradient=walltimeGradient,
             maxWalltime=maxWalltime,
             nEventsPerJob=nEventsPerJob,
             useBoundary=useBoundary,
             sizeGradientsPerInSize=sizeGradientsPerInSize,
             maxOutSize=maxOutSize,
             coreCount=coreCount,
             respectLB=respectLB,
             corePower=corePower,
             dynNumEvents=dynNumEvents,
             maxNumEventRanges=maxNumEventRanges,
             multiplicity=multiplicity,
             splitByFields=splitByFields,
             tmpLog=tmpLog,
             useDirectIO=useDirectIO,
             maxDiskSize=maxDiskSize,
             enableLog=True)
         if subChunk is None:
             break
         if subChunk != []:
             # append
             subChunks.append(subChunk)
         iSubChunks += 1
     # append to return map if remain
     isSkipped = False
     if subChunks != []:
         # skip if chunk size is not enough
         if allow_chunk_size_limit and strict_chunkSize and len(
                 subChunks) < nSubChunks:
             tmpLog.debug(
                 'skip splitting since chunk size {} is less than chunk size limit {} at {}'
                 .format(len(subChunks), nSubChunks, siteName))
             inputChunk.rollback_file_usage()
             isSkipped = True
         else:
             # get site names for parallel execution
             if taskSpec.getNumSitesPerJob(
             ) > 1 and not inputChunk.isMerging:
                 siteName = inputChunk.getParallelSites(
                     taskSpec.getNumSitesPerJob(), nSubChunks, [siteName])
             returnList.append({
                 'siteName': siteName,
                 'subChunks': subChunks,
                 'siteCandidate': siteCandidate,
             })
             try:
                 gshare = taskSpec.gshare.replace(' ', '_')
             except Exception:
                 gshare = None
             tmpLog.info('split to nJobs=%s at site=%s gshare=%s' %
                         (len(subChunks), siteName, gshare))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, returnList, isSkipped

Пример #50

Показать файл

Файл: TaskBroker.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks, idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info(
                 'start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(
                     idxTasks, totalTasks, taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             tmpListToAssign = []
             for tmpTaskItem in taskList:
                 tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(
                     None,
                     None,
                     None,
                     None,
                     None,
                     simTasks=[tmpTaskItem],
                     readMinFiles=True)
                 if tmpListItem is None:
                     # failed
                     tmpLog.error(
                         'failed to get the input chunks for jediTaskID={0}'
                         .format(tmpTaskItem))
                     tmpStat = Interaction.SC_FAILED
                     break
                 tmpListToAssign += tmpListItem
             # get impl
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(
                         self.vo, self.prodSourceLabel)
                     if impl is None:
                         # task refiner is undefined
                         tmpLog.error(
                             'task broker is undefined for vo={0} sourceLabel={1}'
                             .format(self.vo, self.prodSourceLabel))
                         tmpStat = Interaction.SC_FAILED
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('getImpl failed with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                     tmpStat = Interaction.SC_FAILED
             # brokerage
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('brokerage with {0} for {1} tasks '.format(
                     impl.__class__.__name__, len(tmpListToAssign)))
                 try:
                     tmpStat = impl.doBrokerage(tmpListToAssign, self.vo,
                                                self.prodSourceLabel,
                                                self.workQueue,
                                                self.resource_name)
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('doBrokerage failed with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                     tmpStat = Interaction.SC_FAILED
             # register
             if tmpStat != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed')
             else:
                 tmpLog.info('done')
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))

Пример #51

Показать файл

Файл: AtlasProdJobThrottler.py Проект: PanDAWMS/panda-jedi

    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50*1000*1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        workQueueName = workQueue.queue_name

        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                            workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
                     .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader+" "+msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(msgHeader,
                                                                                                          highestPrioWaiting,
                                                                                                          highestPrioInPandaDB,
                                                                                                          nNotRunHighestPrio,
                                                                                                          highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(msgHeader, nQueueLimit,
                                                                           configRunningCap, configQueueCap))
        tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(msgHeader,
                                                                                             nNotRun_gs + nDefine_gs,
                                                                                             nDefine_gs, nRunning_gs))
        tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(msgHeader,
                                                                                                                nNotRun_rt + nDefine_rt,
                                                                                                                nDefine_rt, nRunning_rt,
                                                                                                                totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                     nQueueLimit, totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                     nQueueLimit, totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and  nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(nNotRun_rt + nDefine_rt, nRunning_rt,
                                                                                                               threshold, nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                               nQueueLimit, totWalltime,
                                                                                                               minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(nNotRun_gs + nDefine_gs, nRunning_gs,
                                                                                                               threshold, nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                               nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(nWaiting_rt, nRunning_rt, nWaitingLimit,
                                                                                                    nJobsInBunch, nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader+" not enough jobs queued")
                self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit/20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader+" "+msgBody)
        return self.retUnThrottled