Example #1
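A JEDI watchdog routine: it takes a process lock through the task buffer interface, deletes dataset-locality records older than 24 hours, and logs how many rows were removed.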
 def doCleanDataLocality(self):
     tmpLog = MsgWrapper(logger, ' #ATM #KV doCleanDataLocality')
     tmpLog.debug('start')
     try:
         # lock
         got_lock = self.taskBufferIF.lockProcess_JEDI(  vo=self.vo, prodSourceLabel='default',
                                                         cloud=None, workqueue_id=None, resource_name=None,
                                                         component='AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality',
                                                         pid=self.pid, timeLimit=1440)
         if not got_lock:
             tmpLog.debug('locked by another process. Skipped')
             return
         tmpLog.debug('got lock')
         # lifetime of records
         record_lifetime_hours = 24
         # run
         now_timestamp = datetime.datetime.utcnow()
         before_timestamp = now_timestamp - datetime.timedelta(hours=record_lifetime_hours)
         n_rows = self.taskBufferIF.deleteOutdatedDatasetLocality_JEDI(before_timestamp)
         tmpLog.info('cleaned up {0} records'.format(n_rows))
         # done
         tmpLog.debug('done')
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))
Example #2
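Registers a DQ2 subscription for a dataset at the given location. An already-existing subscription counts as success, and unknown datasets can be ignored via ignoreUnknown.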
 def registerDatasetSubscription(self,datasetName,location,activity=None,ignoreUnknown=False):
     methodName = 'registerDatasetSubscription'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     isOK = True
     try:
         # get DQ2 API            
         dq2 = DQ2()
         # call
         dq2.registerDatasetSubscription(datasetName,location,activity=activity)
     except DQSubscriptionExistsException:
         pass
     except DQUnknownDatasetException:
         if ignoreUnknown:
             pass
         else:
             isOK = False
     except:
         isOK = False
     if not isOK:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True
Example #3
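Freezes a dataset through the DQ2 API, tolerating datasets that are already frozen and, optionally, unknown ones.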
 def freezeDataset(self,datasetName,ignoreUnknown=False):
     methodName = 'freezeDataset'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     isOK = True
     try:
         # get DQ2 API            
         dq2=DQ2()
         # freeze
         dq2.freezeDataset(datasetName)
     except DQFrozenDatasetException:
         pass
     except DQUnknownDatasetException:
         if ignoreUnknown:
             pass
         else:
             isOK = False
     except:
         isOK = False
     if isOK:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,True
     else:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
Example #4
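Main loop of the post-processor agent: for each VO and production source label it prepares and fetches tasks to be finished, processes them with a pool of PostProcessorThread workers, and sleeps so each cycle takes roughly 60 seconds.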
 def start(self):
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # prepare tasks to be finished
                     tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(vo,prodSourceLabel))
                     tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(vo,prodSourceLabel,
                                                                              jedi_config.postprocessor.nTasks,
                                                                              pid=self.pid)
                     if tmpRet == None:
                         # failed
                         tmpLog.error('failed to prepare tasks')
                     # get tasks to be finished
                     tmpLog.info('getting tasks to be finished') 
                     tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(vo,prodSourceLabel,self.pid,
                                                                           jedi_config.postprocessor.nTasks)
                     if tmpList == None: 
                         # failed
                         tmpLog.error('failed to get tasks to be finished')
                     else:
                         tmpLog.info('got {0} tasks'.format(len(tmpList)))
                         # put to a locked list
                         taskList = ListWithLock(tmpList)
                         # make thread pool
                         threadPool = ThreadPool()
                         # make workers
                         nWorker = jedi_config.postprocessor.nWorkers
                         for iWorker in range(nWorker):
                             thr = PostProcessorThread(taskList,threadPool,
                                                       self.taskBufferIF,
                                                       self.ddmIF,
                                                       self)
                             thr.start()
                         # join
                         threadPool.join()
             tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
         # sleep if needed
         loopCycle = 60
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
Example #5
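Main loop of a plugin-driven watchdog agent: for each VO and production source label it resolves the plugin implementation, runs its pre_action and doAction hooks, then sleeps for the configured cycle plus a randomized offset.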
 def start(self):
     # start base classes
     JediKnight.start(self)
     FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
     # go into main loop
     while True:
         startTime = datetime.datetime.utcnow()
         try:
             # get logger
             tmpLog = MsgWrapper(logger)
             tmpLog.info('start')
             # loop over all vos
             for vo in self.vos:
                 # loop over all sourceLabels
                 for prodSourceLabel in self.prodSourceLabels:
                     # vo/prodSourceLabel specific action
                     impl = self.getImpl(vo,
                                         prodSourceLabel,
                                         subType=self.subStr)
                     if impl is not None:
                         plugin_name = impl.__class__.__name__
                         tmpLog.info(
                             'pre-action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         impl.pre_action(tmpLog, vo, prodSourceLabel,
                                         self.pid)
                         tmpLog.info(
                             'do action for vo={} label={} cls={}'.format(
                                 vo, prodSourceLabel, plugin_name))
                         tmpStat = impl.doAction()
                         if tmpStat != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to run special action for vo={} label={} cls={}'
                                 .format(vo, prodSourceLabel, plugin_name))
                         else:
                             tmpLog.info(
                                 'done for vo={} label={} cls={}'.format(
                                     vo, prodSourceLabel, plugin_name))
             tmpLog.info('done')
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
         # sleep if needed
         loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period
         timeDelta = datetime.datetime.utcnow() - startTime
         sleepPeriod = loopCycle - timeDelta.seconds
         if sleepPeriod > 0:
             time.sleep(sleepPeriod)
         # randomize cycle
         self.randomSleep(max_val=loopCycle)
Example #6
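Worker loop of a post-processor thread: it consumes tasks in chunks of 10, instantiates the post-processor plugin for each task, runs doPostProcess and doFinalProcedure, and marks the task broken on fatal errors.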
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for taskSpec in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 # get impl
                 impl = self.implFactory.instantiateImpl(taskSpec.vo,taskSpec.prodSourceLabel,None,
                                                         self.taskBufferIF,self.ddmIF)
                 if impl == None:
                     # post processor is undefined
                     tmpLog.error('post-processor is undefined for vo={0} sourceLabel={1}'.format(taskSpec.vo,taskSpec.prodSourceLabel))
                     tmpStat = Interaction.SC_FATAL
                 # execute    
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('post-process with {0}'.format(impl.__class__.__name__))
                     try:
                         impl.doPostProcess(taskSpec,tmpLog)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('doPostProcess failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FATAL
                 # done
                 if tmpStat == Interaction.SC_FATAL:
                     # task is broken
                     tmpErrStr = 'post-process failed'
                     tmpLog.error(tmpErrStr)
                     taskSpec.status = 'broken'
                     taskSpec.setErrDiag(tmpErrStr)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})    
                 elif tmpStat == Interaction.SC_FAILED:
                     tmpErrStr = 'post processing failed'
                     taskSpec.setOnHold()
                     taskSpec.setErrDiag(tmpErrStr,True)
                     taskSpec.lockedBy = None
                     self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
                     tmpLog.info('set task_status={0} since {1}'.format(taskSpec.status,taskSpec.errorDialog))
                     continue
                 # final procedure
                 try:
                     impl.doFinalProcedure(taskSpec,tmpLog)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('doFinalProcedure failed with {0}:{1}'.format(errtype.__name__,errvalue))
                 # done
                 tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Example #7
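Worker loop of TaskBrokerThread: it pulls tasks in chunks of 100, reads their input chunks, and hands them to the brokerage plugin's doBrokerage.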
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks,idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(idxTasks,totalTasks,taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             tmpListToAssign = []
             for tmpTaskItem in taskList:
                 tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(None,None,None,None,None,
                                                                            simTasks=[tmpTaskItem],
                                                                            readMinFiles=True)
                 if tmpListItem == None:
                     # failed
                     tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(tmpTaskItem))
                     tmpStat = Interaction.SC_FAILED
                     break
                 tmpListToAssign += tmpListItem
             # get impl                    
             if tmpStat == Interaction.SC_SUCCEEDED:                    
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(self.vo,self.prodSourceLabel)
                     if impl == None:
                          # task broker is undefined
                         tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo,self.prodSourceLabel))
                         tmpStat = Interaction.SC_FAILED
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     tmpStat = Interaction.SC_FAILED
             # brokerage
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('brokerage with {0} for {1} tasks '.format(impl.__class__.__name__,len(tmpListToAssign)))
                 try:
                     tmpStat = impl.doBrokerage(tmpListToAssign,self.vo,
                                                self.prodSourceLabel,self.workQueue)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('doBrokerage failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     tmpStat = Interaction.SC_FAILED
             # register
             if tmpStat != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed')
             else:
                 tmpLog.info('done')                    
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Example #8
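Worker loop of TaskCheckerThread: it fetches task specs in chunks of 100, runs the broker plugin's doCheck, and stores the resulting task-to-cloud assignments.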
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks,idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info('start TaskCheckerThread {0}/{1} for jediTaskID={2}'.format(idxTasks,totalTasks,taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             taskSpecList = []
             for jediTaskID in taskList:
                 tmpRet,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False)
                 if tmpRet and taskSpec != None:
                     taskSpecList.append(taskSpec)
                 else:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
             if taskSpecList != []:
                 # get impl                    
                 if tmpStat == Interaction.SC_SUCCEEDED:                    
                     tmpLog.info('getting Impl')
                     try:
                         impl = self.implFactory.getImpl(self.vo,self.prodSourceLabel)
                         if impl == None:
                             # task brokerage is undefined
                             tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo,self.prodSourceLabel))
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # check
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('brokerage with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat,taskCloudMap = impl.doCheck(taskSpecList)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpLog.error('doCheck failed with {0}:{1}'.format(errtype.__name__,errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # update
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to check assignment')
                 else:
                     tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
                     tmpLog.info('done with {0} for {1}'.format(tmpRet,str(taskCloudMap)))
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Example #9
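Looks up user information through the info client after normalizing the user's DN.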
 def finger(self,userName):
     methodName = 'finger'
     methodName = '{0} userName={1}'.format(methodName,userName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # cleanup DN
         userName = parse_dn(userName)
         # exec
         tmpRet = infoClient().finger(userName)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0}:{1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,tmpRet
Example #10
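Sets the owner metadata attribute of a dataset via DQ2, using the normalized user DN.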
 def setDatasetOwner(self,datasetName,userName):
     methodName = 'setDatasetOwner'
     methodName = '{0} datasetName={1} userName={2}'.format(methodName,datasetName,userName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # cleanup DN
         userName = parse_dn(userName)
         # get DQ2 API            
         dq2=DQ2()
         # set
         dq2.setMetaDataAttribute(datasetName,'owner',userName)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True
Example #11
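Registers a dataset replica at a location via DQ2 and records the owner in the replica metadata.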
 def registerDatasetLocation(self,datasetName,location,lifetime=None,owner=None):
     methodName = 'registerDatasetLocation'
     methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # cleanup DN
         owner = parse_dn(owner)
         # get DQ2 API            
         dq2 = DQ2()
         # set
         dq2.registerDatasetLocation(datasetName,location,lifetime=lifetime)
         dq2.setReplicaMetaDataAttribute(datasetName,location,'owner',owner)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True
Example #12
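Sets a metadata attribute on a dataset via DQ2, silently ignoring unknown datasets.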
 def setDatasetMetadata(self,datasetName,metadataName,metadataValue):
     methodName = 'setDatasetMetadata'
     methodName = '{0} datasetName={1} metadataName={2} metadataValue={3}'.format(methodName,datasetName,
                                                                                  metadataName,metadataValue)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # get DQ2 API            
         dq2 = DQ2()
         # set
         dq2.setMetaDataAttribute(datasetName,metadataName,metadataValue)
     except DQUnknownDatasetException:
         pass
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
     tmpLog.info('done')
     return self.SC_SUCCEEDED,True
Example #13
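Expands a container into a sorted, de-duplicated list of its constituent dataset names; plain dataset names pass through unchanged.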
 def expandContainer(self,containerName):
     methodName = 'expandContainer'
     methodName = '{0} contName={1}'.format(methodName,containerName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         dsList = []
         # get real names
         tmpS,tmpRealNameList = self.listDatasets(containerName)
         if tmpS != self.SC_SUCCEEDED:
             tmpLog.error('failed to get real names')
             return tmpS,tmpRealNameList
         # loop over all names
         for tmpRealName in tmpRealNameList:
             # container
             if tmpRealName.endswith('/'):
                 # get contents
                 tmpS,tmpO = self.listDatasetsInContainer(tmpRealName)
                 if tmpS != self.SC_SUCCEEDED:
                     tmpLog.error('failed to get datasets in {0}'.format(tmpRealName))
                     return tmpS,tmpO
             else:
                 tmpO = [tmpRealName]
             # collect dataset names
             for tmpStr in tmpO:
                 if not tmpStr in dsList:
                     dsList.append(tmpStr)
         dsList.sort()        
         # return
         tmpLog.info('got {0}'.format(str(dsList)))
         return self.SC_SUCCEEDED,dsList
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error('failed with {0}'.format(errMsg))
         return errCode,'{0} : {1}'.format(methodName,errMsg)
Example #14
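Erases a dataset via DQ2, optionally only when it contains no files, and optionally ignoring unknown datasets.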
 def deleteDataset(self,datasetName,emptyOnly,ignoreUnknown=False):
     methodName = 'deleteDataset'
     methodName = '{0} datasetName={1}'.format(methodName,datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     isOK = True
     retStr = ''
     nFiles = -1
     try:
         # get DQ2 API            
         dq2=DQ2()
         # get the number of files
         if emptyOnly:
             nFiles = dq2.getNumberOfFiles(datasetName)
         # erase
         if not emptyOnly or nFiles == 0:
             dq2.eraseDataset(datasetName)
             retStr = 'deleted {0}'.format(datasetName)
         else:
             retStr = 'keep {0} where {1} files are available'.format(datasetName,nFiles)
     except DQUnknownDatasetException:
         if ignoreUnknown:
             pass
         else:
             isOK = False
     except:
         isOK = False
     if isOK:
         tmpLog.info('done')
         return self.SC_SUCCEEDED,retStr
     else:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
Example #15
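Worker loop of the task refiner: it decodes the stored JSON task parameters, instantiates the matching refiner plugin, extracts common parameters, honors parent-task dependencies, runs doRefine, and then either registers the refined task in JEDI or marks it broken.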
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                  errStr = ''
                  # ensure impl is defined for the error handling below
                  impl = None
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                      continue
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                          # initialize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                         # set parent tid
                         if not parent_tid in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if not parent_tid in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                              # appending for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Example #16
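A job-setup routine: it registers output/log datasets and containers in DDM with the appropriate lifetime, location rules, and metadata, adds datasets to their containers, and opens the output datasets of production tasks.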
 def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
     # make logger
     tmpLog = MsgWrapper(logger,
                         "<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(
         taskSpec.prodSourceLabel, taskSpec.taskType))
     # returns
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output', 'log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(
                                         tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(
                 str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info(
                     'getting datasetSpec with datasetID={0}'.format(
                         datasetID))
                 tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                     taskSpec.jediTaskID, datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                 # check if dataset and container are available in DDM
                 for targetName in [
                         datasetSpec.datasetName, datasetSpec.containerName
                 ]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         if targetName.startswith('panda'):
                             if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                 lifetime = 365
                             else:
                                 lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['', None]:
                                     if DataServiceUtils.getDistributedDestination(
                                             datasetSpec.storageToken
                                     ) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken
                                     ) != None:
                                         location = DataServiceUtils.getDestinationSE(
                                             datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(
                                             taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(
                                             tmpT1Name,
                                             datasetSpec.storageToken)
                                 else:
                                      tmpLog.info('site={0} token={1}'.format(
                                         datasetSpec.site,
                                         datasetSpec.storageToken))
                                     location = siteMapper.getDdmEndpoint(
                                         datasetSpec.site,
                                         datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             if taskSpec.prodSourceLabel in [
                                     'managed', 'test'
                             ] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None, '']:
                                     metaData[
                                         'campaign'] = taskSpec.campaign
                                 if datasetSpec.getTransient() != None:
                                     metaData[
                                         'transient'] = datasetSpec.getTransient(
                                         )
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info(
                                 'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                 .format(targetName, location, ddmBackEnd,
                                         lifetime, str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(
                                 targetName,
                                 backEnd=ddmBackEnd,
                                 location=location,
                                 lifetime=lifetime,
                                 metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error(
                                     'failed to register {0}'.format(
                                         targetName))
                                 return retFatal
                             # procedures for user
                             if userSetup or DataServiceUtils.getDistributedDestination(
                                     datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in [
                                         '', None
                                 ]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(
                                         datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(
                                         taskSpec.prodSourceLabel)
                                     tmpLog.info(
                                         'registering location={0} lifetime={1}days activity={2} grouping={3}'
                                         .format(locForRule, lifetime,
                                                 activity, grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(
                                         targetName,
                                         locForRule,
                                         owner=userName,
                                         lifetime=lifetime,
                                         backEnd=ddmBackEnd,
                                         activity=activity,
                                         grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error(
                                             'failed to register location {0} with {2} for {1}'
                                             .format(
                                                 locForRule, targetName,
                                                 ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(
                                 targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[
                             datasetSpec.
                             containerName] = ddmIF.listDatasetsInContainer(
                                 datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[
                             datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                         tmpStat = ddmIF.addDatasetsToContainer(
                             datasetSpec.containerName,
                             [datasetSpec.datasetName],
                             backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(
                                 datasetSpec.datasetName,
                                 datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(
                             datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(
                             datasetSpec.datasetName,
                             datasetSpec.containerName))
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(
                     datasetSpec, {
                         'jediTaskID': taskSpec.jediTaskID,
                         'datasetID': datasetID
                     })
                 # register ES datasets
                 if False:  # FIXME taskSpec.useEventService() and not taskSpec.useJobCloning() and datasetSpec.type == 'output':
                     targetName = datasetSpec.datasetName + EventServiceUtils.esSuffixDDM
                     location = None
                     metaData = {}
                     metaData['task_id'] = taskSpec.jediTaskID
                     metaData['hidden'] = True
                     tmpLog.info(
                         'registering ES dataset {0} with location={1} meta={2}'
                         .format(targetName, location, str(metaData)))
                     tmpStat = ddmIF.registerNewDataset(targetName,
                                                        location=location,
                                                        metaData=metaData)
                     if not tmpStat:
                         tmpLog.error(
                             'failed to register ES dataset {0}'.format(
                                 targetName))
                         return retFatal
                     # register rule
                     location = 'type=ES'
                     activity = DataServiceUtils.getActivityForOut(
                         taskSpec.prodSourceLabel)
                     grouping = 'NONE'
                     tmpLog.info(
                         'registering location={0} activity={1} grouping={2}'
                         .format(location, activity, grouping))
                     tmpStat = ddmIF.registerDatasetLocation(
                         targetName,
                         location,
                         activity=activity,
                         grouping=grouping)
                     if not tmpStat:
                         tmpLog.error(
                             'failed to register location {0} with {2} for {1}'
                             .format(location, targetName, activity))
                         return retFatal
         # open datasets
         if taskSpec.prodSourceLabel in ['managed', 'test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output', 'log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(
                                 tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset, 'lifetime', None)
         # return
         tmpLog.info('done')
         return retOK
     except:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(
             errtype.__name__, errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
Example #17
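Worker loop that, for each task and its input datasets, retrieves dataset metadata and the file list from DDM and drops files reported as lost. The excerpt is truncated mid-method.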
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID,dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 # get task
                 tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                     continue
                  # fall back to an empty map so the parameter checks below stay safe if decoding fails
                  taskParamMap = {}
                  try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     taskBroken = True
                 # renaming of parameters
                 if taskParamMap.has_key('nEventsPerInputFile'):
                     taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key('nFilesPerJob'):
                     nFilesPerJob = taskParamMap['nFilesPerJob']
                 # the number of chunks used by scout 
                 nChunksForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     xmlConfig = taskParamMap['loadXML']
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == 'running':
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 checkedMaster = False
                 setFrozenTime = True
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                     origNumFiles = None
                     if taskParamMap.has_key('nFiles'):
                         origNumFiles = taskParamMap['nFiles']
                     for datasetSpec in dsList:
                         tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                         # get dataset metadata
                         tmpLog.info('get metadata')
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()                    
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {'state':'closed'}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata['state'] == 'open':
                                 # dummy metadata when parent is running
                                 tmpMetadata = {'state':'mutable'}
                             gotMetadata = True
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('{0} failed to get metadata due to {1}:{2}'.format(self.__class__.__name__,
                                                                                             errtype.__name__,errvalue))
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = 'broken'
                                 taskBroken = True
                                 # update dataset status    
                                 self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                             # get the number of events in metadata
                             if taskParamMap.has_key('getNumEventsInMetadata'):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info('get files')
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                             not datasetSpec.containerName in ['',None]:
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                      getNumEvents=getNumEvents,
                                                                      skipDuplicate=skipDuplicate
                                                                      )
                                     tmpLog.info('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                         for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info('removed {0}'.format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                      'scope':None,
                                                                      'filesize':0,
                                                                      'checksum':None,
                                                                      }
                                                   }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key('nFiles'):
                                             nPFN = taskParamMap['nFiles']
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                          'scope':None,
                                                                          'filesize':0,
                                                                          'checksum':None,
                                                                          }
                             except:
                                 errtype,errvalue = sys.exc_info()[:2]
                                 tmpLog.error('{0} failed to get files due to {1}:{2}'.format(self.__class__.__name__,
                                                                                              errtype.__name__,errvalue))
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = 'broken'
                                     taskBroken = True
                                     # update dataset status    
                                     self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile  = None
                                 nEventsPerJob   = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key('nEventsPerFile')) or \
                                         (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents')):
                                     if taskParamMap.has_key('nEventsPerFile'):
                                         nEventsPerFile = taskParamMap['nEventsPerFile']
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap['nEvents']
                                     if taskParamMap.has_key('nEventsPerJob'):
                                         nEventsPerJob = taskParamMap['nEventsPerJob']
                                     elif taskParamMap.has_key('nEventsPerRange'):
                                         nEventsPerRange = taskParamMap['nEventsPerRange']
                                 # max attempts
                                 maxAttempt = None
                                 if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                     # max attempts 
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry 
                                         maxAttempt = 1
                                     elif taskParamMap.has_key('maxAttempt'):
                                         maxAttempt = taskParamMap['maxAttempt']
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                 # first event number
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None 
                                 if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                     nMaxEvents = taskParamMap['nEvents']
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if taskParamMap.has_key('nFiles'):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap['nFiles']
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                          # multiplied by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                             if taskParamMap.has_key('nEventsPerJob'):
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key('nEventsPerRange'):
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
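                                          # e.g. with nEventsPerFile=1000 and nEventsPerJob=100 each input
                                          # file yields 10 jobs, so nMaxFiles is scaled by 10 and rounded
                                          # up with math.ceil (illustrative numbers)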
                                 # use scout
                                 useScout = False    
                                 if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                     useScout = True
                                 # use files with new attempt numbers    
                                 useFilesWithNewAttemptNr = False
                                 if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info('update contents')
                                 retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                           tmpMetadata['state'],
                                                                                                                           stateUpdateTime,
                                                                                                                           nEventsPerFile,
                                                                                                                           nEventsPerJob,
                                                                                                                           maxAttempt,
                                                                                                                           firstEventNumber,
                                                                                                                           nMaxFiles,
                                                                                                                           nMaxEvents,
                                                                                                                           useScout,
                                                                                                                           fileList,
                                                                                                                           useFilesWithNewAttemptNr,
                                                                                                                           nFilesPerJob,
                                                                                                                           nEventsPerRange,
                                                                                                                           nChunksForScout,
                                                                                                                           includePatt,
                                                                                                                           excludePatt,
                                                                                                                           xmlConfig,
                                                                                                                           noWaitParent,
                                                                                                                           taskSpec.parent_tid,
                                                                                                                           self.pid)
                                 if retDB == False:
                                     taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                      diagMap['errMsg']))
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                     tmpLog.info('escape since task or dataset is locked')
                                     break
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                            'missingFiles':missingFileList} 
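                                      # missingMap is handed later to TaskGenerator.doGenerate as
                                      # missingFilesMap when parent tasks are made for the missing files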
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key('nFiles'):
                                         if datasetSpec.isMaster():
                                             taskParamMap['nFiles'] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nChunksForScout = diagMap['nChunksForScout']
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         checkedMaster = True
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap['isRunningTask']:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0):
                                     tmpErrStr = 'insufficient inputs are ready'
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                                     setFrozenTime = False
                                     break
                         tmpLog.info('end loop')
                 # no master input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                     tmpErrStr = 'no master input files. input dataset is empty'
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr,None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = 'tobroken'
                     tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo,taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                    self.ddmIF.getInterface(taskSpec.vo))
                             if not tmpStat:
                                 tmpErrStr = 'failed to initialize TaskGenerator'
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = 'tobroken'
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info('make parent tasks with {0} (if necessary)'.format(taskGenerator.getClassName(taskSpec.vo,
                                                                                                                           taskSpec.prodSourceLabel)))
                                 tmpStat = taskGenerator.doGenerate(taskSpec,taskParamMap,missingFilesMap=missingMap)
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = 'tobroken'
                                     tmpLog.error('failed to make parent tasks')
                         # go to pending state
                         if not taskSpec.status in ['broken','tobroken']:
                             taskSpec.setOnHold()
                         tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                     elif allUpdated:
                         # all OK
                         allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,
                                                                                                    pid=self.pid)
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                 tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
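
A minimal, self-contained sketch (illustrative only, not from the source) of how the status flags collected in the loop above translate into the final task state; the real transition is performed by updateTaskStatusByContFeeder_JEDI:

    def decide_task_status(taskBroken, taskOnHold, runningTask, allUpdated):
        # simplified mirror of the flag handling at the end of runImpl
        if taskBroken:
            return 'tobroken'
        if runningTask:
            return 'unchanged'  # running tasks keep their current status
        if taskOnHold:
            return 'pending'    # taskSpec.setOnHold() parks the task
        if allUpdated:
            return 'ready'      # actual value is decided on the DB side
        return 'unchanged'

    print(decide_task_status(False, True, False, False))  # -> 'pending'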
Example #18
0
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        # replace blanks in the queue name for compact logging
        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch
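            # e.g. nJobsInBunch=500 with nBunch=4 gives a default nQueueLimit of 2000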

        # allow a larger bunch for heavy ion and reprocessing queues
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit +
                                                  nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs,
                   nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt,
                    nRunning_rt, totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nNotRun_rt + nDefine_rt, nRunning_rt, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(
                    nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                    nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                    nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                    nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader + " not enough jobs queued")
                if workQueue.queue_name not in non_rt_wqs:
                    self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
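
A self-contained sketch (with illustrative numbers, not from the source) of the submission-budget arithmetic used above:

    def submission_budget(nRunning, nNotRun, threshold=2.0,
                          nJobsInBunchMin=500, nJobsInBunchMax=600, nBunch=4):
        # keep the number of queued jobs below threshold x running jobs
        remaining = int(nRunning * threshold - nNotRun)
        # clamp into the [min, max] band to avoid too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, remaining), nJobsInBunchMax)
        # default queue limit when no central configuration is set
        nQueueLimit = nJobsInBunch * nBunch
        return nJobsInBunch, nQueueLimit

    print(submission_budget(1000, 1500))  # -> (500, 2000)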
Example #19
0
 def runImpl(self):
      # cutoff for free disk space in GB
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
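      # the config value is apparently given in GB and converted to bytes here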
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
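      # the *_FRACTION config values are percentages, e.g. 10 -> 0.10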
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                     'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei(
                         )
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(
                                     tmpDatasetSpec.storageToken
                             ) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP[
                                 'space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(
                                     self.fullRW[tmpNucleus] / 10 / 24 /
                                     3600 * 0.25)
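                              # e.g. fullRW = 8.64e6 -> 8.64e6/10/24/3600*0.25 = 2.5 GB reserved
                              # (illustrative value; 0.25 GB per cpuTime/corePower/day as above)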
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check {1} TB'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName
                         ) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                         if taskParamMap.get(
                                 'taskBrokerOnMaster'
                         ) is True and not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                         if datasetSpec.isMaster(
                         ) and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if taskSpec.inputPreStaging(
                             ) and availableData[tmpNucleus][
                                     'ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_size_any'],
                                     availableData[tmpNucleus]['tot_size'],
                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                  tmpMsg = '  skip nucleus={0} due to insufficient number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_num_any'],
                                     availableData[tmpNucleus]['tot_num'],
                                     thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         totInputSize = list(availableData.values(
                         ))[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                 weight /= float(availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
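
The final selection above is a classic roulette-wheel draw: each nucleus owns a slice of [0, totalWeight] proportional to its weight, a uniform random point selects a slice, and the last candidate acts as a fallback against floating-point rounding. A minimal standalone sketch of the same technique (candidate names and weights here are illustrative, not taken from PanDA):

import random

def pick_weighted(candidates):
    """Pick one (name, weight) pair with probability proportional to weight."""
    total_weight = sum(weight for _, weight in candidates)
    target = random.uniform(0, total_weight)
    for name, weight in candidates:
        target -= weight
        if target <= 0:
            return name
    # fall back to the last candidate, as the broker does, to absorb rounding
    return candidates[-1][0]

# hypothetical weights standing in for the RW/data/free-space factors above
print(pick_weighted([('NUCLEUS_A', 0.5), ('NUCLEUS_B', 0.3), ('NUCLEUS_C', 0.2)]))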
Example #20
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.error(errStr)
                     tmpStat = Interaction.SC_FAILED
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')                    
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if not parent_tid in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # no wait for parent
                         if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             errStr = 'pending until parent produces input'
                             tmpLog.info(errStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             continue
                         else:
                             errStr = 'failed to refine task'
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID})
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = 'tobroken'
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:        
                             # appending for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
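
The refiner loop above dispatches each task to an implementation chosen by (vo, prodSourceLabel, taskType) and fails the task when no refiner is defined. A toy sketch of that factory dispatch, independent of JEDI (class and method names below are invented for illustration; the real FactoryBase resolves classes from configuration instead):

class EchoRefiner:
    def do_refine(self, task_id, params):
        print('refining task {0} with {1}'.format(task_id, params))
        return True

class ImplFactory:
    """Toy registry keyed by (vo, label, task_type)."""
    def __init__(self):
        self._registry = {}

    def register(self, vo, label, task_type, klass):
        self._registry[(vo, label, task_type)] = klass

    def instantiate_impl(self, vo, label, task_type):
        klass = self._registry.get((vo, label, task_type))
        return klass() if klass is not None else None

factory = ImplFactory()
factory.register('atlas', 'managed', 'prod', EchoRefiner)
impl = factory.instantiate_impl('atlas', 'managed', 'prod')
if impl is None:
    print('task refiner is undefined')  # mirrors the error branch above
else:
    impl.do_refine(123, {'nFiles': 10})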
Example #21
0
 def undo_preassign(self):
     tmp_log = MsgWrapper(logger, 'undo_preassign')
     # refresh
     self.refresh()
     # busy sites
     busy_sites_dict = self.get_busy_sites()
     # loop to undo preassignment
     for prod_source_label in self.prodSourceLabelList:
         # parameter from GDP config
         max_preassigned_tasks = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if max_preassigned_tasks is None:
             max_preassigned_tasks = 3
         min_files_ready = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_ready is None:
             min_files_ready = 50
         min_files_remaining = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_remaining is None:
             min_files_remaining = 100
         # clean up outdated blacklist
         blacklist_duration_hours = 12
         blacklisted_tasks_map_orig = self._get_from_bt_cache()
         blacklisted_tasks_map = copy.deepcopy(blacklisted_tasks_map_orig)
         now_time = datetime.datetime.utcnow()
         min_allowed_time = now_time - datetime.timedelta(
             hours=blacklist_duration_hours)
         min_allowed_ts = int(min_allowed_time.timestamp())
         for ts_str in blacklisted_tasks_map_orig:
             ts = int(ts_str)
             if ts < min_allowed_ts:
                 del blacklisted_tasks_map[ts_str]
         self._update_to_bt_cache(blacklisted_tasks_map)
         n_bt_old = sum([
             len(bt_list)
             for bt_list in blacklisted_tasks_map_orig.values()
         ])
         n_bt = sum(
             [len(bt_list) for bt_list in blacklisted_tasks_map.values()])
         tmp_log.debug(
             'done blacklist cleanup; {n_bt_old} tasks before, {n_bt} tasks now in blacklist'
             .format(n_bt_old=n_bt_old, n_bt=n_bt))
         # get a copy of preassigned_tasks_map from cache
         preassigned_tasks_map_orig = self._get_from_pt_cache()
         preassigned_tasks_map = copy.deepcopy(preassigned_tasks_map_orig)
         # clean up task_orig_attr_map in cache
         task_orig_attr_map_orig = self._get_from_attr_cache()
         task_orig_attr_map = copy.deepcopy(task_orig_attr_map_orig)
         all_preassigned_taskids = set()
         for taskid_list in preassigned_tasks_map_orig.values():
             all_preassigned_taskids |= set(taskid_list)
         for taskid_str in task_orig_attr_map_orig:
             taskid = int(taskid_str)
             if taskid not in all_preassigned_taskids:
                 del task_orig_attr_map[taskid_str]
         self._update_to_attr_cache(task_orig_attr_map)
         # loop on preassigned tasks in cache
         for key_name in preassigned_tasks_map_orig:
             # parse key name = site + resource_type
             site, resource_type = key_name.split('|')
             # preassigned tasks in cache
             preassigned_tasks_cached = preassigned_tasks_map.get(
                 key_name, [])
             # force_undo=True undoes all tasks at busy sites; force_undo=False undoes only tasks not in a status to generate jobs
             force_undo = False
             if site in busy_sites_dict or len(preassigned_tasks_cached) > max_preassigned_tasks:
                 force_undo = True
             reason_str = 'site busy or offline or with too many preassigned tasks' if force_undo \
                             else 'task paused/terminated or without enough files to process'
             # parameters for undo, kinda ugly
             params_map = {
                 ':min_files_ready': min_files_ready,
                 ':min_files_remaining': min_files_remaining,
             }
             # undo preassign
             had_undo = False
             updated_tasks = []
             if DRY_RUN:
                 if force_undo:
                     updated_tasks = list(preassigned_tasks_cached)
                     n_tasks = len(updated_tasks)
                 else:
                     preassigned_tasks_list = []
                     preassigned_tasks_params_map = {}
                     for j, taskid in enumerate(preassigned_tasks_cached):
                         pt_param = ':pt_{0}'.format(j + 1)
                         preassigned_tasks_list.append(pt_param)
                         preassigned_tasks_params_map[pt_param] = taskid
                     if not preassigned_tasks_list:
                         continue
                     preassigned_tasks_params_str = ','.join(
                         preassigned_tasks_list)
                     dry_sql_query = (
                         "SELECT t.jediTaskID "
                         "FROM {jedi_schema}.JEDI_Tasks t "
                         "WHERE t.jediTaskID IN ({preassigned_tasks_params_str}) "
                         "AND t.site IS NOT NULL "
                         "AND NOT ( "
                         "t.status IN ('ready','running') "
                         "AND EXISTS ( "
                         "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                         "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                         "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                         ") "
                         ") ").format(
                             jedi_schema=jedi_config.db.schemaJEDI,
                             preassigned_tasks_params_str=preassigned_tasks_params_str)
                     res = self.taskBufferIF.querySQL(
                         dry_sql_query, preassigned_tasks_params_map)
                     n_tasks = 0 if res is None else len(res)
                     if n_tasks > 0:
                         updated_tasks = [x[0] for x in res]
                 # tmp_log.debug('[dry run] {} {} force={}'.format(key_name, str(updated_tasks), force_undo))
                 had_undo = True
                 if n_tasks > 0:
                     tmp_log.debug(
                         '[dry run] {key_name:<64} {n_tasks:>3} preassigned tasks would be undone ({reason_str}) '
                         .format(key_name=key_name,
                                 n_tasks=n_tasks,
                                 reason_str=reason_str))
             else:
                 updated_tasks = self.taskBufferIF.undoPreassignedTasks_JEDI(
                     preassigned_tasks_cached,
                     task_orig_attr_map=task_orig_attr_map,
                     params_map=params_map,
                     force=force_undo)
                 if updated_tasks is None:
                     # dbproxy method failed
                     tmp_log.error(
                         '{key_name:<64} failed to undo preassigned tasks (force={force_undo})'
                         .format(key_name=key_name, force_undo=force_undo))
                 else:
                     had_undo = True
                     n_tasks = len(updated_tasks)
                     if n_tasks > 0:
                         tmp_log.info(
                             '{key_name:<64} {n_tasks:>3} preassigned tasks undone ({reason_str}) : {updated_tasks} '
                             .format(key_name=key_name,
                                     n_tasks=str(n_tasks),
                                     reason_str=reason_str,
                                     updated_tasks=updated_tasks))
                         # Kibana log
                         for taskid in updated_tasks:
                             tmp_log.debug(
                                 '#ATM #KV jediTaskID={taskid} action=undo_preassign site={site} rtype={rtype} un-preassigned since {reason_str}'
                                 .format(taskid=taskid,
                                         site=site,
                                         rtype=resource_type,
                                         reason_str=reason_str))
             # update preassigned_tasks_map into cache
             if had_undo:
                 if force_undo:
                     del preassigned_tasks_map[key_name]
                 else:
                     tmp_tasks_set = set(preassigned_tasks_cached) - set(
                         updated_tasks)
                     if not tmp_tasks_set:
                         del preassigned_tasks_map[key_name]
                     else:
                         preassigned_tasks_map[key_name] = list(
                             tmp_tasks_set)
                 self._update_to_pt_cache(preassigned_tasks_map)
             # update blacklisted_tasks_map into cache
             if had_undo and not force_undo:
                 blacklisted_tasks_map_orig = self._get_from_bt_cache()
                 blacklisted_tasks_map = copy.deepcopy(
                     blacklisted_tasks_map_orig)
                 now_time = datetime.datetime.utcnow()
                 now_rounded_ts = int(
                     now_time.replace(minute=0, second=0,
                                      microsecond=0).timestamp())
                 ts_str = str(now_rounded_ts)
                 if ts_str in blacklisted_tasks_map_orig:
                     tmp_bt_list = blacklisted_tasks_map[ts_str]
                     blacklisted_tasks_map[ts_str] = list(
                         set(tmp_bt_list) | set(updated_tasks))
                 else:
                     blacklisted_tasks_map[ts_str] = list(updated_tasks)
                 self._update_to_bt_cache(blacklisted_tasks_map)
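
The blacklist bookkeeping above keys entries by an hour-rounded Unix timestamp, so expiring old entries is just dropping whole buckets older than the cutoff. A self-contained sketch of that bucket scheme, with a plain dict standing in for the watchdog's cache:

import copy
import datetime

BLACKLIST_DURATION_HOURS = 12  # same default as the cleanup above

def add_to_blacklist(bt_map, task_ids, now=None):
    """File task IDs under the bucket of the current hour."""
    now = now or datetime.datetime.utcnow()
    ts_str = str(int(now.replace(minute=0, second=0, microsecond=0).timestamp()))
    bt_map[ts_str] = list(set(bt_map.get(ts_str, [])) | set(task_ids))

def clean_blacklist(bt_map_orig, now=None):
    """Return a copy with buckets older than the blacklist duration removed."""
    now = now or datetime.datetime.utcnow()
    cutoff = int((now - datetime.timedelta(hours=BLACKLIST_DURATION_HOURS)).timestamp())
    bt_map = copy.deepcopy(bt_map_orig)
    for ts_str in bt_map_orig:
        if int(ts_str) < cutoff:
            del bt_map[ts_str]
    return bt_map

bt = {}
add_to_blacklist(bt, [101, 102])
print(clean_blacklist(bt))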
Example #22
0
 def doSetup(self,taskSpec,datasetToRegister):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         if datasetToRegister != []:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
             # get DDM I/F
             ddmIF = self.ddmIF.getInterface(taskSpec.vo)
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # register dataset/container
                             tmpLog.info('registering {0}'.format(targetName))
                             tmpStat = ddmIF.registerNewDataset(targetName)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             if userSetup:
                                 # set owner
                                 tmpLog.info('setting owner={0}'.format(taskSpec.userName))
                                 tmpStat = ddmIF.setDatasetOwner(targetName,taskSpec.userName)
                                 if not tmpStat:
                                     tmpLog.error('failed to set ownership {0} with {1}'.format(targetName,
                                                                                                taskSpec.userName))
                                     return retFatal
                                 # register location
                                 if targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]: 
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                                     tmpLog.info('registering location={0}'.format(location))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,location,owner=taskSpec.userName)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} for {1}'.format(location,
                                                                                                       targetName))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName])
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # return
         tmpLog.info('done')        
         return retOK
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
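
doSetup stays idempotent by checking DDM before every registration and by caching what it has already seen (avDatasetList, cnDatasetMap), so each dataset or container is checked at most once per call. A reduced sketch of that check-then-register pattern, with a stubbed DDM interface in place of the real one:

class FakeDdmIF:
    """Stand-in for the DDM interface; only the two calls used below."""
    def __init__(self):
        self._datasets = set()

    def listDatasets(self, name):
        return [name] if name in self._datasets else []

    def registerNewDataset(self, name):
        self._datasets.add(name)
        return True

def ensure_registered(ddm, names):
    """Register each name once, skipping those already seen locally or known to DDM."""
    seen = set()
    for name in names:
        if name is None or name in seen:
            continue
        if ddm.listDatasets(name) == []:
            ddm.registerNewDataset(name)
        seen.add(name)
    return seen

ddm = FakeDdmIF()
print(sorted(ensure_registered(ddm, ['task.out.ds1', 'task.out.cont', 'task.out.ds1'])))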
Example #23
0
 def getAvailableFiles(self,datasetSpec,siteEndPointMap,siteMapper,ngGroup=[],checkLFC=False):
     # make logger
     methodName = 'getAvailableFiles'
     methodName += ' <datasetID={0}>'.format(datasetSpec.datasetID)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start datasetName={0}'.format(datasetSpec.datasetName))
     try:
         # list of NG endpoints
         ngEndPoints = []
         if 1 in ngGroup:
             ngEndPoints += ['_SCRATCHDISK$','_LOCALGROUPDISK$','_LOCALGROUPTAPE$','_USERDISK$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
         if 2 in ngGroup:
             ngEndPoints += ['_LOCALGROUPTAPE$',
                            '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
         # get all associated endpoints
         siteAllEndPointsMap = {}
         for siteName,endPointPattList in siteEndPointMap.iteritems():
             # get all endpoints matching with patterns 
             allEndPointList = []
             for endPointPatt in endPointPattList:
                 if '*' in endPointPatt:
                     # wildcard
                     endPointPatt = endPointPatt.replace('*','.*')
                     for endPointToA in TiersOfATLAS.getAllDestinationSites():
                         if re.search('^'+endPointPatt+'$',endPointToA) != None:
                             if not endPointToA in allEndPointList:
                                 allEndPointList.append(endPointToA)
                 else:
                     # normal endpoint
                     if endPointPatt in TiersOfATLAS.getAllDestinationSites() and \
                            not endPointPatt in allEndPointList:
                         allEndPointList.append(endPointPatt)
             # get associated endpoints
             siteAllEndPointsMap[siteName] = []
             for endPoint in allEndPointList:
                 # append
                 if not self.checkNGEndPoint(endPoint,ngEndPoints) and \
                         not endPoint in siteAllEndPointsMap[siteName]:
                     siteAllEndPointsMap[siteName].append(endPoint)
                 else:
                     # already checked
                     continue
                 # get alternate name
                 altName = TiersOfATLAS.getSiteProperty(endPoint,'alternateName')
                 if altName != None and altName != ['']:
                     for assEndPoint in TiersOfATLAS.resolveGOC({altName[0]:None})[altName[0]]:
                         if not assEndPoint in siteAllEndPointsMap[siteName] and \
                                not self.checkNGEndPoint(assEndPoint,ngEndPoints):
                             siteAllEndPointsMap[siteName].append(assEndPoint)
         # get replica map
         tmpStat,tmpOut = self.listDatasetReplicas(datasetSpec.datasetName)
         if tmpStat != self.SC_SUCCEEDED:
             tmpLog.error('failed to get dataset replicas with {0}'.format(tmpOut))
             raise tmpStat,tmpOut
         datasetReplicaMap = tmpOut
         # collect SE, LFC hosts, storage path, storage type
         lfcSeMap = {}
         storagePathMap = {}
         completeReplicaMap = {}
         siteHasCompleteReplica = False
         for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
             tmpLfcSeMap = {}
             tmpStoragePathMap = {}
             tmpSiteSpec = siteMapper.getSite(siteName)
             for tmpEndPoint in allEndPointList:
                 # storage type
                 if TiersOfATLAS.isTapeSite(tmpEndPoint):
                     storageType = 'localtape'
                 else:
                     storageType = 'localdisk'
                 # no scan when site has complete replicas
                 if datasetReplicaMap.has_key(tmpEndPoint) and datasetReplicaMap[tmpEndPoint][-1]['found'] != None \
                    and datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                     completeReplicaMap[tmpEndPoint] = storageType
                     siteHasCompleteReplica = True
                 # no LFC scan for many-time datasets
                 if datasetSpec.isManyTime():
                     continue
                 # get LFC
                 lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
                 # add map
                 if not tmpLfcSeMap.has_key(lfc):
                     tmpLfcSeMap[lfc] = []
                 # get SE
                 seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                 tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
                 if tmpMatch != None:
                     se = tmpMatch.group(1)
                     if not se in tmpLfcSeMap[lfc]:
                         tmpLfcSeMap[lfc].append(se)
                 else:
                     tmpLog.error('failed to extract SE from %s for %s:%s' % (seStr,siteName,tmpEndPoint))
                 # get SE + path
                 seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                 tmpMatch = re.search('(srm://.+)$',seStr)
                 if tmpMatch == None:
                     tmpLog.error('failed to extract SE+PATH from %s for %s:%s' % (seStr,siteName,tmpEndPoint))
                     continue
                 # add full path to storage map
                 tmpSePath = tmpMatch.group(1)
                 tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
                 # add compact path
                 tmpSePath = re.sub('(:\d+)*/srm/[^\?]+\?SFN=','',tmpSePath)
                 tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
             # add to map to trigger LFC scan if complete replica is missing at the site
             if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                 pass
             elif not siteHasCompleteReplica or checkLFC:
                 for tmpKey,tmpVal in tmpLfcSeMap.iteritems():
                     if not lfcSeMap.has_key(tmpKey):
                         lfcSeMap[tmpKey] = []
                     lfcSeMap[tmpKey] += tmpVal
                 for tmpKey,tmpVal in tmpStoragePathMap.iteritems():
                     storagePathMap[tmpKey] = tmpVal
         # collect GUIDs and LFNs
         fileMap        = {}
         lfnMap         = {}
         lfnFileSpecMap = {}
         scopeMap       = {}
         for tmpFile in datasetSpec.Files:
             fileMap[tmpFile.GUID] = tmpFile.lfn
             lfnMap[tmpFile.lfn] = tmpFile
             lfnFileSpecMap[tmpFile.lfn] = tmpFile
             scopeMap[tmpFile.lfn] = tmpFile.scope
         # get SURLs
         surlMap = {}
         for lfcHost,seList in lfcSeMap.iteritems():
             tmpLog.debug('lookup in LFC:{0} for {1}'.format(lfcHost,str(seList)))               
             tmpStat,tmpRetMap = self.getSURLsFromLFC(fileMap,lfcHost,seList,scopes=scopeMap)
             tmpLog.debug(str(tmpStat))
             if tmpStat != self.SC_SUCCEEDED:
                 raise RuntimeError,tmpRetMap
             for lfn,surls in tmpRetMap.iteritems():
                 if not surlMap.has_key(lfn):
                     surlMap[lfn] = surls
                 else:
                     surlMap[lfn] += surls
         # make return
         returnMap = {}
         for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
             # set default return values
             if not returnMap.has_key(siteName):
                 returnMap[siteName] = {'localdisk':[],'localtape':[],'cache':[],'remote':[]}
             # loop over all files    
             tmpSiteSpec = siteMapper.getSite(siteName)                
             # check if the file is cached
             if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                 for tmpFileSpec in datasetSpec.Files:
                     # add to cached file list
                     returnMap[siteName]['cache'].append(tmpFileSpec)
             # complete replicas
             if not checkLFC:        
                 for tmpEndPoint in allEndPointList:
                     if completeReplicaMap.has_key(tmpEndPoint):
                         storageType = completeReplicaMap[tmpEndPoint]
                         returnMap[siteName][storageType] += datasetSpec.Files
         # loop over all available LFNs
         avaLFNs = surlMap.keys()
         avaLFNs.sort()
         for tmpLFN in avaLFNs:
             tmpFileSpec = lfnFileSpecMap[tmpLFN]
             # loop over all SURLs
             for tmpSURL in surlMap[tmpLFN]:
                 for tmpSePath in storagePathMap.keys():
                     # check SURL
                     if tmpSURL.startswith(tmpSePath):
                         # add
                         siteName = storagePathMap[tmpSePath]['siteName']
                         storageType = storagePathMap[tmpSePath]['storageType']
                         if not tmpFileSpec in returnMap[siteName][storageType]:
                             returnMap[siteName][storageType].append(tmpFileSpec)
                         break
         # dump
         dumpStr = ''
         for siteName,storageTypeFile in returnMap.iteritems():
             dumpStr += '{0}:('.format(siteName)
             for storageType,fileList in storageTypeFile.iteritems():
                 dumpStr += '{0}:{1},'.format(storageType,len(fileList))
             dumpStr = dumpStr[:-1]
             dumpStr += ') '
         dumpStr = dumpStr[:-1]
         tmpLog.debug(dumpStr)
         # return
         tmpLog.info('done')            
         return self.SC_SUCCEEDED,returnMap
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return self.SC_FAILED,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
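
Mapping SURLs back to sites works by prefix matching against the collected storage paths: a file counts as available at a site when one of its SURLs starts with one of that site's storage prefixes, and the first match wins. A compact sketch with invented endpoints and paths:

storage_path_map = {
    'srm://se.example.org/atlas/disk': {'siteName': 'SITE_A', 'storageType': 'localdisk'},
    'srm://tape.example.org/atlas': {'siteName': 'SITE_B', 'storageType': 'localtape'},
}

def classify_surl(surl):
    """Return (siteName, storageType) for the first matching storage prefix, else None."""
    for prefix, info in storage_path_map.items():
        if surl.startswith(prefix):
            return info['siteName'], info['storageType']
    return None

print(classify_surl('srm://se.example.org/atlas/disk/mc16/file.root'))  # ('SITE_A', 'localdisk')
print(classify_surl('srm://tape.example.org/atlas/data18/file.root'))   # ('SITE_B', 'localtape')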
Example #24
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID, commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill', 'finish', 'reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                 jediTaskID)
                         elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(
                                 jediTaskID, True)
                         if pandaIDs is None:
                             tmpLog.error(
                                 'failed to get PandaIDs for jediTaskID={0}'
                                 .format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr is not None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(
                                             tmpItems[0], tmpItems[1])
                                         tmpLog.sendMsg(
                                             tmpMsg, self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate(
                                                 'oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info(
                                         'updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(
                                         jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info(
                                             'wait until datasets are updated to finish'
                                         )
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                                         jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(
                                     tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(
                                     tmpTaskSpec,
                                     {'jediTaskID': jediTaskID},
                                     setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(
                                     str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 if commentStr and 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                         jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(
                                         len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(
                                         queuedPandaIDs, commentStr, '52',
                                         True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg, self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '52',
                                             True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '51',
                                             True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '50',
                                             True)
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                 elif commandStr in ['retry', 'incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                                 jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(
                                 taskParam)
                             # remove some params
                             for newKey in ['nFiles', 'fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except Exception:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(
                                 commentStr)
                             # change params
                             for newKey, newVal in iteritems(newParamMap):
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and \
                                             re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(
                                 taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(
                                 jediTaskID, strTaskParams)
                             if tmpRet is not True:
                                 tmpLog.error(
                                     'failed to update task params')
                                 continue
                         except Exception as e:
                             tmpLog.error(
                                 'failed to change task params with {} {}'.
                                 format(str(e), traceback.format_exc()))
                             continue
                     # retry child tasks unless the comment requests retrying solely this task
                     retryChildTasks = not (commentStr and 'sole ' in commentStr)
                     # discard events if requested
                     discardEvents = bool(commentStr and 'discard ' in commentStr)
                     # release un-staged files if requested
                     releaseUnstaged = bool(commentStr and 'staged ' in commentStr)
                     tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(
                         jediTaskID,
                         commandStr,
                         retryChildTasks=retryChildTasks,
                         discardEvents=discardEvents,
                         release_unstaged=releaseUnstaged)
                     if tmpRet is True:
                         tmpMsg = 'set task_status={0}'.format(
                             newTaskStatus)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception as e:
             errStr = '{} failed in runImpl() with {} {} '.format(
                 self.__class__.__name__, str(e), traceback.format_exc())
             logger.error(errStr)
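
A side note on the retry handling above: the options passed to retryTask_JEDI are derived from plain substring checks on the command comment, where 'sole ' suppresses retries of child tasks, 'discard ' drops already-processed events, and 'staged ' releases un-staged files. A minimal sketch of the same parsing collected into one helper (the function name is hypothetical, not part of JEDI):

def parse_retry_options(comment_str):
    # mirror the substring checks in runImpl() above; note the
    # trailing spaces, which the original checks rely on
    return {
        'retryChildTasks': 'sole ' not in comment_str,
        'discardEvents': 'discard ' in comment_str,
        'releaseUnstaged': 'staged ' in comment_str,
    }

# example: a comment requesting a sole retry that discards events
opts = parse_retry_options('sole discard retry requested by ops')
assert opts == {'retryChildTasks': False, 'discardEvents': True,
                'releaseUnstaged': False}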
Example #25
 def runImpl(self):
     # cutoff for disk in GB (the default corresponds to 100 TB)
     diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                      'jedi', 'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024*1024*1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'.format(thrInputSize,
                                                                                                                 thrInputNum,
                                                                                                                 thrInputSizeFrac,
                                                                                                                 thrInputNumFrac))
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.info('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info('skip transfer backlog check due to negative T1Weight')
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                         for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info('  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                              format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.info('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                             if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                 tmpLog.info('  skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                                  tmpSpaceSize,
                                                                                                                                                                  tmpSpaceToUse,
                                                                                                                                                                  diskThreshold,
                                                                                                                                                                  tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if not tmpNucleus in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                             except:
                                 tmpNew = None
                             if tmpNew != None and (tmpOld == None or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                  'free':tmpSpaceSize-tmpSpaceToUse}
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # use deep scan for primary dataset
                         if datasetSpec.isMaster():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys(),
                                                                           deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if len(nucleusList) == 1:
                                 tmpLog.info('  disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                     availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                     availableData[tmpNucleus]['tot_size'],
                                                                                                                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                              availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                              availableData[tmpNucleus]['tot_num'],
                                                                                                                                              thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         else:
                             tmpLog.info('  disable data locality check since no nucleus has input data')
                         tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                 weight /= float(availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                             # fraction of free space
                             if tmpNucleus in fractionFreeSpace:
                                 try:
                                     tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                         float(fractionFreeSpace[tmpNucleus]['total'])
                                     weight *= tmpFrac
                                     wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                          fractionFreeSpace[tmpNucleus]['total'])
                                 except:
                                     pass
                         tmpLog.info('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task.status=ready'
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
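
The final selection in the broker above is a standard roulette-wheel draw: each surviving nucleus occupies a slice of [0, totalWeight] proportional to its weight, a uniform random point selects a slice, and the last candidate serves as a fallback against floating-point residue. A self-contained sketch of that step (names are illustrative, not the JEDI API):

import random

def pick_weighted(candidates):
    # candidates: list of (name, weight) pairs with non-negative
    # weights, as accumulated in nucleusweights above
    total = sum(weight for _, weight in candidates)
    target = random.uniform(0, total)
    for name, weight in candidates:
        target -= weight
        if target <= 0:
            return name
    # fall back to the last candidate, as runImpl() does
    return candidates[-1][0]

print(pick_weighted([('NUCLEUS_A', 0.5), ('NUCLEUS_B', 0.3), ('NUCLEUS_C', 0.2)]))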
Example #26
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug("%s terminating since no more items" % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID, dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID))
                 # get task
                 tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error(
                         "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue)
                     )
                     taskBroken = True
                 # renaming of parameters
                 if taskParamMap.has_key("nEventsPerInputFile"):
                     taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"]
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key("nFilesPerJob"):
                     nFilesPerJob = taskParamMap["nFilesPerJob"]
                 # the number of files used by scout
                 nFilesForScout = 0
                 if nFilesPerJob != None:
                     nFilesForScout = 10 * nFilesPerJob
                 else:
                     nFilesForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     try:
                         loadXML = taskParamMap["loadXML"]
                         xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML)
                     except:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue))
                         taskBroken = True
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == "running":
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                     origNumFiles = None
                     if taskParamMap.has_key("nFiles"):
                         origNumFiles = taskParamMap["nFiles"]
                     for datasetSpec in dsList:
                         tmpLog.info(
                             "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID)
                         )
                         # get dataset metadata
                         tmpLog.info("get metadata")
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {"state": "closed"}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata["state"] == "open":
                                 # dummy metadata when parent is running
                                 tmpMetadata = {"state": "mutable"}
                             gotMetadata = True
                         except:
                             errtype, errvalue = sys.exc_info()[:2]
                             tmpLog.error(
                                 "{0} failed to get metadata to {1}:{2}".format(
                                     self.__class__.__name__, errtype.__name__, errvalue
                                 )
                             )
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = "broken"
                                 taskBroken = True
                                 # update dataset status
                                 self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                 taskParamMap, datasetSpec.datasetName
                             )
                             # get the number of events in metadata
                             if taskParamMap.has_key("getNumEventsInMetadata"):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info("get files")
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if (
                                         fileList != []
                                         and taskParamMap.has_key("useInFilesInContainer")
                                         and not datasetSpec.containerName in ["", None]
                                     ):
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(
                                         tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate
                                     )
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info(
                                             "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName)
                                         )
                                         for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info("removed {0}".format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {
                                             str(uuid.uuid4()): {
                                                 "lfn": "pseudo_lfn",
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                                         }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key("nFiles"):
                                             nPFN = taskParamMap["nFiles"]
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {
                                                 "lfn": "{0:06d}:{1}".format(
                                                     iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1]
                                                 ),
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                             except:
                                 errtype, errvalue = sys.exc_info()[:2]
                                 tmpLog.error(
                                     "failed to get files due to {0}:{1}".format(
                                         self.__class__.__name__, errtype.__name__, errvalue
                                     )
                                 )
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = "broken"
                                     taskBroken = True
                                     # update dataset status
                                     self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile = None
                                 nEventsPerJob = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key("nEventsPerFile")) or (
                                     datasetSpec.isPseudo() and taskParamMap.has_key("nEvents")
                                 ):
                                     if taskParamMap.has_key("nEventsPerFile"):
                                         nEventsPerFile = taskParamMap["nEventsPerFile"]
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key("nEvents"):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap["nEvents"]
                                     if taskParamMap.has_key("nEventsPerJob"):
                                         nEventsPerJob = taskParamMap["nEventsPerJob"]
                                     elif taskParamMap.has_key("nEventsPerRange"):
                                         nEventsPerRange = taskParamMap["nEventsPerRange"]
                                 # max attempts and first event number
                                 maxAttempt = None
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # max attempts
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry
                                         maxAttempt = 1
                                     elif taskParamMap.has_key("maxAttempt"):
                                         maxAttempt = taskParamMap["maxAttempt"]
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None
                                 if datasetSpec.isMaster() and taskParamMap.has_key("nEvents"):
                                     nMaxEvents = taskParamMap["nEvents"]
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if taskParamMap.has_key("nFiles"):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap["nFiles"]
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multiplied by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key("nEventsPerFile"):
                                             if taskParamMap.has_key("nEventsPerJob"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerJob"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerJob"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key("nEventsPerRange"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerRange"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False
                                 if datasetSpec.isMaster() and taskSpec.useScout():
                                     useScout = True
                                 # use files with new attempt numbers
                                 useFilesWithNewAttemptNr = False
                                 if (
                                     not datasetSpec.isPseudo()
                                     and fileList != []
                                     and taskParamMap.has_key("useInFilesWithNewAttemptNr")
                                 ):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info("update contents")
                                 retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                     datasetSpec,
                                     tmpRet,
                                     tmpMetadata["state"],
                                     stateUpdateTime,
                                     nEventsPerFile,
                                     nEventsPerJob,
                                     maxAttempt,
                                     firstEventNumber,
                                     nMaxFiles,
                                     nMaxEvents,
                                     useScout,
                                     fileList,
                                     useFilesWithNewAttemptNr,
                                     nFilesPerJob,
                                     nEventsPerRange,
                                     nFilesForScout,
                                     includePatt,
                                     excludePatt,
                                     xmlConfig,
                                     noWaitParent,
                                     taskSpec.parent_tid,
                                 )
                                 if retDB == False:
                                     taskSpec.setErrDiag(
                                         "failed to insert files for {0}. {1}".format(
                                             datasetSpec.datasetName, diagMap["errMsg"]
                                         )
                                     )
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = "{0} files missing in {1}".format(
                                         len(missingFileList), datasetSpec.datasetName
                                     )
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {
                                         "datasetSpec": datasetSpec,
                                         "missingFiles": missingFileList,
                                     }
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key("nFiles"):
                                         if datasetSpec.isMaster():
                                             taskParamMap["nFiles"] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nFilesForScout = diagMap["nFilesForScout"]
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap["isRunningTask"]:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap["nActivatedPending"] == 0:
                                     tmpErrStr = "insufficient inputs are ready"
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                         tmpLog.info("end loop")
                 # no master input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0:
                     tmpErrStr = "no master input files. input dataset is empty"
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr, None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = "tobroken"
                     tmpMsg = "set task.status={0}".format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(
                                 self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo)
                             )
                             if not tmpStat:
                                 tmpErrStr = "failed to initialize TaskGenerator"
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = "tobroken"
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info(
                                     "make parent tasks with {0} (if necessary)".format(
                                         taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)
                                     )
                                 )
                                 tmpStat = taskGenerator.doGenerate(
                                     taskSpec, taskParamMap, missingFilesMap=missingMap
                                 )
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = "tobroken"
                                     tmpLog.error("failed to make parent tasks")
                         # go to pending state
                         if not taskSpec.status in ["broken", "tobroken"]:
                             taskSpec.setOnHold()
                         tmpMsg = "set task.status={0}".format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                     elif allUpdated:
                         # all OK
                         allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                             jediTaskID, getTaskStatus=True
                         )
                         tmpMsg = "set task.status={0}".format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                 tmpLog.info("done")
         except:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error(
                 "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue)
             )
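
Examples #25 and #26 follow the same worker pattern: each thread repeatedly pulls a fixed-size chunk from a shared list and terminates once the list is drained, with a broad try/except around each chunk so one bad task cannot kill the thread. A minimal, thread-safe sketch of that pattern (ChunkQueue is a stand-in assumption, not the actual shared-list class used by JEDI):

import threading

class ChunkQueue(object):
    # simplified stand-in for the shared task list
    def __init__(self, items):
        self._items = list(items)
        self._lock = threading.Lock()

    def get(self, n):
        # atomically pop up to n items; an empty chunk signals termination
        with self._lock:
            chunk, self._items = self._items[:n], self._items[n:]
            return chunk

def run_impl(queue):
    while True:
        try:
            tasks = queue.get(10)
            if not tasks:
                return  # no more items: terminate like runImpl()
            for task in tasks:
                pass  # process one task here
        except Exception:
            pass  # log and keep the worker alive, as the originals do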
Example #27
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 impl = None  # ensure impl is defined for the error path below
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     # skip this task since its parameters cannot be parsed
                     continue
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                         # set parent tid
                         if not parent_tid in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # staging
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if 'toStaging' in taskParamMap and taskStatus != 'staged':
                         errStr = 'wait until staging is done'
                         impl.taskSpec.status = 'staging'
                         impl.taskSpec.oldStatus = taskStatus
                         impl.taskSpec.setErrDiag(errStr)
                         # not to update some task attributes
                         impl.taskSpec.resetRefinedAttrs()
                         tmpLog.info(errStr)
                         self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID},
                                                           oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                         continue
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if parent_tid not in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     # not to update some task attributes
                                     impl.taskSpec.resetRefinedAttrs()
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                              # avoid updating already-refined task attributes
                             impl.taskSpec.resetRefinedAttrs()
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                          if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus in ['registered', 'staged']:
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # disallow to reset some attributes
                             for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                 impl.taskSpec.resetChangedAttr(attName)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                              # appending datasets for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
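The parent-task gating above reduces to a small decision table; the sketch below restates it in isolation (the helper name and return values are illustrative, only the parent status strings come from the example):

def gate_on_parent(parent_state, no_wait_parent):
    """Decide what the child refiner should do given its parent's state."""
    if parent_state == 'completed':
        # parent is done; refinement can proceed
        return 'proceed'
    if parent_state == 'running':
        # the child pends unless flagged to run without waiting for its parent
        return 'proceed' if no_wait_parent else 'pend'
    # any other state is treated as a corrupted parent
    return 'fail'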
Example #28
0
 def findLostFiles(self,datasetName,fileMap):
     methodName = 'findLostFiles'
     methodName += ' <datasetName={0}>'.format(datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('start')
     try:
         # get replicas
         tmpStat,tmpOut = self.listDatasetReplicas(datasetName)
          if tmpStat != self.SC_SUCCEEDED:
              tmpLog.error('failed to get dataset replicas with {0}'.format(tmpOut))
              return tmpStat, '{0} : {1}'.format(methodName, tmpOut)
         # check if complete replica is available
         hasCompReplica = False
         datasetReplicaMap = tmpOut
         for tmpEndPoint in datasetReplicaMap.keys():
             if datasetReplicaMap[tmpEndPoint][-1]['found'] != None and \
                     datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                 hasCompReplica = True
                 break
         # no lost files
         if hasCompReplica:
             tmpLog.info('done with no lost files')
              return self.SC_SUCCEEDED,{}
         # get LFNs and scopes
         lfnMap = {}
         scopeMap = {}
         for tmpGUID in fileMap.keys():
             tmpLFN = fileMap[tmpGUID]['lfn']
             lfnMap[tmpGUID] = tmpLFN
             scopeMap[tmpLFN] = fileMap[tmpGUID]['scope']
         # get LFC and SE
         lfcSeMap = {}
         for tmpEndPoint in datasetReplicaMap.keys():
             # get LFC
             lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
             # add map
              if lfc not in lfcSeMap:
                 lfcSeMap[lfc] = []
             # get SE
             seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
             tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
             if tmpMatch != None:
                 se = tmpMatch.group(1)
                 if not se in lfcSeMap[lfc]:
                     lfcSeMap[lfc].append(se)
         # get SURLs
          for lfcHost,seList in lfcSeMap.items():
             tmpStat,tmpRetMap = self.getSURLsFromLFC(lfnMap,lfcHost,seList,scopes=scopeMap)
              if tmpStat != self.SC_SUCCEEDED:
                  tmpLog.error('failed to get SURLs with {0}'.format(tmpRetMap))
                  return tmpStat, '{0} : {1}'.format(methodName, tmpRetMap)
             # look for missing files
             newLfnMap = {}
              for tmpGUID,tmpLFN in lfnMap.items():
                 if not tmpLFN in tmpRetMap:
                     newLfnMap[tmpGUID] = tmpLFN
             lfnMap = newLfnMap
          tmpLog.info('done with lost files: '+','.join(str(tmpLFN) for tmpLFN in lfnMap.values()))
         return self.SC_SUCCEEDED,lfnMap
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errMsg)
         return errCode,'{0} : {1}'.format(methodName,errMsg)
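The per-catalog loop in findLostFiles progressively shrinks the GUID-to-LFN map, so whatever survives every catalog is reported lost. A minimal pure-Python sketch of that winnowing step (catalog results are stubbed as plain sets here, which is an assumption; the real lookup goes through getSURLsFromLFC):

def winnow_lost_files(lfn_map, per_catalog_found):
    """Drop every LFN some catalog reported; the remainder is lost.

    lfn_map: {guid: lfn}; per_catalog_found: iterable of sets of found LFNs.
    """
    for found_lfns in per_catalog_found:
        lfn_map = {guid: lfn for guid, lfn in lfn_map.items()
                   if lfn not in found_lfns}
    return lfn_map

# winnow_lost_files({'g1': 'a.root', 'g2': 'b.root'}, [{'a.root'}])
# -> {'g2': 'b.root'}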
Example #29
0
 def getLatestDBRelease(self):
     methodName = 'getLatestDBRelease'
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('trying to get the latest version number of DBR')
     # get ddo datasets
     tmpStat,ddoDatasets = self.listDatasets('ddo.*')
      if tmpStat != self.SC_SUCCEEDED or not ddoDatasets:
         tmpLog.error('failed to get a list of DBRelease datasets from DQ2')
         return self.SC_FAILED,None
      # reverse sort to avoid redundant lookup
      ddoDatasets.sort(reverse=True)
     # extract version number
     latestVerMajor = 0
     latestVerMinor = 0
     latestVerBuild = 0
     latestVerRev   = 0
     latestDBR = ''
     for tmpName in ddoDatasets:
         # ignore CDRelease
         if ".CDRelease." in tmpName:
             continue
         # ignore user
         if tmpName.startswith('ddo.user'):
             continue
         # use Atlas.Ideal
         if not ".Atlas.Ideal." in tmpName:
             continue
         match = re.search('\.v(\d+)(_*[^\.]*)$',tmpName)
         if match == None:
             tmpLog.warning('cannot extract version number from %s' % tmpName)
             continue
         # ignore special DBRs
         if match.group(2) != '':
             continue
         # get major,minor,build,revision numbers
         tmpVerStr = match.group(1)
         tmpVerMajor = 0
         tmpVerMinor = 0
         tmpVerBuild = 0
         tmpVerRev   = 0
         try:
             tmpVerMajor = int(tmpVerStr[0:2])
         except:
             pass
         try:
             tmpVerMinor = int(tmpVerStr[2:4])
         except:
             pass
         try:
             tmpVerBuild = int(tmpVerStr[4:6])
         except:
             pass
         try:
             tmpVerRev = int(tmpVerStr[6:])
              # skip DBRs with a revision field; keep only three-field versions
             continue
         except:
             pass
         # compare
         if latestVerMajor > tmpVerMajor:
             continue
         elif latestVerMajor == tmpVerMajor:
             if latestVerMinor > tmpVerMinor:
                 continue
             elif latestVerMinor == tmpVerMinor:
                 if latestVerBuild > tmpVerBuild:
                     continue
                 elif latestVerBuild == tmpVerBuild:
                     if latestVerRev > tmpVerRev:
                         continue
         # check if well replicated
         tmpStat,ddoReplicas = self.listDatasetReplicas(tmpName)
          if tmpStat != self.SC_SUCCEEDED or len(ddoReplicas) < 10:
             continue
         # higher or equal version
         latestVerMajor = tmpVerMajor
         latestVerMinor = tmpVerMinor
         latestVerBuild = tmpVerBuild
         latestVerRev   = tmpVerRev
         latestDBR = tmpName
     # failed
     if latestDBR == '':
         tmpLog.error('failed to get the latest version of DBRelease dataset from DQ2')
         return self.SC_FAILED,None
     tmpLog.info('use {0}'.format(latestDBR))
     return self.SC_SUCCEEDED,latestDBR
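The cascaded major/minor/build comparison above is equivalent to comparing version tuples lexicographically; a sketch of the same selection using tuple comparison (the helper name and the strict six-digit pattern are illustrative, mirroring the three-field check in the example):

import re

def pick_latest_dbr(dataset_names):
    """Return the dataset name with the highest .vNNNNNN suffix, or None."""
    latest = None
    for name in dataset_names:
        match = re.search(r'\.v(\d{6})$', name)
        if match is None:
            continue
        v = match.group(1)
        # lexicographic tuple comparison replaces the nested if/elif cascade
        ver = (int(v[0:2]), int(v[2:4]), int(v[4:6]))
        if latest is None or ver > latest[0]:
            latest = (ver, name)
    return None if latest is None else latest[1]

# pick_latest_dbr(['ddo.000001.Atlas.Ideal.DBRelease.v310801',
#                  'ddo.000001.Atlas.Ideal.DBRelease.v320000'])
# -> 'ddo.000001.Atlas.Ideal.DBRelease.v320000'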
Example #30
0
    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info(
                            'rescue tasks with picked files for vo={0} label={1}'
                            .format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForPicked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info(
                            'reactivate pending tasks for vo={0} label={1}'.
                            format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForPending,
                            jedi_config.watchdog.timeoutForPending)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # unlock tasks
                        tmpLog.info('unlock tasks for vo={0} label={1}'.format(
                            vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.unlockTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForLocked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to unlock')
                        else:
                             tmpLog.info('unlocked {0} tasks'.format(tmpRet))
                        # restart contents update
                        tmpLog.info(
                            'restart contents update for vo={0} label={1}'.
                            format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.restartTasksForContentsUpdate_JEDI(
                            vo, prodSourceLabel)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to restart')
                        else:
                            tmpLog.info('restarted {0} tasks'.format(tmpRet))
                        # kick exhausted tasks
                        tmpLog.info(
                            'kick exhausted tasks for vo={0} label={1}'.format(
                                vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.kickExhaustedTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForExhausted)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to kick')
                        else:
                            tmpLog.info('kicked {0} tasks'.format(tmpRet))
                        # finish tasks when goal is reached
                        tmpLog.info(
                            'finish achieved tasks for vo={0} label={1}'.
                            format(vo, prodSourceLabel))
                        tmpRet = self.taskBufferIF.getAchievedTasks_JEDI(
                            vo, prodSourceLabel,
                            jedi_config.watchdog.waitForAchieved)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to finish')
                        else:
                            for jediTaskID in tmpRet:
                                self.taskBufferIF.sendCommandTaskPanda(
                                    jediTaskID,
                                    'JEDI. Goal reached',
                                    True,
                                    'finish',
                                    comQualifier='soft')
                             tmpLog.info('finished {0} tasks'.format(len(tmpRet)))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo, prodSourceLabel)
                        if impl != None:
                            tmpLog.info(
                                'special action for vo={0} label={1} with {2}'.
                                format(vo, prodSourceLabel,
                                       impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat != Interaction.SC_SUCCEEDED:
                                tmpLog.error(
                                    'failed to run special action for vo={0} label={1}'
                                    .format(vo, prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(
                                    vo, prodSourceLabel))
                tmpLog.info('done')
            except:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()
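One timing subtlety in the loop above: timeDelta.seconds discards the days component of the timedelta, so a cycle that overruns by more than a day would still trigger a sleep. A sketch of a safer variant using total_seconds() (the helper function is illustrative; the cycle semantics follow the example):

import datetime
import time

def sleep_until_next_cycle(start_time, loop_cycle_seconds):
    """Sleep for whatever remains of the configured cycle, if anything."""
    elapsed = (datetime.datetime.utcnow() - start_time).total_seconds()
    remaining = loop_cycle_seconds - elapsed
    if remaining > 0:
        time.sleep(remaining)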
Example #31
0
 def doGenerate(self, taskSpec, taskParamMap, **varMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         "<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start taskType={0}'.format(taskSpec.taskType))
     tmpLog.info(str(varMap))
     # returns
     retFatal = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK = self.SC_SUCCEEDED
     try:
         # check prodSourceLabel
         if taskSpec.prodSourceLabel in ['managed', 'test']:
             # check taskType
             if taskSpec.taskType == 'recov':
                 # generate parent tasks for lost file recovery if it is not yet generated
                 if 'parentGenerated' in taskParamMap:
                     tmpLog.info(
                         'skip since already generated parent tasks')
                 else:
                     tmpLog.info(
                         'generating parent tasks for lost file recovery')
                     # missing files are undefined
                     if 'missingFilesMap' not in varMap:
                         tmpLog.error('missing files are undefined')
                         return retFatal
                     missingFilesMap = varMap['missingFilesMap']
                     # check datasets
                     for datasetName, datasetValMap in iteritems(
                             missingFilesMap):
                         # dataset needs specify container
                         datasetSpec = datasetValMap['datasetSpec']
                         if datasetSpec.containerName in ['', None]:
                             errStr = 'cannot make parent tasks due to undefined container for datasetID={0}:{1}'.format(
                                 datasetSpec.datasetID, datasetName)
                             taskSpec.setErrDiag(errStr)
                             tmpLog.error(errStr)
                             return retFatal
                     # make parameters for new task
                     newJsonStrList = []
                     for datasetName, datasetValMap in iteritems(
                             missingFilesMap):
                         datasetSpec = datasetValMap['datasetSpec']
                         newTaskParamMap = {}
                         newTaskParamMap['oldDatasetName'] = datasetName
                         newTaskParamMap['lostFiles'] = datasetValMap['missingFiles']
                         newTaskParamMap['vo'] = taskSpec.vo
                         newTaskParamMap['cloud'] = taskSpec.cloud
                         newTaskParamMap['taskPriority'] = taskSpec.taskPriority
                         newTaskParamMap['taskType'] = taskSpec.taskType
                         newTaskParamMap['prodSourceLabel'] = taskSpec.prodSourceLabel
                         logDatasetName = 'panda.jedi{0}.log.{1}'.format(
                             taskSpec.taskType, uuid.uuid4())
                         newTaskParamMap['log'] = {
                             'dataset': logDatasetName,
                             'type': 'template',
                             'param_type': 'log',
                             'token': 'ATLASDATADISK',
                             'value': '{0}.${{SN}}.log.tgz'.format(logDatasetName)
                         }
                         # make new datasetname
                         outDatasetName = datasetName
                         # remove /
                         outDatasetName = re.sub('/$', '', outDatasetName)
                         # remove extension
                         outDatasetName = re.sub(
                             '\.{0}\d+$'.format(taskSpec.taskType), '',
                             outDatasetName)
                         # add extension
                         outDatasetName = outDatasetName + '.{0}{1}'.format(
                             taskSpec.taskType, taskSpec.jediTaskID)
                         newTaskParamMap['output'] = {
                             'dataset': outDatasetName
                         }
                         if datasetSpec.containerName not in ['', None]:
                             newTaskParamMap['output'][
                                 'container'] = datasetSpec.containerName
                         # make json
                         jsonStr = json.dumps(newTaskParamMap)
                         newJsonStrList.append(jsonStr)
                     # change original task parameters to not repeat the same procedure and to use newly produced files
                     taskParamMap['parentGenerated'] = True
                     taskParamMap['useInFilesInContainer'] = True
                     taskParamMap['useInFilesWithNewAttemptNr'] = True
                     jsonStr = json.dumps(taskParamMap)
                     # insert and update task parameters
                     sTmp, newJediTaskIDs = self.taskBufferIF.insertUpdateTaskParams_JEDI(
                         taskSpec.jediTaskID, taskSpec.vo,
                         taskSpec.prodSourceLabel, jsonStr, newJsonStrList)
                     if sTmp:
                         tmpLog.info(
                             'inserted/updated tasks in DB : new jediTaskIDs={0}'
                             .format(str(newJediTaskIDs)))
                     else:
                         tmpLog.error('failed to insert/update tasks in DB')
                         return retFatal
         # return
         tmpLog.info('done')
         return retOK
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('doGenerate failed with {0}:{1}'.format(
             errtype.__name__, errvalue))
         return retFatal
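The output dataset renaming for recovery tasks above (strip a trailing container slash, drop any previous recovery extension, append the new one) is easy to verify in isolation; a sketch with illustrative values:

import re

def make_recovery_dataset_name(dataset_name, task_type, jedi_task_id):
    """Derive the recovery-task output name as in doGenerate above."""
    out = re.sub('/$', '', dataset_name)                    # container slash
    out = re.sub(r'\.{0}\d+$'.format(task_type), '', out)   # old extension
    return '{0}.{1}{2}'.format(out, task_type, jedi_task_id)

# make_recovery_dataset_name('mc.dataset.recov123/', 'recov', 456)
# -> 'mc.dataset.recov456'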
Example #32
0
 def do_preassign(self):
     tmp_log = MsgWrapper(logger, 'do_preassign')
     # refresh
     self.refresh()
     # list of resource type
     resource_type_list = [
         rt.resource_name for rt in self.taskBufferIF.load_resource_types()
     ]
     # threshold of time duration in second that the queue keeps empty to trigger preassignment
     empty_duration_threshold = 1800
     # return map
     ret_map = {
         'to_reassign': {},
     }
     # loop
     for prod_source_label in self.prodSourceLabelList:
         # site-rse map
         site_rse_map = self.get_site_rse_map(prod_source_label)
         # parameters from GDP config
         max_preassigned_tasks = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if max_preassigned_tasks is None:
             max_preassigned_tasks = 3
         min_files_ready = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_ready is None:
             min_files_ready = 50
         min_files_remaining = self.taskBufferIF.getConfigValue(
             'queue_filler',
             'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if min_files_remaining is None:
             min_files_remaining = 100
         # load site empty-since map from cache
         site_empty_since_map_orig = self._get_from_ses_cache()
         # available sites
         available_sites_list = self.get_available_sites_list()
         # now timestamp
         now_time = datetime.datetime.utcnow()
         now_time_ts = int(now_time.timestamp())
         # update site empty-since map
         site_empty_since_map = copy.deepcopy(site_empty_since_map_orig)
         available_site_name_list = [x[0] for x in available_sites_list]
         for site in site_empty_since_map_orig:
             # remove sites that are no longer empty
             if site not in available_site_name_list:
                 del site_empty_since_map[site]
         for site in available_site_name_list:
             # add newly found empty sites
             if site not in site_empty_since_map_orig:
                 site_empty_since_map[site] = now_time_ts
         self._update_to_ses_cache(site_empty_since_map)
          # evaluate sites to preassign according to cache
         # get blacklisted_tasks_map from cache
         blacklisted_tasks_map = self._get_from_bt_cache()
         blacklisted_tasks_set = set()
         for bt_list in blacklisted_tasks_map.values():
             blacklisted_tasks_set |= set(bt_list)
         # loop over available sites to preassign
         for (site, tmpSiteSpec, n_jobs_to_fill) in available_sites_list:
             # rses of the available site
             available_rses = set()
             try:
                 available_rses.update(set(site_rse_map[site]))
             except KeyError:
                 tmp_log.debug(
                     'skipped {site} since no good RSE'.format(site=site))
                 continue
             # do not consider TAPE rses
             for rse in set(available_rses):
                 if 'TAPE' in str(rse):
                     available_rses.remove(rse)
             # skip if no rse for available site
             if not available_rses:
                 tmp_log.debug(
                     'skipped {site} since no available RSE'.format(
                         site=site))
                 continue
             # skip if no coreCount set
             if not tmpSiteSpec.coreCount or not tmpSiteSpec.coreCount > 0:
                 tmp_log.debug(
                     'skipped {site} since coreCount is not set'.format(
                         site=site))
                 continue
             # now timestamp
             now_time = datetime.datetime.utcnow()
             now_time_ts = int(now_time.timestamp())
             # skip if not empty for long enough
             if site not in site_empty_since_map:
                 tmp_log.error(
                     'skipped {site} since not in empty-since map (should not happen)'
                     .format(site=site))
                 continue
             empty_duration = now_time_ts - site_empty_since_map[site]
             tmp_num_slots = tmpSiteSpec.getNumStandby(None, None)
             if empty_duration < empty_duration_threshold and not tmp_num_slots:
                 tmp_log.debug(
                     'skipped {site} since not empty for enough time ({ed}s < {edt}s)'
                     .format(site=site,
                             ed=empty_duration,
                             edt=empty_duration_threshold))
                 continue
             # only simul tasks if site has fairsharePolicy setup
             processing_type_constraint = ''
             if tmpSiteSpec.fairsharePolicy not in ('NULL', None):
                 if 'type=simul:0%' in tmpSiteSpec.fairsharePolicy:
                     # skip if zero share of simul
                     tmp_log.debug(
                         'skipped {site} since with fairshare but zero for simul'
                         .format(site=site))
                     continue
                 else:
                     processing_type_constraint = "AND t.processingType='simul' "
             # site attributes
             site_maxrss = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss not in (
                 0, None) else 999999
             site_corecount = tmpSiteSpec.coreCount
             site_capability = str(tmpSiteSpec.capability).lower()
             # make sql parameters of rses
             available_rses = list(available_rses)
             rse_params_list = []
             rse_params_map = {}
             for j, rse in enumerate(available_rses):
                 rse_param = ':rse_{0}'.format(j + 1)
                 rse_params_list.append(rse_param)
                 rse_params_map[rse_param] = rse
             rse_params_str = ','.join(rse_params_list)
             # sql
             sql_query = (
                 "SELECT t.jediTaskID, t.workQueue_ID "
                 "FROM {jedi_schema}.JEDI_Tasks t "
                 "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL "
                 "AND t.prodSourceLabel=:prodSourceLabel "
                 "AND t.resource_type=:resource_type "
                 "AND site IS NULL "
                 "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss "
                 "AND t.eventService=0 "
                 "AND EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                 "WHERE dl.jediTaskID=t.jediTaskID "
                 "AND dl.rse IN ({rse_params_str}) "
                 ") "
                 "{processing_type_constraint} "
                 "AND EXISTS ( "
                 "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                 "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                 "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready "
                 "AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                 ") "
                 "ORDER BY t.currentPriority DESC "
                 "FOR UPDATE ").format(
                     jedi_schema=jedi_config.db.schemaJEDI,
                     rse_params_str=rse_params_str,
                     processing_type_constraint=processing_type_constraint)
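              # note on the query above: the RAM predicate converts per-core
              # limits (ramUnit MBPerCore*) to per-job values by multiplying
              # by the site core count and compares them, with a 5% margin,
              # against the site maxrss; the two EXISTS clauses require data
              # locality on one of the site RSEs and enough ready/remaining
              # input files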
             # loop over resource type
             for resource_type in resource_type_list:
                 # key name for preassigned_tasks_map = site + resource_type
                 key_name = '{0}|{1}'.format(site, resource_type)
                 # skip if not match with site capability
                 if site_capability == 'score' and not resource_type.startswith(
                         'SCORE'):
                     continue
                 elif site_capability == 'mcore' and not resource_type.startswith(
                         'MCORE'):
                     continue
                 # params map
                 params_map = {
                     ':prodSourceLabel': prod_source_label,
                     ':resource_type': resource_type,
                     ':site_maxrss': site_maxrss,
                     ':site_corecount': site_corecount,
                     ':min_files_ready': min_files_ready,
                     ':min_files_remaining': min_files_remaining,
                 }
                 params_map.update(rse_params_map)
                 # get preassigned_tasks_map from cache
                 preassigned_tasks_map = self._get_from_pt_cache()
                 preassigned_tasks_cached = preassigned_tasks_map.get(
                     key_name, [])
                 # get task_orig_attr_map from cache
                 task_orig_attr_map = self._get_from_attr_cache()
                 # number of tasks already preassigned
                 n_preassigned_tasks = len(preassigned_tasks_cached)
                  # number of tasks to preassign
                 n_tasks_to_preassign = max(
                     max_preassigned_tasks - n_preassigned_tasks, 0)
                 # preassign
                 if n_tasks_to_preassign <= 0:
                     tmp_log.debug(
                         '{key_name:<64} already has enough preassigned tasks ({n_tasks:>3}) ; skipped '
                         .format(key_name=key_name,
                                 n_tasks=n_preassigned_tasks))
                 elif DRY_RUN:
                     dry_sql_query = (
                         "SELECT t.jediTaskID, t.workQueue_ID "
                         "FROM {jedi_schema}.JEDI_Tasks t "
                         "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL "
                         "AND t.prodSourceLabel=:prodSourceLabel "
                         "AND t.resource_type=:resource_type "
                         "AND site IS NULL "
                         "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss "
                         "AND t.eventService=0 "
                         "AND EXISTS ( "
                         "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                         "WHERE dl.jediTaskID=t.jediTaskID "
                         "AND dl.rse IN ({rse_params_str}) "
                         ") "
                         "{processing_type_constraint} "
                         "AND EXISTS ( "
                         "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                         "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                         "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready "
                         "AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                         ") "
                         "ORDER BY t.currentPriority DESC ").format(
                             jedi_schema=jedi_config.db.schemaJEDI,
                             rse_params_str=rse_params_str,
                             processing_type_constraint=
                             processing_type_constraint)
                     # tmp_log.debug('[dry run] {} {}'.format(dry_sql_query, params_map))
                     res = self.taskBufferIF.querySQL(
                         dry_sql_query, params_map)
                     n_tasks = 0 if res is None else len(res)
                     if n_tasks > 0:
                         result = [
                             x[0] for x in res
                             if x[0] not in preassigned_tasks_cached
                         ]
                         updated_tasks = result[:n_tasks_to_preassign]
                         tmp_log.debug(
                             '[dry run] {key_name:<64} {n_tasks:>3} tasks would be preassigned '
                             .format(key_name=key_name,
                                      n_tasks=len(updated_tasks)))
                         # update preassigned_tasks_map into cache
                         preassigned_tasks_map[key_name] = list(
                             set(updated_tasks)
                             | set(preassigned_tasks_cached))
                         tmp_log.debug('{} ; {}'.format(
                             str(updated_tasks),
                             str(preassigned_tasks_map[key_name])))
                         self._update_to_pt_cache(preassigned_tasks_map)
                 else:
                     updated_tasks_orig_attr = self.taskBufferIF.queryTasksToPreassign_JEDI(
                         sql_query,
                         params_map,
                         site,
                         blacklist=blacklisted_tasks_set,
                         limit=n_tasks_to_preassign)
                     if updated_tasks_orig_attr is None:
                         # dbproxy method failed
                         tmp_log.error(
                             '{key_name:<64} failed to preassign tasks '.
                             format(key_name=key_name))
                     else:
                         n_tasks = len(updated_tasks_orig_attr)
                         if n_tasks > 0:
                             updated_tasks = [
                                 x[0] for x in updated_tasks_orig_attr
                             ]
                             tmp_log.info(
                                 '{key_name:<64} {n_tasks:>3} tasks preassigned : {updated_tasks}'
                                 .format(key_name=key_name,
                                         n_tasks=str(n_tasks),
                                         updated_tasks=updated_tasks))
                             # update preassigned_tasks_map into cache
                             preassigned_tasks_map[key_name] = list(
                                 set(updated_tasks)
                                 | set(preassigned_tasks_cached))
                             self._update_to_pt_cache(preassigned_tasks_map)
                             # update task_orig_attr_map into cache and return map
                             for taskid, orig_attr in updated_tasks_orig_attr:
                                 taskid_str = str(taskid)
                                 task_orig_attr_map[taskid_str] = orig_attr
                                 ret_map['to_reassign'][taskid] = {
                                     'site': site,
                                     'n_jobs_to_fill': n_jobs_to_fill,
                                 }
                             self._update_to_attr_cache(task_orig_attr_map)
                             # Kibana log
                             for taskid in updated_tasks:
                                 tmp_log.debug(
                                     '#ATM #KV jediTaskID={taskid} action=do_preassign site={site} rtype={rtype} preassigned '
                                     .format(taskid=taskid,
                                             site=site,
                                             rtype=resource_type))
                         else:
                             tmp_log.debug(
                                 '{key_name:<64} found no proper task to preassign'
                                 .format(key_name=key_name))
     # total preassigned tasks
     preassigned_tasks_map = self._get_from_pt_cache()
     n_pt_tot = sum(
         [len(pt_list) for pt_list in preassigned_tasks_map.values()])
     tmp_log.debug('now {n_pt_tot} tasks preassigned in total'.format(
         n_pt_tot=n_pt_tot))
     # return
     return ret_map
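The empty-since bookkeeping in do_preassign (keep timestamps for sites that stayed empty, stamp newly empty ones) is a small pure function; a sketch under the same semantics (the helper name is illustrative):

def update_empty_since(old_map, empty_sites, now_ts):
    """Return the updated site -> empty-since-timestamp map."""
    # sites that are no longer empty drop out; survivors keep their timestamp
    new_map = {site: ts for site, ts in old_map.items() if site in empty_sites}
    # newly empty sites are stamped with the current time
    for site in empty_sites:
        new_map.setdefault(site, now_ts)
    return new_map

# update_empty_since({'A': 100}, {'A', 'B'}, 200) -> {'A': 100, 'B': 200}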
Example #33
0
    def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,
                            "< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(
            taskSpec.prodSourceLabel, taskSpec.taskType))
        # returns
        retFatal = self.SC_FATAL
        retOK = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo, taskSpec.cloud)
            # skip if DDM I/F is inactive
            if not ddmIF:
                tmpLog.info('skip due to inactive DDM I/F')
                return retOK
            # collect datasetID to register datasets/containers just in case
            for tmpPandaJob in pandaJobs:
                if not tmpPandaJob.produceUnMerge():
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output', 'log']:
                            if tmpFileSpec.datasetID not in datasetToRegister:
                                datasetToRegister.append(tmpFileSpec.datasetID)
            # register datasets
            if datasetToRegister:
                tmpLog.info('datasetToRegister={0}'.format(
                    str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap = {}
                ddmBackEnd = 'rucio'
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info(
                        'getting datasetSpec with datasetID={0}'.format(
                            datasetID))
                    tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(
                        taskSpec.jediTaskID, datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue

                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [
                            datasetSpec.datasetName, datasetSpec.containerName
                    ]:
                        if not targetName:
                            continue
                        if targetName in avDatasetList:
                            tmpLog.info(
                                '{0} already registered'.format(targetName))
                            continue
                        # set lifetime
                        lifetime = None
                        # check dataset/container in DDM
                        tmpList = ddmIF.listDatasets(targetName)
                        if not tmpList:
                            # get location
                            location = None
                            locForRule = None
                            if targetName == datasetSpec.datasetName:
                                # dataset
                                tmpLog.info('dest={0}'.format(
                                    datasetSpec.destination))
                                if datasetSpec.destination:
                                    if siteMapper.checkSite(
                                            datasetSpec.destination):
                                        location = siteMapper.getSite(
                                            datasetSpec.destination
                                        ).ddm_output['default']
                                    else:
                                        location = datasetSpec.destination
                            if locForRule is None:
                                locForRule = location
                            # set metadata
                            if targetName == datasetSpec.datasetName:
                                metaData = {}
                                metaData['task_id'] = taskSpec.jediTaskID
                                if taskSpec.campaign:
                                    metaData['campaign'] = taskSpec.campaign
                            else:
                                metaData = None
                            # register dataset/container
                            tmpLog.info(
                                'registering {0} with location={1} backend={2} lifetime={3} meta={4}'
                                .format(targetName, location, ddmBackEnd,
                                        lifetime, str(metaData)))
                            tmpStat = ddmIF.registerNewDataset(
                                targetName,
                                backEnd=ddmBackEnd,
                                location=location,
                                lifetime=lifetime,
                                metaData=metaData)
                            if not tmpStat:
                                tmpLog.error('failed to register {0}'.format(
                                    targetName))
                                return retFatal
                            # register location
                            if locForRule:
                                """
                                if taskSpec.workingGroup:
                                    userName = taskSpec.workingGroup
                                else:
                                    userName = taskSpec.userName
                                """
                                userName = None
                                activity = None
                                grouping = None
                                tmpLog.info(
                                    'registering location={} lifetime={} days activity={} grouping={} '
                                    'owner={}'.format(locForRule, lifetime,
                                                      activity, grouping,
                                                      userName))
                                tmpStat = ddmIF.registerDatasetLocation(
                                    targetName,
                                    locForRule,
                                    owner=userName,
                                    lifetime=lifetime,
                                    backEnd=ddmBackEnd,
                                    activity=activity,
                                    grouping=grouping)
                                if not tmpStat:
                                    tmpLog.error(
                                        'failed to register location {0} for {1}'
                                        .format(locForRule, targetName))
                                    return retFatal
                            avDatasetList.append(targetName)

                    # check if dataset is in the container
                    if datasetSpec.containerName and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = \
                                ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(
                                datasetSpec.datasetName,
                                datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(
                                datasetSpec.containerName,
                                [datasetSpec.datasetName],
                                backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(
                                    datasetSpec.datasetName,
                                    datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(
                                datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(
                                datasetSpec.datasetName,
                                datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(
                        datasetSpec, {
                            'jediTaskID': taskSpec.jediTaskID,
                            'datasetID': datasetID
                        })
            # return
            tmpLog.info('done')
            return retOK
        except Exception as e:
            errStr = 'doSetup failed with {}'.format(str(e))
            tmpLog.error(errStr + traceback.format_exc())
            taskSpec.setErrDiag(errStr)
            return retFatal
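The datasetID collection at the top of doSetup is an order-preserving de-duplication over job output files; a sketch of just that step (attribute names follow the example, the free function itself is illustrative):

def collect_dataset_ids(panda_jobs, dataset_ids):
    """Append output/log datasetIDs not yet queued for registration."""
    for job in panda_jobs:
        if job.produceUnMerge():
            continue  # pre-merge outputs are not registered here
        for f in job.Files:
            if f.type in ('output', 'log') and f.datasetID not in dataset_ids:
                dataset_ids.append(f.datasetID)
    return dataset_ids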
Example #34
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         elif commandStr == 'reassign' and commentStr != None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs == None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr != None:
                                         tmpItems = commentStr.split(':')
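                                          # commentStr encodes '<cloud|nucleus|site>:<value>:<back-to-oldStatus y|n>'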
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary 
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info('updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info('wait until datasets are updated to finish')
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID},
                                                                            setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs 
                                  if commentStr is not None and 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs,commentStr,'52',True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'51',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                          tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry child tasks
                     if 'sole ' in commentStr:
                         retryChildTasks = False
                     else:
                         retryChildTasks = True
                     # discard events
                     if 'discard ' in commentStr:
                         discardEvents = True
                     else:
                         discardEvents = False
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents)
                     if tmpRet == True:
                         tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)
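
The command handler above extracts the reassignment target from a colon-separated comment string of the form <target>:<value>:<revert_flag>, e.g. nucleus:CERN:y, where a trailing y sends the task back to its oldStatus. A minimal standalone sketch of that convention, assuming the three-field layout seen above; parse_reassign_comment is a hypothetical helper, not part of JEDI:

def parse_reassign_comment(comment_str):
    # hypothetical helper mirroring the '<target>:<value>:<revert_flag>'
    # convention parsed by the command handler above
    items = comment_str.split(':')
    target, value, revert_flag = items[0], items[1], items[2]
    # anything other than 'cloud' or 'nucleus' is treated as a site
    attr = target if target in ('cloud', 'nucleus') else 'site'
    # 'y' means the task should go back to its oldStatus
    return attr, value, revert_flag == 'y'

# e.g. parse_reassign_comment('nucleus:CERN:y') -> ('nucleus', 'CERN', True)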
Example #35
0
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID,dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    datasetsIdxConsistency = []

                    # get task
                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                    if not tmpStat or taskSpec == None:
                        self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                        continue

                    # make logger
                    try:
                        gshare = '_'.join(taskSpec.gshare.split(' '))
                    except:
                        gshare = 'Undefined'
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))

                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = taskSpec.getNumFilesPerJob()
                    # the number of chunks used by scout 
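                    # (scout jobs run on a small sample of chunks to measure resource usage before the full task is processed)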
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
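                    # (a noWaitParent task may start consuming input while its parent task is still running)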
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                                  ['output','log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                            # index consistency
                            if datasetSpec.indexConsistent():
                                datasetsIdxConsistency.append(datasetSpec.datasetID)
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()                    
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state':'closed'}
                                 # set mutable when the dataset is open and the parent is running, or when the task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state':'mutable'}
                                gotMetadata = True
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                 tmpLog.error('{0} failed to get metadata with {1}:{2}'.format(self.__class__.__name__,
                                                                                               errtype.__name__,errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status    
                                    self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing 
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                         getNumEvents=getNumEvents,
                                                                         skipDuplicate=skipDuplicate,
                                                                         longFormat=longFormat
                                                                         )
                                        tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                            for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskSpec.getNumFilesPerJob() is not None:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy 
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                          {'datasetName':datasetSpec.datasetName},
                                                                                                                          ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                except:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                                 errtype.__name__,errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status    
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile  = None
                                    nEventsPerJob   = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts 
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry 
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None 
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                             # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False    
                                    if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers    
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0
                                    # skip short input
                                    if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                            and nEventsPerFile is not None and nEventsPerJob is not None \
                                            and nEventsPerFile >= nEventsPerJob \
                                            and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                        skipShortInput = True
                                    else:
                                        skipShortInput = False
                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                              tmpMetadata['state'],
                                                                                                                              stateUpdateTime,
                                                                                                                              nEventsPerFile,
                                                                                                                              nEventsPerJob,
                                                                                                                              maxAttempt,
                                                                                                                              firstEventNumber,
                                                                                                                              nMaxFiles,
                                                                                                                              nMaxEvents,
                                                                                                                              useScout,
                                                                                                                              fileList,
                                                                                                                              useFilesWithNewAttemptNr,
                                                                                                                              nFilesPerJob,
                                                                                                                              nEventsPerRange,
                                                                                                                              nChunksForScout,
                                                                                                                              includePatt,
                                                                                                                              excludePatt,
                                                                                                                              xmlConfig,
                                                                                                                              noWaitParent,
                                                                                                                              taskSpec.parent_tid,
                                                                                                                              self.pid,
                                                                                                                              maxFailure,
                                                                                                                              useRealNumEvents,
                                                                                                                              respectLB,
                                                                                                                              tgtNumEventsPerJob,
                                                                                                                              skipFilesUsedBy,
                                                                                                                              ramCount,
                                                                                                                              taskSpec,
                                                                                                                              skipShortInput)
                                    if retDB == False:
                                        taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                         diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug('escape since task or dataset is locked')
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                               'missingFiles':missingFileList} 
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr,None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # index consistency
                    if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                        self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency)
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken','tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid,
                                                                                                       useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
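
The seq_number branch in this example sizes the list of dummy records from the task's event parameters. A condensed sketch of that arithmetic; calc_num_seq_records and its defaults are illustrative, not part of JEDI, and explicit floor division (//) stands in for the Python 2 integer division of the original:

def calc_num_seq_records(task_params, orig_num_files, default_records=10000):
    # illustrative condensation of the seq_number sizing logic above
    if orig_num_files is not None:
        n = orig_num_files
        if 'nEventsPerJob' in task_params and 'nEventsPerFile' in task_params \
                and task_params['nEventsPerFile'] > task_params['nEventsPerJob']:
            # one input file fans out into several jobs
            n = n * task_params['nEventsPerFile'] // task_params['nEventsPerJob']
        return n
    if 'nEvents' in task_params and 'nEventsPerJob' in task_params:
        return task_params['nEvents'] // task_params['nEventsPerJob']
    # fall back to the default number of records
    return default_records

# e.g. calc_num_seq_records({'nEvents': 100000, 'nEventsPerJob': 1000}, None) -> 100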
Example #36
0
 def do_for_data_locality(self):
     tmp_log = MsgWrapper(logger)
     # refresh
     self.refresh()
     # list of resource type
     # resource_type_list = [ rt.resource_name for rt in self.taskBufferIF.load_resource_types() ]
     # loop
     for prod_source_label in self.prodSourceLabelList:
         # site-rse map and blacklisted rses
         site_rse_map, blacklisted_rse_set = self.get_site_rse_map_and_blacklisted_rse_set(
             prod_source_label)
         tmp_log.debug('Found {0} blacklisted RSEs : {1}'.format(
             len(blacklisted_rse_set), ','.join(list(blacklisted_rse_set))))
         # parameter from GDP config
         upplimit_ioIntensity = self.taskBufferIF.getConfigValue(
             'task_withholder',
             'LIMIT_IOINTENSITY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         lowlimit_currentPriority = self.taskBufferIF.getConfigValue(
             'task_withholder',
             'LIMIT_PRIORITY_{0}'.format(prod_source_label), 'jedi',
             self.vo)
         if upplimit_ioIntensity is None:
             upplimit_ioIntensity = 999999
         if lowlimit_currentPriority is None:
             lowlimit_currentPriority = -999999
         upplimit_ioIntensity = max(upplimit_ioIntensity, 100)
         # get work queue for gshare
         work_queue_list = self.workQueueMapper.getAlignedQueueList(
             self.vo, prod_source_label)
         # loop over work queue
         for work_queue in work_queue_list:
             gshare = work_queue.queue_name
             # get cutoff
             cutoff = self.taskBufferIF.getConfigValue(
                 'jobbroker', 'NQUEUELIMITSITE_{}'.format(gshare), 'jedi',
                 self.vo)
             if not cutoff:
                 cutoff = 20
             # busy sites
             busy_sites_list = self.get_busy_sites(gshare, cutoff)
             # rses of busy sites
             busy_rses = set()
             for site in busy_sites_list:
                 try:
                     busy_rses.update(set(site_rse_map[site]))
                 except KeyError:
                     continue
             # make sql parameters of rses
             to_exclude_rses = list(busy_rses | blacklisted_rse_set)
             rse_params_list = []
             rse_params_map = {}
             for j, rse in enumerate(to_exclude_rses):
                 rse_param = ':rse_{0}'.format(j + 1)
                 rse_params_list.append(rse_param)
                 rse_params_map[rse_param] = rse
             rse_params_str = ','.join(rse_params_list)
             # sql
             sql_query = (
                 "SELECT t.jediTaskID "
                 "FROM {jedi_schema}.JEDI_Tasks t "
                 "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL "
                 "AND t.gshare=:gshare "
                 "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority "
                 "AND EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Datasets d "
                 "WHERE d.jediTaskID=t.jediTaskID "
                 "AND d.type='input' "
                 ") "
                 "AND NOT EXISTS ( "
                 "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                 "WHERE dl.jediTaskID=t.jediTaskID "
                 "AND dl.rse NOT IN ({rse_params_str}) "
                 ") "
                 "FOR UPDATE ").format(
                     jedi_schema=jedi_config.db.schemaJEDI,
                     rse_params_str=rse_params_str)
             # params map
             params_map = {
                 ':gshare': gshare,
                 ':ioIntensity': upplimit_ioIntensity,
                 ':currentPriority': lowlimit_currentPriority,
             }
             params_map.update(rse_params_map)
             # pending reason
              reason = 'no local input data, ioIntensity>={ioIntensity}, currentPriority<{currentPriority}, '\
                       'nQueue>max({cutOff},nRunning*2) at all sites where the task can run'.format(
                  ioIntensity=upplimit_ioIntensity, currentPriority=lowlimit_currentPriority,
                  cutOff=cutoff)
             # set pending
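              # dry_run only reports which tasks would be set pending, without updating the database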
             dry_run = False
             if dry_run:
                 dry_sql_query = (
                     "SELECT t.jediTaskID "
                     "FROM {jedi_schema}.JEDI_Tasks t "
                     "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL "
                     "AND t.gshare=:gshare "
                     "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority "
                     "AND EXISTS ( "
                     "SELECT * FROM {jedi_schema}.JEDI_Datasets d "
                     "WHERE d.jediTaskID=t.jediTaskID "
                     "AND d.type='input' "
                     ") "
                     "AND NOT EXISTS ( "
                     "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl "
                     "WHERE dl.jediTaskID=t.jediTaskID "
                     "AND dl.rse NOT IN ({rse_params_str}) "
                     ") ").format(jedi_schema=jedi_config.db.schemaJEDI,
                                  rse_params_str=rse_params_str)
                 res = self.taskBufferIF.querySQL(dry_sql_query, params_map)
                 n_tasks = 0 if res is None else len(res)
                 if n_tasks > 0:
                     result = [x[0] for x in res]
                     tmp_log.debug(
                         '[dry run] gshare: {gshare:<16} {n_tasks:>5} tasks would be pending : {result} ; reason="{reason}" '
                         .format(gshare=gshare,
                                 n_tasks=n_tasks,
                                 result=result,
                                 reason=reason))
             else:
                 n_tasks = self.taskBufferIF.queryTasksToBePending_JEDI(
                     sql_query, params_map, reason)
                 if n_tasks is not None and n_tasks > 0:
                     tmp_log.info(
                          'gshare: {gshare:<16} {n_tasks:>5} tasks set pending ; reason="{reason}" '
                         .format(gshare=gshare,
                                 n_tasks=str(n_tasks),
                                 reason=reason))
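
The query above builds one named bind variable per excluded RSE, since an IN list cannot be bound as a single parameter. A small sketch of that pattern; make_in_bind_params is an illustrative name, not a JEDI API:

def make_in_bind_params(values, prefix='rse'):
    # illustrative helper: one named bind variable per value, as done
    # for the NOT IN ({rse_params_str}) clause in the example above
    params_list = []
    params_map = {}
    for i, val in enumerate(values):
        key = ':{0}_{1}'.format(prefix, i + 1)
        params_list.append(key)
        params_map[key] = val
    return ','.join(params_list), params_map

# e.g. make_in_bind_params(['RSE_A', 'RSE_B'])
# -> (':rse_1,:rse_2', {':rse_1': 'RSE_A', ':rse_2': 'RSE_B'})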
Example #37
0
    def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        # returns
        retFatal    = self.SC_FATAL
        retTmpError = self.SC_FAILED
        retOK       = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo)
            # register datasets
            if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                    # collect datasetID to register datasets/containers just in case
                    for tmpPandaJob in pandaJobs:
                        if not tmpPandaJob.produceUnMerge():
                            for tmpFileSpec in tmpPandaJob.Files:
                                if tmpFileSpec.type in ['output','log']:
                                    if tmpFileSpec.datasetID not in datasetToRegister:
                                        datasetToRegister.append(tmpFileSpec.datasetID)
                tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap  = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue
                    # DDM backend
                    ddmBackEnd = taskSpec.getDdmBackEnd()
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName is None:
                            continue
                        if targetName not in avDatasetList:
                            # set lifetime
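                            # (lifetime is in days; only transient panda* datasets get a finite lifetime here)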
                            if targetName.startswith('panda'):
                                if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                    lifetime = 365
                                else:
                                    lifetime = 14
                            else:
                                lifetime = None
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            if tmpList == []:
                                # get location
                                location = None
                                locForRule = None
                                if targetName == datasetSpec.datasetName:
                                    # dataset
                                    if datasetSpec.site in ['',None]:
                                        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                            locForRule = datasetSpec.destination
                                        elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                            location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                        elif taskSpec.cloud is not None:
                                            # use T1 SE
                                            tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                            location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                    else:
                                        tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken,
                                                                             taskSpec.prodSourceLabel,
                                                                             JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                if locForRule is None:
                                    locForRule = location
                                # set metadata
                                if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                    metaData = {}
                                    metaData['task_id'] = taskSpec.jediTaskID
                                    if taskSpec.campaign not in [None,'']:
                                        metaData['campaign'] = taskSpec.campaign
                                    if datasetSpec.getTransient() is not None:
                                        metaData['transient'] = datasetSpec.getTransient()
                                else:
                                    metaData = None
                                # register dataset/container
                                tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                         location,
                                                                                                                         ddmBackEnd,
                                                                                                                         lifetime,
                                                                                                                         str(metaData)))
                                tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                   lifetime=lifetime,metaData=metaData)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    # register location
                                    tmpToRegister = False
                                    if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                                        if taskSpec.workingGroup:
                                            userName = taskSpec.workingGroup
                                        else:
                                            userName = taskSpec.userName
                                        grouping = None
                                        tmpToRegister = True
                                    elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        userName = None
                                        grouping = 'NONE'
                                        tmpToRegister = True
                                    if tmpToRegister:
                                        activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                        tmpLog.info('registering location={} lifetime={} days activity={} grouping={} '
                                                    'owner={}'.format(locForRule, lifetime, activity, grouping,
                                                                      userName))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                                lifetime=lifetime,backEnd=ddmBackEnd,
                                                                                activity=activity,grouping=grouping)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                                          targetName))
                                            return retFatal
                                        # double copy
                                        if userSetup and datasetSpec.type == 'output':
                                            if datasetSpec.destination != datasetSpec.site:
                                                tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                                datasetSpec.site))
                                            else:

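                                                # decide whether to make a second replica; sites can
                                                # opt out via 'skip_2nd_copy' in their catchall field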
                                                second_copy = True
                                                try:
                                                    if taskSpec.site:
                                                        panda_site = siteMapper.getSite(taskSpec.site)
                                                        if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall:
                                                            tmpLog.info('skip making double copy as specified in catchall of {0}'.format(taskSpec.site))
                                                            second_copy = False
                                                except Exception:
                                                    second_copy = True

                                                if second_copy:
                                                    locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True'
                                                    tmpMsg  = 'registering double copy '
                                                    tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                                    activity,targetName)
                                                    tmpLog.info(tmpMsg)
                                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                                            lifetime=lifetime,activity=activity,
                                                                                            grouping='NONE',weight='freespace',
                                                                                            ignore_availability=False)
                                                    if not tmpStat:
                                                        tmpLog.error('failed to register double copy location {0} for {1}'.format(locForDouble,
                                                                                                                                  targetName))
                                                        return retFatal
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                   backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # register ES datasets
            if taskSpec.registerEsFiles():
                targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
                location = None
                metaData = {}
                metaData['task_id'] = taskSpec.jediTaskID
                metaData['hidden']  = True
                tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                           location,
                                                                                           str(metaData)))
                tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                   resurrect=True)
                if not tmpStat:
                    tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                    return retFatal
                # register rule
                location = 'type=DATADISK'
                activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                grouping = 'NONE'
                tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                        activity,
                                                                                        grouping))
                tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                        grouping=grouping)
                if not tmpStat:
                    tmpLog.error('failed to register location {0} with activity={2} for {1}'.format(location,
                                                                                                    targetName,
                                                                                                    activity))
                    return retFatal
            # open datasets
            if taskSpec.prodSourceLabel in ['managed','test']:
                # get the list of output/log datasets
                outDatasetList = []
                for tmpPandaJob in pandaJobs:
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if tmpFileSpec.destinationDBlock not in outDatasetList:
                                outDatasetList.append(tmpFileSpec.destinationDBlock)
                # open datasets
                for outDataset in outDatasetList:
                    tmpLog.info('open {0}'.format(outDataset))
                    ddmIF.openDataset(outDataset)
                    # unset lifetime
                    ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
            # return
            tmpLog.info('done')
            return retOK
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
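A minimal, runnable sketch of the registration pattern used above, with a stubbed DDM interface; the class, dataset name, and owner are illustrative stand-ins and not the real JEDI/Rucio API:

class StubDdmIF(object):
    # stand-in for the ddmIF object used in the example above (assumption)
    def registerNewDataset(self, name, **kwargs):
        print('register dataset {0} {1}'.format(name, kwargs))
        return True
    def registerDatasetLocation(self, name, location, **kwargs):
        print('register rule for {0} at {1} {2}'.format(name, location, kwargs))
        return True

ddmIF = StubDdmIF()
targetName = 'user.jdoe.mytask.output'   # hypothetical dataset name
ddmIF.registerNewDataset(targetName, lifetime=14)
# primary rule, then an extra copy excluding the endpoint that already has one
ddmIF.registerDatasetLocation(targetName, 'SOME_SCRATCHDISK', owner='jdoe')
ddmIF.registerDatasetLocation(targetName, '(type=SCRATCHDISK)\\notforextracopy=True',
                              copies=2, grouping='NONE', weight='freespace')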
Example #38
0
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' %
                                      self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID, dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    # make logger
                    tmpLog = MsgWrapper(
                        self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                    # get task
                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                        jediTaskID, False, True, self.pid, 10)
                    if not tmpStat or taskSpec == None:
                        tmpLog.error(
                            'failed to get taskSpec for jediTaskID={0}'.format(
                                jediTaskID))
                        continue
                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                            jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error(
                            'task param conversion from json failed with {0}:{1}'
                            .format(errtype.__name__, errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = None
                    if taskParamMap.has_key('nFilesPerJob'):
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # the number of chunks used by scout
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
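                    # noWaitParent: allow this task to start while its parent
                    # task is still running and producing input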
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(
                            taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(
                                datasetSpec.datasetName,
                                datasetSpec.datasetID))
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(
                                        datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state': 'closed'}
                                # set mutable when the dataset is open and the parent is running, or the task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state': 'mutable'}
                                gotMetadata = True
                            except Exception:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error(
                                    '{0} failed to get metadata due to {1}:{2}'.
                                    format(self.__class__.__name__,
                                           errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(
                                        datasetSpec, datasetStatus, tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                taskSpec.setErrDiag(
                                    'failed to get metadata for {0}'.format(
                                        datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                    taskParamMap, datasetSpec.datasetName)
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(
                                            tmpDatasetName,
                                            getNumEvents=getNumEvents,
                                            skipDuplicate=skipDuplicate,
                                            longFormat=longFormat)
                                        tmpLog.debug(
                                            'got {0} files in {1}'.format(
                                                len(tmpRet), tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(
                                            tmpDatasetName, tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug(
                                                'found {0} lost files in {1}'.
                                                format(len(tmpLostFiles),
                                                       tmpDatasetName))
                                            for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug(
                                                    'removed {0}'.format(
                                                        tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and 'nFilesPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskParamMap['nFilesPerJob']
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID, {'datasetName': datasetSpec.datasetName}, ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn': iPFN + tmpOffset,
                                                                             'scope': None,
                                                                             'filesize': 0,
                                                                             'checksum': None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()): {'lfn': 'pseudo_lfn',
                                                                          'scope': None,
                                                                          'filesize': 0,
                                                                          'checksum': None,
                                                                          }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn': '{0:06d}:{1}'.format(iPFN, taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope': None,
                                                                             'filesize': 0,
                                                                             'checksum': None,
                                                                             }
                                except Exception:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    tmpLog.error(
                                        'failed to get files due to {0}:{1} {2}'
                                        .format(self.__class__.__name__,
                                                errtype.__name__, errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag(
                                        'failed to get files for {0}'.format(
                                            datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile = None
                                    nEventsPerJob = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                            # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile']) / float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile']) / float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False
                                    if datasetSpec.isMaster() and taskSpec.useScout() and \
                                            (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and \
                                            taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0

                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet,
                                        tmpMetadata['state'], stateUpdateTime,
                                        nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber,
                                        nMaxFiles, nMaxEvents, useScout,
                                        fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange,
                                        nChunksForScout, includePatt,
                                        excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid,
                                        maxFailure, useRealNumEvents,
                                        respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
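                                    # retDB: True on success, False on a fatal
                                    # insertion error, None when the task or
                                    # dataset is locked or not in an applicable status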
                                    if retDB == False:
                                        taskSpec.setErrDiag(
                                            'failed to insert files for {0}. {1}'
                                            .format(datasetSpec.datasetName,
                                                    diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another process or its status is not applicable
                                        allUpdated = False
                                        tmpLog.debug(
                                            'escape since task or dataset is locked'
                                        )
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(
                                            len(missingFileList),
                                            datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {
                                            'datasetSpec': datasetSpec,
                                            'missingFiles': missingFileList
                                        }
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr, None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken', 'tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task.status={0}'.format(
                                taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                taskSpec,
                                pid=self.pid,
                                setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                getTaskStatus=True,
                                pid=self.pid,
                                useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task.status={0}'.format(
                                newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(
                            retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
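The nPFN fall-through above sizes the dummy file list for a seq_number pseudo-dataset. A condensed, runnable sketch of its first few branches, with illustrative parameter values (floor division stands in for the original Python 2 integer division):

def calc_npfn(taskParamMap, origNumFiles=None, default=10000):
    # explicit nFiles wins, scaled by events per file/job; otherwise fall
    # back to an nEvents-based estimate, then to the default record count
    if origNumFiles is not None:
        nPFN = origNumFiles
        if 'nEventsPerJob' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
            nPFN = nPFN * taskParamMap['nEventsPerFile'] // taskParamMap['nEventsPerJob']
    elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
        nPFN = taskParamMap['nEvents'] // taskParamMap['nEventsPerJob']
    else:
        nPFN = default
    return nPFN

print(calc_npfn({'nEvents': 100000, 'nEventsPerJob': 1000}))   # -> 100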
Example #39
0
 def runImpl(self):
     # cutoff for free disk space (5 TB, expressed in GB)
     diskThreshold = 5 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     thrInputSize = 1024*1024*1024
     thrInputNum = 100
     thrInputSizeFrac = 0.1
     thrInputNumFrac = 0.1
     cutOffRW = 50
     negWeightTape = 0.001
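     # (thresholds above: locality is only enforced once input exceeds ~1 GB
     #  or 100 files, requiring >=10% available locally; RW below cutOffRW
     #  keeps weight 1; tape-resident input is penalized by negWeightTape)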
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs'.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.debug('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check endpoint
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.debug('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.debug('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             if tmpSpaceSize < diskThreshold:
                                 tmpLog.debug('  skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                        tmpSpaceSize,
                                                                                                                                        diskThreshold,
                                                                                                                                        tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys())
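                         # tmpRet maps each nucleus to availability counters,
                         # e.g. 'tot_size', 'ava_size_any', 'ava_size_disk',
                         # 'tot_num' and 'ava_num_any', summed over datasets below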
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                         availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                         availableData[tmpNucleus]['tot_size'],
                                                                                                                                         thrInputSizeFrac))
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpLog.debug('  skip nucleus={0} due to insufficient number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                           availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                           availableData[tmpNucleus]['tot_num'],
                                                                                                                                           thrInputNumFrac))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                         # use any nuclei where input is available if no sites can run jobs
                         tmpRet = tmpSiteList
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.debug('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # RW
                     taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             weight *= float(availableData[tmpNucleus]['ava_size_any'])
                             weight /= float(availableData[tmpNucleus]['tot_size'])
                             wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                             wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                             # negative weight for tape
                             if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                 weight *= negWeightTape
                                 wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                         tmpLog.debug('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
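
The final selection above is a standard roulette-wheel draw: a random target in [0, totalWeight) is reduced by each candidate's weight until it drops to or below zero. A minimal standalone sketch of the same technique (pick_weighted and the nucleus names are illustrative, not part of JEDI):

    import random

    def pick_weighted(candidates):
        # candidates: list of (name, weight) pairs with non-negative weights
        total = sum(weight for _, weight in candidates)
        target = random.uniform(0, total)
        for name, weight in candidates:
            target -= weight
            if target <= 0:
                return name
        # guard against floating-point leftovers, as the code above does
        # with its trailing "candidateNucleus = nucleusweights[-1][0]"
        return candidates[-1][0]

    # pick_weighted([('CERN-PROD', 3.0), ('BNL-ATLAS', 1.0)]) returns
    # 'CERN-PROD' roughly three times out of four
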
Example #40
 def doGenerate(self,taskSpec,taskParamMap,**varMap):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start taskType={0}'.format(taskSpec.taskType))
     tmpLog.info(str(varMap))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         # check prodSourceLabel
         if taskSpec.prodSourceLabel in ['managed','test']:
             # check taskType
             if taskSpec.taskType == 'recov':
                 # generate parent tasks for lost file recovery if it is not yet generated
                 if taskParamMap.has_key('parentGenerated'):
                     tmpLog.info('skip since already generated parent tasks')
                 else:
                     tmpLog.info('generating parent tasks for lost file recovery')
                     # missing files are undefined
                     if not varMap.has_key('missingFilesMap'):
                         tmpLog.error('missing files are undefined')
                         return retFatal
                     missingFilesMap = varMap['missingFilesMap']
                     # check datasets
                     for datasetName,datasetValMap in missingFilesMap.iteritems():
                         # each dataset must have a container defined
                         datasetSpec = datasetValMap['datasetSpec']
                         if datasetSpec.containerName in ['',None]:
                             errStr = 'cannot make parent tasks due to undefined container for datasetID={0}:{1}'.format(datasetSpec.datasetID,
                                                                                                                         datasetName)
                             taskSpec.setErrDiag(errStr)
                             tmpLog.error(errStr)
                             return retFatal
                     # make parameters for new task
                     newJsonStrList = []    
                     for datasetName,datasetValMap in missingFilesMap.iteritems():
                         datasetSpec = datasetValMap['datasetSpec']
                         newTaskParamMap = {}
                         newTaskParamMap['oldDatasetName']  = datasetName
                         newTaskParamMap['lostFiles']       = datasetValMap['missingFiles']
                         newTaskParamMap['vo']              = taskSpec.vo
                         newTaskParamMap['cloud']           = taskSpec.cloud
                         newTaskParamMap['taskPriority']    = taskSpec.taskPriority
                         newTaskParamMap['taskType']        = taskSpec.taskType
                         newTaskParamMap['prodSourceLabel'] = taskSpec.prodSourceLabel
                         logDatasetName = 'panda.jedi{0}.log.{1}'.format(taskSpec.taskType,uuid.uuid4())
                         newTaskParamMap['log'] = {'dataset': logDatasetName,
                                                   'type':'template',
                                                   'param_type':'log',
                                                   'token':'ATLASDATADISK',
                                                   'value':'{0}.${{SN}}.log.tgz'.format(logDatasetName)}
                         # make new datasetname
                         outDatasetName = datasetName
                         # remove /
                         outDatasetName = re.sub('/$','',outDatasetName)
                         # remove extension
                         outDatasetName = re.sub('\.{0}\d+$'.format(taskSpec.taskType),'',outDatasetName)
                         # add extension
                         outDatasetName = outDatasetName + '.{0}{1}'.format(taskSpec.taskType,taskSpec.jediTaskID)
                         newTaskParamMap['output'] = {'dataset': outDatasetName}
                         if not datasetSpec.containerName in ['',None]:
                             newTaskParamMap['output']['container'] = datasetSpec.containerName
                         # make json
                         jsonStr = json.dumps(newTaskParamMap)
                         newJsonStrList.append(jsonStr)
                     # change original task parameters to not repeat the same procedure and to use newly produced files 
                     taskParamMap['parentGenerated']         = True
                     taskParamMap['useInFilesInContainer']   = True
                     taskParamMap['useInFilesWithNewAttemptNr'] = True
                     jsonStr = json.dumps(taskParamMap)
                     # insert and update task parameters
                     sTmp,newJediTaskIDs = self.taskBufferIF.insertUpdateTaskParams_JEDI(taskSpec.jediTaskID,
                                                                                         taskSpec.vo,
                                                                                         taskSpec.prodSourceLabel,
                                                                                         jsonStr,newJsonStrList)
                     if sTmp:
                         tmpLog.info('inserted/updated tasks in DB : new jediTaskIDs={0}'.format(str(newJediTaskIDs)))
                     else:
                         tmpLog.error('failed to insert/update tasks in DB')
                         return retFatal
         # return
         tmpLog.info('done')        
         return retOK
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doGenerate failed with {0}:{1}'.format(errtype.__name__,errvalue))
         return retFatal
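
The renaming logic in doGenerate derives the output dataset name for a recovery task from the damaged one in three regex steps. A sketch of just that step, assuming the same naming convention (make_recovery_name is a hypothetical helper, not a JEDI function):

    import re

    def make_recovery_name(dataset_name, task_type, jedi_task_id):
        # drop a trailing container slash
        name = re.sub('/$', '', dataset_name)
        # strip any previous '.<taskType><ID>' suffix
        name = re.sub(r'\.{0}\d+$'.format(task_type), '', name)
        # append the suffix for the current task
        return name + '.{0}{1}'.format(task_type, jedi_task_id)

    # make_recovery_name('mc.evgen.recov123/', 'recov', 456)
    # -> 'mc.evgen.recov456'
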
Example #41
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks, idxTasks = self.taskList.stat()
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
             tmpLog.info(
                 'start TaskCheckerThread {0}/{1} for jediTaskID={2}'.
                 format(idxTasks, totalTasks, taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             taskSpecList = []
             for jediTaskID in taskList:
                 tmpRet, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                     jediTaskID, False)
                 if tmpRet and taskSpec is not None:
                     taskSpecList.append(taskSpec)
                 else:
                     tmpLog.error(
                         'failed to get taskSpec for jediTaskID={0}'.format(
                             jediTaskID))
             if taskSpecList != []:
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         impl = self.implFactory.getImpl(
                             self.vo, self.prodSourceLabel)
                         if impl is None:
                             # task brokerage is undefined
                             tmpLog.error(
                                 'task broker is undefined for vo={0} sourceLabel={1}'
                                 .format(self.vo, self.prodSourceLabel))
                             tmpStat = Interaction.SC_FAILED
                     except Exception:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error('getImpl failed with {0}:{1}'.format(
                             errtype.__name__, errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # check
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('check with {0}'.format(
                         impl.__class__.__name__))
                     try:
                         tmpStat, taskCloudMap = impl.doCheck(taskSpecList)
                     except Exception:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error('doCheck failed with {0}:{1}'.format(
                             errtype.__name__, errvalue))
                         tmpStat = Interaction.SC_FAILED
                 # update
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to check assignment')
                 else:
                     tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(
                         taskCloudMap)
                     tmpLog.info('done with {0} for {1}'.format(
                         tmpRet, str(taskCloudMap)))
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
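
This runImpl, like the other worker threads in this collection, is a chunked consumer: it repeatedly pulls a slice of a shared, lock-protected list and terminates once the list is drained. A minimal sketch of that pattern, assuming the shared list only needs get() (ThreadedList here is a stand-in for the proxy-locked list JEDI actually passes in):

    import threading

    class ThreadedList(object):
        def __init__(self, items):
            self.lock = threading.Lock()
            self.items = list(items)

        def get(self, n):
            # hand out up to n items atomically
            with self.lock:
                chunk, self.items = self.items[:n], self.items[n:]
                return chunk

    def consume(task_list, process):
        while True:
            chunk = task_list.get(100)
            if not chunk:
                return  # no more items: the thread terminates
            for item in chunk:
                process(item)
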
Example #42
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs == None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr != None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary 
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs 
                                 if commentStr is not None and 'soft finish' in commentStr:
                                     tmpMsg = "waiting {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['reassign','finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)
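
The incexec branch merges user-supplied parameter overrides into the stored task parameters with delete-on-None semantics. A sketch of that merge in isolation (merge_task_params is illustrative, not part of RefinerUtils):

    def merge_task_params(task_params, new_params):
        # a value of None removes the key; anything else overwrites or adds
        merged = dict(task_params)
        for key, value in new_params.items():
            if value is None:
                merged.pop(key, None)
            else:
                merged[key] = value
        return merged

    # merge_task_params({'nFiles': 10, 'site': 'X'}, {'nFiles': None, 'site': 'Y'})
    # -> {'site': 'Y'}
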
Example #43
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # set cloud
     try:
         if not taskParamMap:
             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                 taskSpec.jediTaskID)
             taskParamMap = RefinerUtils.decodeJSON(taskParam)
         if not taskSpec.cloud and 'cloud' in taskParamMap:
             taskSpec.cloud = taskParamMap['cloud']
     except Exception:
         pass
     # get sites in the cloud
     site_preassigned = True
     if taskSpec.site not in ['', None]:
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         if self.siteMapper.checkSite(taskSpec.site):
             scanSiteList = [taskSpec.site]
         else:
             scanSiteList = []
             for tmpSite in self.siteMapper.getCloud(
                     taskSpec.cloud)['sites']:
                 if re.search(taskSpec.site, tmpSite):
                     scanSiteList.append(tmpSite)
             if not scanSiteList:
                 tmpLog.error('unknown site={}'.format(taskSpec.site))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     elif inputChunk.getPreassignedSite() is not None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         site_preassigned = False
         scanSiteList = self.siteMapper.getCloud(taskSpec.cloud)['sites']
         # remove NA
         if 'NA' in scanSiteList:
             scanSiteList.remove('NA')
         tmpLog.debug('cloud=%s has %s candidates' %
                      (taskSpec.cloud, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status and PandaSite
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         if tmpSiteSpec.status != 'online' and not site_preassigned:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
             continue
         # check PandaSite
         if 'PandaSite' in taskParamMap and taskParamMap['PandaSite']:
             if tmpSiteSpec.pandasite != taskParamMap['PandaSite']:
                 tmpLog.debug('  skip %s due to wrong PandaSite=%s <> %s' %
                              (tmpSiteName, tmpSiteSpec.pandasite,
                               taskParamMap['PandaSite']))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize(
     ) + inputChunk.getMaxAtomSize()
     minDiskCountS = minDiskCountS // 1024 // 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = taskSpec.getOutDiskSize(
         ) + taskSpec.getWorkDiskSize()
         minDiskCountR = minDiskCountR // 1024 // 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir:
             if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec,
                                                    inputChunk):
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if minWalltime not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     origMinRamCount = inputChunk.getMaxRamCount()
     if not site_preassigned and origMinRamCount:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # job memory requirement
             if taskSpec.ramPerCore():
                 minRamCount = origMinRamCount * (
                     tmpSiteSpec.coreCount if tmpSiteSpec.coreCount else 1)
                 minRamCount += (taskSpec.baseRamCount
                                 if taskSpec.baseRamCount else 0)
             else:
                 minRamCount = origMinRamCount
             # site max memory requirement
             site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0
             # check at the site
             if site_maxmemory and minRamCount and minRamCount > site_maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(
                     tmpSiteName, site_maxmemory, minRamCount)
                 tmpLog.debug(tmpMsg)
                 continue
             # site min memory requirement
             site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0
             if site_minmemory and minRamCount and minRamCount < site_minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(
                     tmpSiteName, site_minmemory, minRamCount)
                 tmpLog.info(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if tmpSiteName in nWNmap:
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                 'updateJob']
         if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             # NOTE: the skip is only logged; with the continue commented out,
             # pilot-less sites are still kept as candidates
             #continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites already used by the task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             tmpLog.debug(
                 'getting the list of available files for {0}'.format(
                     datasetSpec.datasetName))
             fileScanSiteList = []
             for tmpPseudoSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                 tmpSiteName = tmpSiteSpec.get_unified_name()
                 if tmpSiteName in fileScanSiteList:
                     continue
                 fileScanSiteList.append(tmpSiteName)
             # mapping between sites and input storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                 fileScanSiteList, self.siteMapper,
                 taskSpec.prodSourceLabel, None)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             else:
                 useCompleteOnly = False
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 check_completeness=checkCompleteness,
                 file_scan_in_container=False,
                 complete_only=useCompleteOnly)
             if tmpAvFileMap is None:
                 raise Interaction.JEDITemporaryError(
                     'ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except Exception as e:
             tmpLog.error('failed to get available files with {}'.format(e))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
         taskSpec.vo)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # files
         for tmpDatasetName, availableFiles in six.iteritems(
                 availableFileMap):
             if tmpSiteName in availableFiles:
                 siteCandidateSpec.add_local_disk_files(
                     availableFiles[tmpSiteName]['localdisk'])
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if weight not in weightMap:
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
     weightList = list(weightMap.keys())
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {} with weight={} nFiles={}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight,
             len(siteCandidateSpec.localDiskFiles)))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
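
The weight in the final procedure rewards sites that are running many jobs relative to their queued backlog. Written out as a function (site_weight is a name introduced here for illustration):

    def site_weight(n_running, n_activated, n_assigned):
        # high when many jobs run and few wait; the extra (nAssigned + 1)
        # divisor further penalizes sites with a large defined/assigned backlog
        return float(n_running + 1) / float(n_activated + n_assigned + 1) \
            / float(n_assigned + 1)

    # site_weight(100, 5, 5)   -> ~1.5    (busy site, small backlog)
    # site_weight(10, 200, 50) -> ~0.0009 (backlogged site is de-weighted)
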
Example #44
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     # get active PandaIDs to be killed
                     pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                     if pandaIDs == None:
                         tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                         tmpStat = Interaction.SC_FAILED
                     # kill jobs or update task
                     if tmpStat == Interaction.SC_SUCCEEDED:
                         if pandaIDs == []:
                             # done since no active jobs
                             tmpLog.info('completed the command')
                             tmpTaskSpec = JediTaskSpec()
                             tmpTaskSpec.jediTaskID = jediTaskID
                             updateTaskStatus = True
                             if commandStr != 'reassign':
                                 # keep oldStatus for task reassignment since it is reset when actually reassigned
                                 tmpTaskSpec.forceUpdate('oldStatus')
                             else:
                                 # extract cloud or site
                                 tmpItems = commentStr.split(':')
                                 if tmpItems[0] == 'cloud':
                                     tmpTaskSpec.cloud = tmpItems[1]
                                 else:
                                     tmpTaskSpec.site = tmpItems[1]
                                 # back to oldStatus if necessary 
                                 if tmpItems[2] == 'y':
                                     tmpTaskSpec.status = oldStatus
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                     updateTaskStatus = False
                             if updateTaskStatus:
                                 tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                             tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                         else:
                             tmpLog.info('sending kill command')
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                         tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
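
Both command-executor variants build a sparse JediTaskSpec, set only the attributes they want written, and mark extra columns with forceUpdate so they are reset even without a new value. A minimal sketch of that change-tracking idea; this is a guess at the mechanism for illustration, not JEDI's actual implementation:

    class SparseSpec(object):
        def __init__(self):
            object.__setattr__(self, '_changed', set())

        def __setattr__(self, name, value):
            # record every attribute that is assigned
            object.__setattr__(self, name, value)
            self._changed.add(name)

        def forceUpdate(self, name):
            # mark a column for update even without assigning a new value
            self._changed.add(name)

        def values_to_update(self):
            return dict((n, getattr(self, n, None)) for n in self._changed)

    # spec = SparseSpec(); spec.status = 'done'; spec.forceUpdate('oldStatus')
    # spec.values_to_update() -> {'status': 'done', 'oldStatus': None}
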
Example #45
    def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
        # make logger
        tmpLog = MsgWrapper(logger,
                            '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                            monToken='<jediTaskID={0} {1}>'.format(
                                taskSpec.jediTaskID,
                                datetime.datetime.utcnow().isoformat('/')))
        tmpLog.debug('start')
        # return for failure
        retFatal = self.SC_FATAL, inputChunk
        retTmpError = self.SC_FAILED, inputChunk
        # get primary site candidates
        sitePreAssigned = False
        excludeList = []
        includeList = None
        scanSiteList = []
        # get list of site access
        siteAccessList = self.taskBufferIF.listSiteAccess(
            None, taskSpec.userName)
        siteAccessMap = {}
        for tmpSiteName, tmpAccess in siteAccessList:
            siteAccessMap[tmpSiteName] = tmpAccess
        # site limitation
        if taskSpec.useLimitedSites():
            if 'excludedSite' in taskParamMap:
                excludeList = taskParamMap['excludedSite']
                # str to list for task retry
                try:
                    if type(excludeList) != types.ListType:
                        excludeList = excludeList.split(',')
                except:
                    pass
            if 'includedSite' in taskParamMap:
                includeList = taskParamMap['includedSite']
                # str to list for task retry
                if includeList == '':
                    includeList = None
                try:
                    if type(includeList) != types.ListType:
                        includeList = includeList.split(',')
                except:
                    pass
        # loop over all sites
        for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
            if tmpSiteSpec.type == 'analysis':
                scanSiteList.append(siteName)
        # preassigned
        if not taskSpec.site in ['', None]:
            # site is pre-assigned
            tmpLog.info('site={0} is pre-assigned'.format(taskSpec.site))
            sitePreAssigned = True
            if not taskSpec.site in scanSiteList:
                scanSiteList.append(taskSpec.site)
        tmpLog.info('initial {0} candidates'.format(len(scanSiteList)))
        # allowed remote access protocol
        allowedRemoteProtocol = 'fax'
        # MP
        if taskSpec.coreCount != None and taskSpec.coreCount > 1:
            # use MCORE only
            useMP = 'only'
        elif taskSpec.coreCount == 0:
            # use MCORE and normal
            useMP = 'any'
        else:
            # not use MCORE
            useMP = 'unuse'
        ######################################
        # selection for status
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # skip unified queues
            if tmpSiteSpec.is_unified:
                tmpLog.info(
                    '  skip site=%s due to is_unified=%s criteria=-unified' %
                    (tmpSiteName, tmpSiteSpec.is_unified))
                continue
            # check site status
            skipFlag = False
            if tmpSiteSpec.status in ['offline']:
                skipFlag = True
            elif tmpSiteSpec.status in ['brokeroff', 'test']:
                if not sitePreAssigned:
                    skipFlag = True
                elif tmpSiteName != taskSpec.site:
                    skipFlag = True
            if not skipFlag:
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.info(
                    '  skip site=%s due to status=%s criteria=-status' %
                    (tmpSiteName, tmpSiteSpec.status))
        scanSiteList = newScanSiteList
        tmpLog.info('{0} candidates passed site status check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for MP
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \
                    (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]):
                newScanSiteList.append(tmpSiteName)
            else:
                tmpLog.info('  skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \
                                (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount))
        scanSiteList = newScanSiteList
        tmpLog.info('{0} candidates passed for useMP={1}'.format(
            len(scanSiteList), useMP))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for release
        if taskSpec.transHome != None:
            unified_site_list = self.get_unified_sites(scanSiteList)
            if taskSpec.transHome.startswith('ROOT'):
                # hack until x86_64-slc6-gcc47-opt is published in installedsw
                if taskSpec.architecture == 'x86_64-slc6-gcc47-opt':
                    tmpCmtConfig = 'x86_64-slc6-gcc46-opt'
                else:
                    tmpCmtConfig = taskSpec.architecture
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    unified_site_list,
                    cmtConfig=tmpCmtConfig,
                    onlyCmtConfig=True)
            elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None \
                    or 'AnalysisBase' in taskSpec.transHome:
                # AthAnalysis
                siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                    unified_site_list,
                    cmtConfig=taskSpec.architecture,
                    onlyCmtConfig=True)
            else:
                # remove AnalysisTransforms-
                transHome = re.sub('^[^-]+-*', '', taskSpec.transHome)
                transHome = re.sub('_', '-', transHome)
                if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \
                        re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None and \
                        re.search('_\d+\.\d+\.\d+$',taskSpec.transHome) is None:
                    # cache is checked
                    siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list,
                        caches=transHome,
                        cmtConfig=taskSpec.architecture)
                elif (transHome == '' and taskSpec.transUses != None) or \
                        (re.search('_\d+\.\d+\.\d+$',taskSpec.transHome) is not None and \
                             (taskSpec.transUses is None or re.search('-\d+\.\d+$',taskSpec.transUses) is None)):
                    # remove Atlas-
                    transUses = taskSpec.transUses.split('-')[-1]
                    # release is checked
                    siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list,
                        releases=transUses,
                        cmtConfig=taskSpec.architecture)
                    siteListWithSW += self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list,
                        caches=transHome,
                        cmtConfig=taskSpec.architecture)
                else:
                    # nightlies
                    siteListWithSW = self.taskBufferIF.checkSitesWithRelease(
                        unified_site_list, releases='CVMFS')
            newScanSiteList = []
            for tmpSiteName in unified_site_list:
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # release check is disabled or release is available
                if tmpSiteSpec.releases == ['ANY']:
                    newScanSiteList.append(tmpSiteName)
                elif tmpSiteName in siteListWithSW:
                    newScanSiteList.append(tmpSiteName)
                else:
                    # release is unavailable
                    tmpLog.info('  skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \
                                 (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture))
            scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
            tmpLog.info('{0} candidates passed for SW {1}:{2}:{3}'.format(
                len(scanSiteList), taskSpec.transUses, taskSpec.transHome,
                taskSpec.architecture))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        ######################################
        # selection for memory
        minRamCount = inputChunk.getMaxRamCount()
        minRamCount = JediCoreUtils.compensateRamCount(minRamCount)
        if not minRamCount in [0, None]:
            newScanSiteList = []
            for tmpSiteName in scanSiteList:
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # site max memory requirement
                if not tmpSiteSpec.maxrss in [0, None]:
                    site_maxmemory = tmpSiteSpec.maxrss
                else:
                    site_maxmemory = tmpSiteSpec.maxmemory
                if site_maxmemory not in [0, None] and minRamCount != 0 and minRamCount > site_maxmemory:
                    tmpLog.info(
                        '  skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory'
                        .format(tmpSiteName, site_maxmemory, minRamCount))
                    continue
                # site min memory requirement
                if not tmpSiteSpec.minrss in [0, None]:
                    site_minmemory = tmpSiteSpec.minrss
                else:
                    site_minmemory = tmpSiteSpec.minmemory
                if site_minmemory not in [0, None] and minRamCount != 0 and minRamCount < site_minmemory:
                    tmpLog.info(
                        '  skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory'
                        .format(tmpSiteName, site_minmemory, minRamCount))
                    continue
                newScanSiteList.append(tmpSiteName)
            scanSiteList = newScanSiteList
            ramUnit = taskSpec.ramUnit
            if ramUnit is None:
                ramUnit = 'MB'
            tmpLog.info('{0} candidates passed memory check = {1} {2}'.format(
                len(scanSiteList), minRamCount, ramUnit))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        ######################################
        # selection for scratch disk
        tmpMaxAtomSize = inputChunk.getMaxAtomSize()
        tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True)
        tmpOutDiskSize = taskSpec.getOutDiskSize()
        tmpWorkDiskSize = taskSpec.getWorkDiskSize()
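        # scratch-disk requirement for copy-to-scratch: output size scaled by
        # the effective atom size, plus work area, plus the largest input
        # atom; converted from bytes to MB below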
        minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize
        minDiskCountS = minDiskCountS / 1024 / 1024
        # size for direct IO sites
        if taskSpec.useLocalIO():
            minDiskCountR = minDiskCountS
        else:
            minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize
            minDiskCountR = minDiskCountR / 1024 / 1024
        tmpLog.info(
            'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}'
            .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize,
                    tmpWorkDiskSize))
        tmpLog.info('minDiskCountScratch={0} minDiskCountRemote={1}'.format(
            minDiskCountS, minDiskCountR))
        newScanSiteList = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site
            if tmpSiteSpec.maxwdir != 0:
                if tmpSiteSpec.isDirectIO():
                    minDiskCount = minDiskCountR
                else:
                    minDiskCount = minDiskCountS
                if minDiskCount > tmpSiteSpec.maxwdir:
                    tmpLog.info(
                        '  skip site={0} due to small scratch disk={1} < {2} criteria=-disk'
                        .format(tmpSiteName, tmpSiteSpec.maxwdir,
                                minDiskCount))
                    continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed scratch disk check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for available space in SE
        newScanSiteList = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            # check endpoint
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            tmpEndPoint = tmpSiteSpec.ddm_endpoints_output.getEndPoint(
                tmpSiteSpec.ddm_output)
            if tmpEndPoint is not None:
                # free space must be >= 200GB
                diskThreshold = 200
                tmpSpaceSize = 0
                if tmpEndPoint['space_expired'] is not None:
                    tmpSpaceSize += tmpEndPoint['space_expired']
                if tmpEndPoint['space_free'] is not None:
                    tmpSpaceSize += tmpEndPoint['space_free']
                if tmpSpaceSize < diskThreshold:
                    tmpLog.info(
                        '  skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk'
                        .format(tmpSiteName, tmpSpaceSize, diskThreshold))
                    continue
                # check if blacklisted
                if tmpEndPoint['blacklisted'] == 'Y':
                    tmpLog.info(
                        '  skip site={0} since {1} is blacklisted in DDM criteria=-blacklist'
                        .format(tmpSiteName, tmpSiteSpec.ddm_output))
                    continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed SE space check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for walltime
        minWalltime = taskSpec.walltime
        if not minWalltime in [0, None] and minWalltime > 0:
            minWalltime *= tmpEffAtomSize
            newScanSiteList = []
            for tmpSiteName in scanSiteList:
                tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                # check at the site
                if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                    tmpLog.info(
                        '  skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime'
                        .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                    continue
                if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                    tmpLog.info(
                        '  skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime'
                        .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                    continue
                newScanSiteList.append(tmpSiteName)
            scanSiteList = newScanSiteList
            tmpLog.info('{0} candidates passed walltime check = {1} {2}'.format(
                len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        ######################################
        # selection for nPilot
        nWNmap = self.taskBufferIF.getCurrentSiteData()
        newScanSiteList = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            # check at the site
            nPilot = 0
            if nWNmap.has_key(tmpSiteName):
                nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][
                    'updateJob']
            if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']:
                tmpLog.info(
                    '  skip site=%s due to no pilot criteria=-nopilot' %
                    tmpSiteName)
                if not self.testMode:
                    continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed pilot activity check'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # check inclusion and exclusion
        newScanSiteList = []
        sitesForANY = []
        for tmpSiteName in self.get_unified_sites(scanSiteList):
            autoSite = False
            # check exclusion
            if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList):
                tmpLog.info(
                    '  skip site={0} excluded criteria=-excluded'.format(
                        tmpSiteName))
                continue
            # check inclusion
            if includeList != None and not AtlasBrokerUtils.isMatched(
                    tmpSiteName, includeList):
                if 'AUTO' in includeList:
                    autoSite = True
                else:
                    tmpLog.info(
                        '  skip site={0} not included criteria=-notincluded'.
                        format(tmpSiteName))
                    continue
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # limited access
            if tmpSiteSpec.accesscontrol == 'grouplist':
                if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \
                        siteAccessMap[tmpSiteSpec.sitename] != 'approved':
                    tmpLog.info(
                        '  skip site={0} limited access criteria=-limitedaccess'
                        .format(tmpSiteName))
                    continue
            # check cloud
            if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]:
                tmpLog.info(
                    '  skip site={0} cloud mismatch criteria=-cloudmismatch'.
                    format(tmpSiteName))
                continue
            if autoSite:
                sitesForANY.append(tmpSiteName)
            else:
                newScanSiteList.append(tmpSiteName)
        # use AUTO sites if no sites are included
        if newScanSiteList == []:
            newScanSiteList = sitesForANY
        else:
            for tmpSiteName in sitesForANY:
                tmpLog.info(
                    '  skip site={0} not included criteria=-notincluded'.
                    format(tmpSiteName))
        scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList)
        tmpLog.info('{0} candidates passed inclusion/exclusion/cloud'.format(
            len(scanSiteList)))
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        ######################################
        # selection for data availability
        hasDDS = False
        dataWeight = {}
        remoteSourceList = {}
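        # dataWeight accumulates a per-site locality bonus; remoteSourceList
        # maps each site to the datasets it would read from remote sources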
        if inputChunk.getDatasets() != []:
            oldScanSiteList = copy.copy(scanSiteList)
            oldScanUnifiedSiteList = self.get_unified_sites(oldScanSiteList)
            for datasetSpec in inputChunk.getDatasets():
                datasetName = datasetSpec.datasetName
                if not self.dataSiteMap.has_key(datasetName):
                    # get the list of sites where data is available
                    tmpLog.debug(
                        'getting the list of sites where {0} is available'.
                        format(datasetName))
                    tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData(
                        self.get_unified_sites(scanSiteList), self.siteMapper,
                        self.ddmIF, datasetName)
                    if tmpSt in [
                            Interaction.JEDITemporaryError,
                            Interaction.JEDITimeoutError
                    ]:
                        tmpLog.error(
                            'temporary failed to get the list of sites where data is available, since %s'
                            % tmpRet)
                        taskSpec.setErrDiag(
                            tmpLog.uploadLog(taskSpec.jediTaskID))
                        # send info to logger
                        self.sendLogMessage(tmpLog)
                        return retTmpError
                    if tmpSt == Interaction.JEDIFatalError:
                        tmpLog.error(
                            'fatal error when getting the list of sites where data is available, since %s'
                            % tmpRet)
                        taskSpec.setErrDiag(
                            tmpLog.uploadLog(taskSpec.jediTaskID))
                        # send info to logger
                        self.sendLogMessage(tmpLog)
                        return retFatal
                    # append
                    self.dataSiteMap[datasetName] = tmpRet
                    if datasetName.startswith('ddo'):
                        tmpLog.debug(' {0} sites'.format(len(tmpRet)))
                    else:
                        tmpLog.debug(' {0} sites : {1}'.format(
                            len(tmpRet), str(tmpRet)))
                        # check if distributed
                        if tmpRet != {}:
                            isDistributed = True
                            for tmpMap in tmpRet.values():
                                for tmpVal in tmpMap.values():
                                    if tmpVal['state'] == 'complete':
                                        isDistributed = False
                                        break
                                if not isDistributed:
                                    break
                            if isDistributed:
                                # check if really distributed
                                isDistributed = self.ddmIF.isDistributedDataset(
                                    datasetName)
                                if isDistributed:
                                    hasDDS = True
                                    datasetSpec.setDistributed()
                                    tmpLog.debug(' {0} is distributed'.format(
                                        datasetName))
                # check if the data is available at somewhere
                if self.dataSiteMap[datasetName] == {}:
                    for tmpSiteName in scanSiteList:
                        tmpLog.info(
                            '  skip site={0} data is unavailable criteria=-input'
                            .format(tmpSiteName))
                    tmpLog.error(
                        '{0} is unavailable at any site'.format(datasetName))
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    # send info to logger
                    self.sendLogMessage(tmpLog)
                    return retFatal
            # get the list of sites where data is available
            scanSiteList = None
            scanSiteListOnDisk = None
            normFactor = 0
            for datasetName, tmpDataSite in self.dataSiteMap.iteritems():
                normFactor += 1
                # get sites where replica is available
                tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                    tmpDataSite, includeTape=True)
                tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk(
                    tmpDataSite, includeTape=False)
                # get sites which can remotely access source sites
                if inputChunk.isMerging:
                    # disable remote access for merging
                    tmpSatelliteSites = {}
                elif (not sitePreAssigned) or (
                        sitePreAssigned and not taskSpec.site in tmpSiteList):
                    tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites(
                        tmpDiskSiteList,
                        self.taskBufferIF,
                        self.siteMapper,
                        nSites=50,
                        protocol=allowedRemoteProtocol)
                else:
                    tmpSatelliteSites = {}
                # make weight map for local
                for tmpSiteName in tmpSiteList:
                    if not dataWeight.has_key(tmpSiteName):
                        dataWeight[tmpSiteName] = 0
                    # give more weight to disk
                    if tmpSiteName in tmpDiskSiteList:
                        dataWeight[tmpSiteName] += 1
                    else:
                        dataWeight[tmpSiteName] += 0.001
                # make weight map for remote
                for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems():
                    # skip since local data is available
                    if tmpSiteName in tmpSiteList:
                        continue
                    tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
                    # penalty for remote access: the source weight is divided
                    # by wRemote, which shrinks as the WAN sink limit grows
                    wRemote = 50.0
                    if not tmpSiteSpec.wansinklimit in [0, None]:
                        wRemote /= float(tmpSiteSpec.wansinklimit)
                    # sum weight
                    if not dataWeight.has_key(tmpSiteName):
                        dataWeight[tmpSiteName] = float(
                            tmpWeightSrcMap['weight']) / wRemote
                    else:
                        dataWeight[tmpSiteName] += float(
                            tmpWeightSrcMap['weight']) / wRemote
                    # make remote source list
                    if not remoteSourceList.has_key(tmpSiteName):
                        remoteSourceList[tmpSiteName] = {}
                    remoteSourceList[tmpSiteName][
                        datasetName] = tmpWeightSrcMap['source']
                # first list
                if scanSiteList == None:
                    scanSiteList = []
                    for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                        if not tmpSiteName in oldScanUnifiedSiteList:
                            continue
                        if not tmpSiteName in scanSiteList:
                            scanSiteList.append(tmpSiteName)
                    scanSiteListOnDisk = set()
                    for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                        if not tmpSiteName in oldScanUnifiedSiteList:
                            continue
                        scanSiteListOnDisk.add(tmpSiteName)
                    continue
                # pickup sites which have all data
                newScanList = []
                for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys():
                    if tmpSiteName in scanSiteList and not tmpSiteName in newScanList:
                        newScanList.append(tmpSiteName)
                scanSiteList = newScanList
                tmpLog.debug('{0} is available at {1} sites'.format(
                    datasetName, len(scanSiteList)))
                # pickup sites which have all data on DISK
                newScanListOnDisk = set()
                for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys():
                    if tmpSiteName in scanSiteListOnDisk:
                        newScanListOnDisk.add(tmpSiteName)
                scanSiteListOnDisk = newScanListOnDisk
                tmpLog.debug('{0} is available at {1} sites on DISK'.format(
                    datasetName, len(scanSiteListOnDisk)))
            # check for preassigned
            if sitePreAssigned and not taskSpec.site in scanSiteList:
                scanSiteList = []
                tmpLog.info(
                    'data is unavailable locally or remotely at preassigned site {0}'
                    .format(taskSpec.site))
            elif len(scanSiteListOnDisk) > 0:
                # use only disk sites
                scanSiteList = list(scanSiteListOnDisk)
            scanSiteList = self.get_pseudo_sites(scanSiteList, oldScanSiteList)
            # dump
            for tmpSiteName in oldScanSiteList:
                if tmpSiteName not in scanSiteList:
                    tmpLog.info(
                        '  skip site={0} data is unavailable criteria=-input'.
                        format(tmpSiteName))
            tmpLog.info('{0} candidates have input data'.format(
                len(scanSiteList)))
            if scanSiteList == []:
                tmpLog.error('no candidates')
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retFatal
        ######################################
        # sites already used by task
        tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
            taskSpec.jediTaskID)
        if not tmpSt:
            tmpLog.error('failed to get sites which already used by task')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        sitesUsedByTask = self.get_unified_sites(sitesUsedByTask)
        ######################################
        # calculate weight
        """
        fqans = taskSpec.makeFQANs()
        tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans,
                                                                                                  taskSpec.workingGroup,True)
        currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight)
        currentPriority -= 500
        tmpLog.debug('currentPriority={0}'.format(currentPriority))
        """
        tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
            taskSpec.vo)
        if not tmpSt:
            tmpLog.error('failed to get job statistics with priority')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        # check for preassigned
        if sitePreAssigned and (taskSpec.site not in scanSiteList
                                and taskSpec.site
                                not in self.get_unified_sites(scanSiteList)):
            tmpLog.info("preassigned site {0} did not pass all tests".format(
                taskSpec.site))
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retFatal
        ######################################
        # final procedure
        tmpLog.info('final {0} candidates'.format(len(scanSiteList)))
        weightMap = {}
        candidateSpecList = []
        timeWindowForFC = 6
        preSiteCandidateSpec = None
        failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI(
            taskSpec.jediTaskID, timeWindowForFC)
        problematicSites = set()
        for tmpPseudoSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
            tmpSiteName = tmpSiteSpec.get_unified_name()
            # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
            nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                   'running', None, None)
            nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                    tmpSiteName, 'defined',
                                                    None, None)
            nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \
                         AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None)
            nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                    tmpSiteName, 'starting',
                                                    None, None)
            nFailed = 0
            nClosed = 0
            nFinished = 0
            if tmpSiteName in failureCounts:
                if 'failed' in failureCounts[tmpSiteName]:
                    nFailed = failureCounts[tmpSiteName]['failed']
                if 'closed' in failureCounts[tmpSiteName]:
                    nClosed = failureCounts[tmpSiteName]['closed']
                if 'finished' in failureCounts[tmpSiteName]:
                    nFinished = failureCounts[tmpSiteName]['finished']
            # problematic sites
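            # (recent failed+closed jobs outnumber finished ones by more than 2:1)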
            if nFailed + nClosed > 2 * nFinished:
                problematicSites.add(tmpSiteName)
            # calculate weight
            weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                                 nStarting + 1)
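            # weight ~ (nRunning+1)/(nActivated+nAssigned+nStarting+1), so
            # sites draining their queue rank above backlogged ones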
            nThrottled = 0
            if remoteSourceList.has_key(tmpSiteName):
                nThrottled = AtlasBrokerUtils.getNumJobs(
                    jobStatPrioMap, tmpSiteName, 'throttled', None, None)
                weight /= float(nThrottled + 1)
            # normalize weights by taking data availability into account
            tmpDataWeight = 1
            if dataWeight.has_key(tmpSiteName):
                weight = weight * dataWeight[tmpSiteName]
                tmpDataWeight = dataWeight[tmpSiteName]
            # make candidate
            siteCandidateSpec = SiteCandidate(tmpPseudoSiteName)
            # preassigned
            if sitePreAssigned and tmpSiteName == taskSpec.site:
                preSiteCandidateSpec = siteCandidateSpec
            # set weight
            siteCandidateSpec.weight = weight
            tmpStr = '  site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format(
                tmpPseudoSiteName, nRunning, nAssigned, nActivated, nStarting)
            tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format(
                nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight)
            tmpLog.info(tmpStr)
            # append
            if tmpSiteName in sitesUsedByTask:
                candidateSpecList.append(siteCandidateSpec)
            else:
                if not weightMap.has_key(weight):
                    weightMap[weight] = []
                weightMap[weight].append(siteCandidateSpec)
        # sort candidates by weights
        weightList = weightMap.keys()
        weightList.sort()
        weightList.reverse()
        for weightVal in weightList:
            sitesWithWeight = weightMap[weightVal]
            random.shuffle(sitesWithWeight)
            candidateSpecList += sitesWithWeight
        # limit the number of sites. use all sites for distributed datasets
        if not hasDDS:
            maxNumSites = 10
            # remove problematic sites
            candidateSpecList = AtlasBrokerUtils.skipProblematicSites(
                candidateSpecList, problematicSites, sitesUsedByTask,
                preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog)
        # append preassigned
        if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList:
            candidateSpecList.append(preSiteCandidateSpec)
        # collect site names
        scanSiteList = []
        for siteCandidateSpec in candidateSpecList:
            scanSiteList.append(siteCandidateSpec.siteName)
        # get list of available files
        availableFileMap = {}
        for datasetSpec in inputChunk.getDatasets():
            try:
                # get list of site to be scanned
                fileScanSiteList = []
                for tmpPseudoSiteName in scanSiteList:
                    tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                    tmpSiteName = tmpSiteSpec.get_unified_name()
                    if tmpSiteName in fileScanSiteList:
                        continue
                    fileScanSiteList.append(tmpSiteName)
                    if remoteSourceList.has_key(tmpSiteName) and \
                            remoteSourceList[tmpSiteName].has_key(datasetSpec.datasetName):
                        for tmpRemoteSite in remoteSourceList[tmpSiteName][
                                datasetSpec.datasetName]:
                            if not tmpRemoteSite in fileScanSiteList:
                                fileScanSiteList.append(tmpRemoteSite)
                # mapping between sites and input storage endpoints
                siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                    fileScanSiteList, self.siteMapper)

                # disable file lookup for merge jobs
                if inputChunk.isMerging:
                    checkCompleteness = False
                else:
                    checkCompleteness = True
                # get available files per site/endpoint
                tmpAvFileMap = self.ddmIF.getAvailableFiles(
                    datasetSpec,
                    siteStorageEP,
                    self.siteMapper,
                    check_completeness=checkCompleteness)
                if tmpAvFileMap == None:
                    raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed'
                availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
            except:
                errtype, errvalue = sys.exc_info()[:2]
                tmpLog.error('failed to get available files with %s %s' %
                             (errtype.__name__, errvalue))
                taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                # send info to logger
                self.sendLogMessage(tmpLog)
                return retTmpError
        # append candidates
        newScanSiteList = []
        for siteCandidateSpec in candidateSpecList:
            tmpPseudoSiteName = siteCandidateSpec.siteName
            tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
            tmpSiteName = tmpSiteSpec.get_unified_name()
            # preassigned
            if sitePreAssigned and tmpSiteName != taskSpec.site:
                tmpLog.info(
                    '  skip site={0} non pre-assigned site criteria=-nonpreassigned'
                    .format(tmpPseudoSiteName))
                continue
            # set available files
            if inputChunk.getDatasets() == []:
                isAvailable = True
            else:
                isAvailable = False
            for tmpDatasetName, availableFiles in availableFileMap.iteritems():
                tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName)
                # check remote files
                if remoteSourceList.has_key(tmpSiteName) and \
                        remoteSourceList[tmpSiteName].has_key(tmpDatasetName):
                    for tmpRemoteSite in remoteSourceList[tmpSiteName][
                            tmpDatasetName]:
                        if availableFiles.has_key(tmpRemoteSite) and \
                                len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']):
                            # use only remote disk files
                            siteCandidateSpec.remoteFiles += availableFiles[
                                tmpRemoteSite]['localdisk']
                            # set remote site and access protocol
                            siteCandidateSpec.remoteProtocol = allowedRemoteProtocol
                            siteCandidateSpec.remoteSource = tmpRemoteSite
                            isAvailable = True
                            break
                # local files
                if availableFiles.has_key(tmpSiteName):
                    if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \
                            len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \
                            (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0):
                        siteCandidateSpec.localDiskFiles += availableFiles[
                            tmpSiteName]['localdisk']
                        # add cached files to local list since cached files go to pending when reassigned
                        siteCandidateSpec.localDiskFiles += availableFiles[
                            tmpSiteName]['cache']
                        siteCandidateSpec.localTapeFiles += availableFiles[
                            tmpSiteName]['localtape']
                        siteCandidateSpec.cacheFiles += availableFiles[
                            tmpSiteName]['cache']
                        siteCandidateSpec.remoteFiles += availableFiles[
                            tmpSiteName]['remote']
                        siteCandidateSpec.addAvailableFiles(
                            availableFiles[tmpSiteName]['all'])
                        isAvailable = True
                    else:
                        tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}'
                        tmpLog.debug(
                            tmpMsg.format(
                                tmpDatasetName,
                                tmpPseudoSiteName,
                                len(tmpDatasetSpec.Files),
                                len(availableFiles[tmpSiteName]['localdisk']),
                                len(availableFiles[tmpSiteName]['cache']),
                                len(availableFiles[tmpSiteName]['localtape']),
                            ))
                if not isAvailable:
                    break
            # append
            if not isAvailable:
                tmpLog.info(
                    '  skip site={0} file unavailable criteria=-fileunavailable'
                    .format(siteCandidateSpec.siteName))
                continue
            inputChunk.addSiteCandidate(siteCandidateSpec)
            newScanSiteList.append(siteCandidateSpec.siteName)
            tmpLog.info(
                '  use site={0} with weight={1} nLocalDisk={2} nLocalTape={3} nCache={4} nRemote={5} criteria=+use'
                .format(
                    siteCandidateSpec.siteName,
                    siteCandidateSpec.weight,
                    len(siteCandidateSpec.localDiskFiles),
                    len(siteCandidateSpec.localTapeFiles),
                    len(siteCandidateSpec.cacheFiles),
                    len(siteCandidateSpec.remoteFiles),
                ))
        scanSiteList = newScanSiteList
        if scanSiteList == []:
            tmpLog.error('no candidates')
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            # send info to logger
            self.sendLogMessage(tmpLog)
            return retTmpError
        # send info to logger
        self.sendLogMessage(tmpLog)
        # return
        tmpLog.debug('done')
        return self.SC_SUCCEEDED, inputChunk
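A note on the ordering step above (weightMap, descending weight list, shuffle within equal weights): the same idea is easy to isolate. Below is a minimal, self-contained sketch with hypothetical site names; it is not the broker's actual code, just the same pattern.

import random

def order_candidates(weight_map):
    # walk distinct weights from best to worst and shuffle the sites
    # that share a weight, so ties are broken randomly
    ordered = []
    for weight in sorted(weight_map.keys(), reverse=True):
        group = list(weight_map[weight])
        random.shuffle(group)
        ordered += group
    return ordered

# two hypothetical sites tie at weight 2.0, one trails at 0.5
print(order_candidates({2.0: ['SITE_A', 'SITE_B'], 0.5: ['SITE_C']}))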
Example #46
0
    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                          jedi_config.watchdog.waitForPicked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        timeoutForPending = None
                        if hasattr(jedi_config.watchdog,'timeoutForPendingVoLabel'): 
                            timeoutForPending = JediCoreUtils.getConfigParam(jedi_config.watchdog.timeoutForPendingVoLabel,vo,prodSourceLabel)
                        if timeoutForPending == None:
                            timeoutForPending = jedi_config.watchdog.timeoutForPending
                        timeoutForPending = int(timeoutForPending)    
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                               jedi_config.watchdog.waitForPending,
                                                                               timeoutForPending)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # unlock tasks
                        tmpLog.info('unlock tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.unlockTasks_JEDI(vo,prodSourceLabel,
                                                                    jedi_config.watchdog.waitForLocked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to unlock')
                        else:
                            tmpLog.info('unlocked {0} tasks'.format(tmpRet))
                        # restart contents update
                        tmpLog.info('restart contents update for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.restartTasksForContentsUpdate_JEDI(vo,prodSourceLabel)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to restart')
                        else:
                            tmpLog.info('restarted {0} tasks'.format(tmpRet))
                        # kick exhausted tasks
                        tmpLog.info('kick exhausted tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.kickExhaustedTasks_JEDI(vo,prodSourceLabel,
                                                                           jedi_config.watchdog.waitForExhausted)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to kick')
                        else:
                            tmpLog.info('kicked {0} tasks'.format(tmpRet))
                        # finish tasks when goal is reached
                        tmpLog.info('finish achieved tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.getAchievedTasks_JEDI(vo,prodSourceLabel,
                                                                         jedi_config.watchdog.waitForAchieved)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to finish')
                        else:
                            for jediTaskID in tmpRet:
                                self.taskBufferIF.sendCommandTaskPanda(jediTaskID,'JEDI. Goal reached',True,'finish',comQualifier='soft')
                            tmpLog.info('finished {0} tasks'.format(tmpRet))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo,prodSourceLabel)
                        if impl != None:
                            tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat != Interaction.SC_SUCCEEDED:
                                tmpLog.error('failed to run special action for vo={0} label={1}'.format(vo,prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
                tmpLog.info('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()
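The tail of the loop above keeps a fixed cadence: it sleeps only for whatever is left of loopCycle after the iteration's own runtime. A minimal sketch of that pattern, assuming the cycle length is given in seconds (as jedi_config.watchdog.loopCycle appears to be):

import datetime
import time

def sleep_to_cadence(start_time, loop_cycle):
    # sleep for the remainder of the cycle; skip sleeping entirely
    # when the iteration already overran the cadence
    elapsed = (datetime.datetime.utcnow() - start_time).seconds
    if loop_cycle - elapsed > 0:
        time.sleep(loop_cycle - elapsed)

# usage: start = datetime.datetime.utcnow(); ...do work...; sleep_to_cadence(start, 60)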
Example #47
0
 def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     # returns
     retFatal    = self.SC_FATAL
     retTmpError = self.SC_FAILED
     retOK       = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output','log']:
                                 if not tmpFileSpec.datasetID in datasetToRegister:
                                     datasetToRegister.append(tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) 
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     if targetName == None:
                         continue
                     if not targetName in avDatasetList:
                         # set lifetime
                         if targetName.startswith('panda'):
                             lifetime = 14
                         else:
                             lifetime = None
                         # check dataset/container in DDM
                         tmpList = ddmIF.listDatasets(targetName)
                         if tmpList == []:
                             # get location
                             location = None
                             locForRule = None
                             if targetName == datasetSpec.datasetName:
                                 # dataset
                                 if datasetSpec.site in ['',None]:
                                     if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                         locForRule = datasetSpec.destination
                                     elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None:
                                         location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                     elif taskSpec.cloud != None:
                                         # use T1 SE
                                         tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                         location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken)
                                 else:
                                     location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                             if locForRule == None:
                                 locForRule = location
                             # set metadata
                             if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                 metaData = {}
                                 metaData['task_id'] = taskSpec.jediTaskID
                                 if not taskSpec.campaign in [None,'']:
                                     metaData['campaign'] = taskSpec.campaign 
                                 if datasetSpec.getTransient() != None:
                                     metaData['transient'] = datasetSpec.getTransient()
                             else:
                                 metaData = None
                             # register dataset/container
                             tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                      location,
                                                                                                                      ddmBackEnd,
                                                                                                                      lifetime,
                                                                                                                      str(metaData)))
                             tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                lifetime=lifetime,metaData=metaData)
                             if not tmpStat:
                                 tmpLog.error('failed to register {0}'.format(targetName))
                                 return retFatal
                             # procedures for user 
                             if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                 # register location
                                 tmpToRegister = False
                                 if userSetup and targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]:
                                     userName = taskSpec.userName
                                     grouping = None
                                     tmpToRegister = True
                                 elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                                     userName = None
                                     grouping = 'NONE'
                                     tmpToRegister = True
                                 if tmpToRegister:
                                     activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                      tmpLog.info('registering location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime,
                                                                                                                             activity,grouping))
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                             lifetime=lifetime,backEnd=ddmBackEnd,
                                                                             activity=activity,grouping=grouping)
                                     if not tmpStat:
                                         tmpLog.error('failed to register location {0} with {2} for {1}'.format(locForRule,
                                                                                                                targetName,
                                                                                                                ddmBackEnd))
                                         return retFatal
                             avDatasetList.append(targetName)
                         else:
                             tmpLog.info('{0} already registered'.format(targetName))
                 # check if dataset is in the container
                 if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container
                     if not cnDatasetMap.has_key(datasetSpec.containerName):
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) 
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # open datasets
         if taskSpec.prodSourceLabel in ['managed','test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output','log']:
                         if not tmpFileSpec.destinationDBlock in outDatasetList:
                             outDatasetList.append(tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
         # return
         tmpLog.info('done')        
         return retOK
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
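The container bookkeeping in doSetup caches each container's constituents (cnDatasetMap) so a dataset is attached at most once. A minimal sketch of that idempotent pattern; _StubDDM is a hypothetical stand-in for the DDM interface used above, but the two method names come from the snippet itself.

class _StubDDM(object):
    # hypothetical stand-in for the real ddmIF
    def listDatasetsInContainer(self, container):
        return []
    def addDatasetsToContainer(self, container, datasets):
        pass

def add_to_container(ddm, container, dataset, cache):
    # fetch the constituent list once per container, then attach only
    # datasets that are not already members
    if container not in cache:
        cache[container] = ddm.listDatasetsInContainer(container)
    if dataset not in cache[container]:
        ddm.addDatasetsToContainer(container, [dataset])
        cache[container].append(dataset)

cache = {}
ddm = _StubDDM()
add_to_container(ddm, 'user.cont/', 'user.cont.ds1/', cache)
add_to_container(ddm, 'user.cont/', 'user.cont.ds1/', cache)  # second call is a no-op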
Example #48
0
    def start(self):
        # start base classes
        JediKnight.start(self)
        FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
        # go into main loop
        while True:
            startTime = datetime.datetime.utcnow()
            try:
                # get logger
                tmpLog = MsgWrapper(logger)
                tmpLog.info('start')
                # loop over all vos
                for vo in self.vos:
                    # loop over all sourceLabels
                    for prodSourceLabel in self.prodSourceLabels:
                        # rescue picked files
                        tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                          jedi_config.watchdog.waitForPicked)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to rescue')
                        else:
                            tmpLog.info('rescued {0} tasks'.format(tmpRet))

                        # reactivate pending tasks
                        tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel)) 
                        tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                               jedi_config.watchdog.waitForPending,
                                                                               jedi_config.watchdog.timeoutForPending)
                        if tmpRet == None:
                            # failed
                            tmpLog.error('failed to reactivate')
                        else:
                            tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                        # vo/prodSourceLabel specific action
                        impl = self.getImpl(vo,prodSourceLabel)
                        if impl != None:
                            tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                            tmpStat = impl.doAction()
                            if tmpStat != Interaction.SC_SUCCEEDED:
                                tmpLog.error('failed to run special action for vo={0} label={1}'.format(vo,prodSourceLabel))
                            else:
                                tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
                tmpLog.info('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
            # sleep if needed
            loopCycle = jedi_config.watchdog.loopCycle
            timeDelta = datetime.datetime.utcnow() - startTime
            sleepPeriod = loopCycle - timeDelta.seconds
            if sleepPeriod > 0:
                time.sleep(sleepPeriod)
            # randomize cycle
            self.randomSleep()
Example #49
0
 def doSplit(self,
             taskSpec,
             inputChunk,
             siteMapper,
             allow_chunk_size_limit=False):
     # return for failure
     retFatal = self.SC_FATAL, []
     retTmpError = self.SC_FAILED, []
     # make logger
     tmpLog = MsgWrapper(
         logger, '< jediTaskID={0} datasetID={1} >'.format(
             taskSpec.jediTaskID, inputChunk.masterIndexName))
      tmpLog.debug(
          'start allow_chunk_size_limit={}'.format(allow_chunk_size_limit))
     if not inputChunk.isMerging:
         # set maxNumFiles using taskSpec if specified
         maxNumFiles = taskSpec.getMaxNumFilesPerJob()
         # set fsize gradients using taskSpec
         sizeGradients = taskSpec.getOutDiskSize()
         # set fsize intercepts using taskSpec
         sizeIntercepts = taskSpec.getWorkDiskSize()
         # walltime
         if not taskSpec.useHS06():
             walltimeGradient = taskSpec.walltime
         else:
             walltimeGradient = taskSpec.getCpuTime()
         # number of events per job if defined
         nEventsPerJob = taskSpec.getNumEventsPerJob()
         # number of files per job if defined
         if not taskSpec.dynamicNumEvents():
             nFilesPerJob = taskSpec.getNumFilesPerJob()
         else:
             nFilesPerJob = None
         if nFilesPerJob is None and nEventsPerJob is None and inputChunk.useScout() \
                 and not taskSpec.useLoadXML() and not taskSpec.respectSplitRule():
             nFilesPerJob = 1
         # grouping with boundaryID
         useBoundary = taskSpec.useGroupWithBoundaryID()
         # fsize intercepts per input size
         sizeGradientsPerInSize = None
          # max primary output size
         maxOutSize = None
         # max size per job
         maxSizePerJob = taskSpec.getMaxSizePerJob()
         if maxSizePerJob is not None:
             maxSizePerJob += InputChunk.defaultOutputSize
         # dynamic number of events
         dynNumEvents = taskSpec.dynamicNumEvents()
         # max number of event ranges
         maxNumEventRanges = None
         # multiplicity of jobs
         if taskSpec.useJobCloning():
             multiplicity = 1
         else:
             multiplicity = taskSpec.getNumEventServiceConsumer()
         # split with fields
          if taskSpec.getFieldNumToLFN() is not None and taskSpec.useFileAsSourceLFN():
             splitByFields = taskSpec.getFieldNumToLFN()
         else:
             splitByFields = None
     else:
         # set parameters for merging
         maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob()
         sizeGradients = 0
         walltimeGradient = 0
         nFilesPerJob = taskSpec.getNumFilesPerMergeJob()
         nEventsPerJob = taskSpec.getNumEventsPerMergeJob()
         maxSizePerJob = None
         useBoundary = {'inSplit': 3}
         dynNumEvents = False
         maxNumEventRanges = None
         multiplicity = None
         # gradients per input size is 1 + margin
         sizeGradientsPerInSize = self.sizeGradientsPerInSizeForMerge
         # intercepts for libDS
         sizeIntercepts = taskSpec.getWorkDiskSize()
          # margin of 500 MB for merging
         interceptsMergin = self.interceptsMerginForMerge
         if sizeIntercepts < interceptsMergin:
             sizeIntercepts = interceptsMergin
         maxOutSize = taskSpec.getMaxSizePerMergeJob()
         if maxOutSize is None:
             # max output size is 5GB for merging by default
             maxOutSize = 5 * 1024 * 1024 * 1024
         # split with fields
          if taskSpec.getFieldNumToLFN() is not None and taskSpec.useFileAsSourceLFN():
             splitByFields = list(
                 range(4 + 1, 4 + 1 + len(taskSpec.getFieldNumToLFN())))
         else:
             splitByFields = None
     # LB
     respectLB = taskSpec.respectLumiblock()
     # dump
     tmpLog.debug(
         'maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}'
         .format(maxNumFiles, sizeGradients, sizeIntercepts, useBoundary))
     tmpLog.debug(
         'walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format(
             walltimeGradient, nFilesPerJob, nEventsPerJob))
     tmpLog.debug('useScout={} isMerging={}'.format(inputChunk.useScout(),
                                                    inputChunk.isMerging))
     tmpLog.debug(
         'sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2} dynNumEvents={3}'
         .format(sizeGradientsPerInSize, maxOutSize, respectLB,
                 dynNumEvents))
     tmpLog.debug('multiplicity={0} splitByFields={1} nFiles={2}'.format(
         multiplicity, str(splitByFields),
         inputChunk.getNumFilesInMaster()))
     # split
     returnList = []
     subChunks = []
     iSubChunks = 0
     if inputChunk.useScout() and not inputChunk.isMerging:
         default_nSubChunks = 2
     elif taskSpec.is_hpo_workflow():
         default_nSubChunks = 2
     else:
         default_nSubChunks = 25
     subChunk = None
     nSubChunks = default_nSubChunks
     strict_chunkSize = False
     while True:
         # change site
         if iSubChunks % nSubChunks == 0 or subChunk == []:
             # append to return map
             if subChunks != []:
                 # get site names for parallel execution
                  if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging \
                          and inputChunk.useJumbo != 'fake':
                     siteName = inputChunk.getParallelSites(
                         taskSpec.getNumSitesPerJob(), nSubChunks,
                         [siteName])
                 returnList.append({
                     'siteName': siteName,
                     'subChunks': subChunks,
                     'siteCandidate': siteCandidate,
                 })
                 try:
                     gshare = taskSpec.gshare.replace(' ', '_')
                 except Exception:
                     gshare = None
                 tmpLog.info('split to nJobs=%s at site=%s gshare=%s' %
                             (len(subChunks), siteName, gshare))
                 # checkpoint
                 inputChunk.checkpoint_file_usage()
                 # reset
                 subChunks = []
             # skip PQs with chunk size limit
             ngList = []
             if not allow_chunk_size_limit:
                 for siteName in inputChunk.get_candidate_names():
                     siteSpec = siteMapper.getSite(siteName)
                     if siteSpec.get_job_chunk_size() is not None:
                         ngList.append(siteName)
             # new candidate
             siteCandidate, getCandidateMsg = inputChunk.getOneSiteCandidate(
                 nSubChunks, ngSites=ngList, get_msg=True)
             if siteCandidate is None:
                 tmpLog.debug('no candidate: {0}'.format(getCandidateMsg))
                 break
             siteName = siteCandidate.siteName
             siteSpec = siteMapper.getSite(siteName)
             # set chunk size
             nSubChunks = siteSpec.get_job_chunk_size()
             if nSubChunks is None:
                 nSubChunks = default_nSubChunks
                 strict_chunkSize = False
             else:
                 strict_chunkSize = True
             # directIO
             if not JediCoreUtils.use_direct_io_for_job(
                     taskSpec, siteSpec, inputChunk):
                 useDirectIO = False
             else:
                 useDirectIO = True
             # get maxSize if it is set in taskSpec
             maxSize = maxSizePerJob
             if maxSize is None:
                 # use maxwdir as the default maxSize
                 if not useDirectIO:
                     maxSize = siteCandidate.get_overridden_attribute(
                         'maxwdir')
                     if maxSize is None:
                         maxSize = siteSpec.maxwdir
                     if maxSize:
                         maxSize *= 1024 * 1024
                 elif nEventsPerJob is not None or nFilesPerJob is not None:
                     maxSize = None
                 else:
                     maxSize = siteCandidate.get_overridden_attribute(
                         'maxwdir')
                     if maxSize is None:
                         maxSize = siteSpec.maxwdir
                     maxSize = max(50000, maxSize) * 1024 * 1024
             else:
                 # add offset
                 maxSize += sizeIntercepts
             # max disk size
             maxDiskSize = siteCandidate.get_overridden_attribute('maxwdir')
             if maxDiskSize is None:
                 maxDiskSize = siteSpec.maxwdir
             if maxDiskSize:
                 maxDiskSize *= 1024 * 1024
             # max walltime
             maxWalltime = None
             if not inputChunk.isMerging:
                 maxWalltime = taskSpec.getMaxWalltime()
             if maxWalltime is None:
                 maxWalltime = siteSpec.maxtime
             # core count
             if siteSpec.coreCount:
                 coreCount = siteSpec.coreCount
             else:
                 coreCount = 1
             # core power
             corePower = siteSpec.corepower
             # max num of event ranges for dynNumEvents
             if dynNumEvents:
                 maxNumEventRanges = int(siteSpec.get_n_sim_events() //
                                         taskSpec.get_min_granularity())
                 if maxNumEventRanges == 0:
                     maxNumEventRanges = 1
             tmpLog.debug(
                 'chosen {0} : {1} : nQueue={2} nRunCap={3}'.format(
                     siteName, getCandidateMsg, siteCandidate.nQueuedJobs,
                     siteCandidate.nRunningJobsCap))
             tmpLog.debug('new weight {0}'.format(siteCandidate.weight))
             tmpLog.debug(
                 'maxSize={0} maxWalltime={1} coreCount={2} corePower={3} maxNumEventRanges={4} maxDisk={5}'
                 .format(maxSize, maxWalltime, coreCount, corePower,
                         maxNumEventRanges, maxDiskSize))
             tmpLog.debug('useDirectIO={0} label={1}'.format(
                 useDirectIO, taskSpec.prodSourceLabel))
         # get sub chunk
         subChunk = inputChunk.getSubChunk(
             siteName,
             maxSize=maxSize,
             maxNumFiles=maxNumFiles,
             sizeGradients=sizeGradients,
             sizeIntercepts=sizeIntercepts,
             nFilesPerJob=nFilesPerJob,
             walltimeGradient=walltimeGradient,
             maxWalltime=maxWalltime,
             nEventsPerJob=nEventsPerJob,
             useBoundary=useBoundary,
             sizeGradientsPerInSize=sizeGradientsPerInSize,
             maxOutSize=maxOutSize,
             coreCount=coreCount,
             respectLB=respectLB,
             corePower=corePower,
             dynNumEvents=dynNumEvents,
             maxNumEventRanges=maxNumEventRanges,
             multiplicity=multiplicity,
             splitByFields=splitByFields,
             tmpLog=tmpLog,
             useDirectIO=useDirectIO,
             maxDiskSize=maxDiskSize,
             enableLog=True)
         if subChunk is None:
             break
         if subChunk != []:
             # append
             subChunks.append(subChunk)
         iSubChunks += 1
     # append to return map if remain
     isSkipped = False
     if subChunks != []:
         # skip if chunk size is not enough
         if allow_chunk_size_limit and strict_chunkSize and len(
                 subChunks) < nSubChunks:
             tmpLog.debug(
                 'skip splitting since chunk size {} is less than chunk size limit {} at {}'
                 .format(len(subChunks), nSubChunks, siteName))
             inputChunk.rollback_file_usage()
             isSkipped = True
         else:
             # get site names for parallel execution
              if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging:
                 siteName = inputChunk.getParallelSites(
                     taskSpec.getNumSitesPerJob(), nSubChunks, [siteName])
             returnList.append({
                 'siteName': siteName,
                 'subChunks': subChunks,
                 'siteCandidate': siteCandidate,
             })
             try:
                 gshare = taskSpec.gshare.replace(' ', '_')
             except Exception:
                 gshare = None
             tmpLog.info('split to nJobs=%s at site=%s gshare=%s' %
                         (len(subChunks), siteName, gshare))
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, returnList, isSkipped
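
The heavy lifting in doSplit() is delegated to inputChunk.getSubChunk(), which packs files into jobs under several simultaneous limits. A simplified, hedged sketch of that greedy packing idea, ignoring walltime, boundaries, and size gradients; all names here are illustrative, not the real splitter API:

def greedy_split(files, max_files=None, max_size=None, max_events=None):
    """files: list of dicts with 'fsize' and 'nevents'. Returns a list
    of sub-chunks (lists of files) honoring the given per-job limits:
    keep adding files to the current job until one limit would be
    exceeded, then start a new job."""
    chunks, current, cur_size, cur_events = [], [], 0, 0
    for f in files:
        over_files = max_files is not None and len(current) + 1 > max_files
        over_size = max_size is not None and cur_size + f['fsize'] > max_size
        over_events = max_events is not None and cur_events + f['nevents'] > max_events
        if current and (over_files or over_size or over_events):
            chunks.append(current)
            current, cur_size, cur_events = [], 0, 0
        current.append(f)
        cur_size += f['fsize']
        cur_events += f['nevents']
    if current:
        chunks.append(current)
    return chunks

# e.g. greedy_split([{'fsize': 3, 'nevents': 100}] * 10, max_files=4)
# -> three chunks of 4, 4, and 2 files
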
Example #50
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 100
             taskList = self.taskList.get(nTasks)
             totalTasks, idxTasks = self.taskList.stat()
              # no more tasks
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # make logger
             tmpLog = MsgWrapper(self.logger)
              tmpLog.info(
                  'start TaskBrokerThread {0}/{1} for jediTaskIDs={2}'.format(
                      idxTasks, totalTasks, taskList))
             tmpStat = Interaction.SC_SUCCEEDED
             # get TaskSpecs
             tmpListToAssign = []
             for tmpTaskItem in taskList:
                 tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(
                     None,
                     None,
                     None,
                     None,
                     None,
                     simTasks=[tmpTaskItem],
                     readMinFiles=True)
                 if tmpListItem is None:
                     # failed
                     tmpLog.error(
                         'failed to get the input chunks for jediTaskID={0}'
                         .format(tmpTaskItem))
                     tmpStat = Interaction.SC_FAILED
                     break
                 tmpListToAssign += tmpListItem
             # get impl
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('getting Impl')
                 try:
                     impl = self.implFactory.getImpl(
                         self.vo, self.prodSourceLabel)
                     if impl is None:
                          # task broker is undefined
                         tmpLog.error(
                             'task broker is undefined for vo={0} sourceLabel={1}'
                             .format(self.vo, self.prodSourceLabel))
                         tmpStat = Interaction.SC_FAILED
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('getImpl failed with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                     tmpStat = Interaction.SC_FAILED
             # brokerage
             if tmpStat == Interaction.SC_SUCCEEDED:
                 tmpLog.info('brokerage with {0} for {1} tasks '.format(
                     impl.__class__.__name__, len(tmpListToAssign)))
                 try:
                     tmpStat = impl.doBrokerage(tmpListToAssign, self.vo,
                                                self.prodSourceLabel,
                                                self.workQueue,
                                                self.resource_name)
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('doBrokerage failed with {0}:{1}'.format(
                         errtype.__name__, errvalue))
                     tmpStat = Interaction.SC_FAILED
             # register
             if tmpStat != Interaction.SC_SUCCEEDED:
                 tmpLog.error('failed')
             else:
                 tmpLog.info('done')
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                 self.__class__.__name__, errtype.__name__, errvalue))
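
runImpl() follows a common JEDI worker pattern: threads repeatedly take fixed-size batches from a shared list and exit when it is drained. A minimal sketch, assuming a hypothetical SharedTaskList in place of the real thread-safe list object behind self.taskList:

import threading

class SharedTaskList:
    """Hypothetical stand-in for the thread-safe list behind self.taskList."""
    def __init__(self, items):
        self.items = list(items)
        self.lock = threading.Lock()

    def get(self, n):
        # hand out at most n items atomically
        with self.lock:
            batch, self.items = self.items[:n], self.items[n:]
            return batch

def worker(task_list, process, batch_size=100):
    while True:
        batch = task_list.get(batch_size)
        if not batch:
            return  # terminate like runImpl() when no more items
        for task_id in batch:
            process(task_id)

# usage: four threads draining 250 dummy task IDs
tasks = SharedTaskList(range(250))
threads = [threading.Thread(target=worker, args=(tasks, print)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
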
Example #51
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50*1000*1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                            workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
                     .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader+" "+msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(msgHeader,
                                                                                                          highestPrioWaiting,
                                                                                                          highestPrioInPandaDB,
                                                                                                          nNotRunHighestPrio,
                                                                                                          highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(msgHeader, nQueueLimit,
                                                                           configRunningCap, configQueueCap))
        tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(msgHeader,
                                                                                             nNotRun_gs + nDefine_gs,
                                                                                             nDefine_gs, nRunning_gs))
        tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(msgHeader,
                                                                                                                nNotRun_rt + nDefine_rt,
                                                                                                                nDefine_rt, nRunning_rt,
                                                                                                                totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                     nQueueLimit, totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                     nQueueLimit, totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(nNotRun_rt + nDefine_rt, nRunning_rt,
                                                                                                               threshold, nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                               nQueueLimit, totWalltime,
                                                                                                               minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(nNotRun_gs + nDefine_gs, nRunning_gs,
                                                                                                               threshold, nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                               nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(nWaiting_rt, nRunning_rt, nWaitingLimit,
                                                                                                    nJobsInBunch, nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader+" not enough jobs queued")
                self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit/20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader+" "+msgBody)
        return self.retUnThrottled
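
Stripped of the per-queue and per-resource-type bookkeeping, the throttling decision above reduces to a queued-to-running ratio test combined with a queue-limit test. A hedged sketch of just that core check; function name, parameters, and defaults are illustrative only:

def should_throttle(n_running, n_queued, n_queue_limit, threshold=2.0):
    """Return True when enough jobs are already queued relative to the
    number running, mirroring the nQueued/nRunning > threshold branches
    of toBeThrottled() above."""
    if n_running == 0:
        # no pilots running: throttle once the queue limit is reached,
        # as in the "no running and enough nQueued" branch
        return n_queued > n_queue_limit
    return (float(n_queued) / n_running > threshold
            and n_queued > n_queue_limit)

# e.g. should_throttle(n_running=100, n_queued=250, n_queue_limit=200)
# -> True (ratio 2.5 > 2.0 and 250 > 200)
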