Example #1
    def doActionForReassign(self,gTmpLog):
        # get DDM I/F
        ddmIF = self.ddmIF.getInterface(self.vo)
        # get site mapper
        siteMapper = self.taskBufferIF.getSiteMapper()
        # get tasks to be reassigned
        taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)

        gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
        for taskSpec in taskList:
            tmpLog = MsgWrapper(logger, '< jediTaskID={0} >'.format(taskSpec.jediTaskID))
            tmpLog.debug('start to reassign')
            # DDM backend
            ddmBackEnd = taskSpec.getDdmBackEnd()
            # get datasets
            tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
            if tmpStat is not True:
                tmpLog.error('failed to get datasets')
                continue
            # update DB
            if not taskSpec.useWorldCloud():
                # update cloudtasks
                tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
                if tmpStat != 'SUCCEEDED':
                    tmpLog.error('failed to update CloudTasks')
                    continue
                # check cloud
                if not siteMapper.checkCloud(taskSpec.cloud):
                    tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                    continue
            else:
                # re-run task brokerage
                if taskSpec.nucleus in [None,'']:
                    taskSpec.status = 'assigning'
                    taskSpec.oldStatus = None
                    taskSpec.setToRegisterDatasets()
                    self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID': taskSpec.jediTaskID},
                                                      setOldModTime=True)
                    tmpLog.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'.
                                 format(taskSpec.status))
                    continue

                # get nucleus
                nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
                if nucleusSpec is None:
                    tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                    continue

                # set nucleus
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)

            # get T1/nucleus
            if not taskSpec.useWorldCloud():
                t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
            else:
                t1SiteName = nucleusSpec.getOnePandaSite()
            t1Site = siteMapper.getSite(t1SiteName)

            # loop over all datasets
            isOK = True
            for datasetSpec in datasetSpecList:
                tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
                if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                    tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                    continue
                # get location
                location = siteMapper.getDdmEndpoint(t1Site.sitename, datasetSpec.storageToken, taskSpec.prodSourceLabel,
                                                     JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                # make subscription
                try:
                    tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                           ddmBackEnd))
                    tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                                'Production Output',asynchronous=True)
                    if tmpStat is not True:
                        tmpLog.error("failed to make subscription")
                        isOK = False
                        break
                except Exception:
                    errtype,errvalue = sys.exc_info()[:2]
                    tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                    isOK = False
                    break
            # succeeded
            if isOK:
                # activate task
                if taskSpec.oldStatus in ['assigning','exhausted',None]:
                    taskSpec.status = 'ready'
                else:
                    taskSpec.status = taskSpec.oldStatus
                taskSpec.oldStatus = None
                self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('finished to reassign')
Example #2
 def doActionForReassgin(self,gTmpLog):
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(self.vo)
     # get site mapper
     siteMapper = self.taskBufferIF.getSiteMapper()
     # get tasks to be reassigned
     taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
     gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
     for taskSpec in taskList:
         tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
         tmpLog.debug('start to reassign')
         # DDM backend
         ddmBackEnd = taskSpec.getDdmBackEnd()
         # get datasets
         tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,['output','log'])
         if tmpStat is not True:
             tmpLog.error('failed to get datasets')
             continue
         # update DB
         if not taskSpec.useWorldCloud():
             # update cloudtasks
             tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,taskSpec.cloud,'assigned',True)
             if tmpStat != 'SUCCEEDED':
                 tmpLog.error('failed to update CloudTasks')
                 continue
             # check cloud
             if not siteMapper.checkCloud(taskSpec.cloud):
                 tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                 continue
         else:
             # re-run task brokerage
             if taskSpec.nucleus in [None,'']:
                 taskSpec.status = 'assigning'
                 taskSpec.oldStatus = None
                 taskSpec.setToRegisterDatasets()
                 self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                   setOldModTime=True)
                 tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                 continue
             # get nucleus
             nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
             if nucleusSpec is None:
                 tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                 continue
             # set nucleus
             retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
             tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
         # get T1/nucleus
         if not taskSpec.useWorldCloud():
             t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
         else:
             t1SiteName = nucleusSpec.getOnePandaSite()
         t1Site = siteMapper.getSite(t1SiteName)
         # loop over all datasets
         isOK = True
         for datasetSpec in datasetSpecList:
             tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
             if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                 tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                 continue
             # get location
             location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
             # make subscription
             try:
                 tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                        ddmBackEnd))
                 tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                             'Production Output',asynchronous=True)
                 if tmpStat is not True:
                     tmpLog.error("failed to make subscription")
                     isOK = False
                     break
             except Exception:
                 errtype,errvalue = sys.exc_info()[:2]
                 tmpLog.warning('failed to make subscription with {0}:{1}'.format(errtype.__name__,errvalue))
                 isOK = False
                 break
         # succeeded
         if isOK:    
             # activate task
             if taskSpec.oldStatus in ['assigning','exhausted',None]:
                 taskSpec.status = 'ready'
             else:
                 taskSpec.status = taskSpec.oldStatus
             taskSpec.oldStatus = None
             self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                               setOldModTime=True)
             tmpLog.debug('finished to reassign')
Example #3
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        workQueueName = workQueue.queue_name

        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit +
                                                  nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs,
                   nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt,
                    nRunning_rt, totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nNotRun_rt + nDefine_rt, nRunning_rt, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(
                    nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                    nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                    nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                    nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                               self.msgType,
                               msgLevel='warning',
                               escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader + " not enough jobs queued")
                if workQueue.queue_name not in non_rt_wqs:
                    self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
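
The throttling decision in the example above ultimately reduces to comparing the queued jobs (not-running plus defined) against the running jobs and a configured queue limit. The following standalone sketch only illustrates that core ratio check; the function name, parameters, and simplified boolean return are hypothetical and not part of JEDI:

def should_skip_generation(n_running, n_queued, n_defined, n_queue_limit, threshold=2.0):
    # Hypothetical, simplified version of the checks above: returns True when
    # job generation should be skipped (throttled) for a queue.
    # nothing running while the queue is already full: pilots or DDM may be stuck
    if n_running == 0 and n_queued > n_queue_limit:
        return True
    # enough jobs are already queued relative to what is running
    if n_running > 0 and float(n_queued) / n_running > threshold and n_queued > n_queue_limit:
        return True
    # brokerage is stuck: too many jobs still in 'defined'
    if n_defined > n_queue_limit:
        return True
    return False

# example: nothing running and 2500 queued against a limit of 2000 -> throttle
print(should_skip_generation(0, 2500, 100, 2000))     # True
print(should_skip_generation(1000, 1500, 100, 2000))  # False
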
Example #4
 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
     # component name
     compName = 'prod_job_throttler'
     # params
     nBunch = 4
     threshold = 2.0
     thresholdForSite = threshold - 1.0
     nJobsInBunchMax = 600
     nJobsInBunchMin = 500
     nJobsInBunchMaxES = 1000
     if workQueue.criteria is not None and 'site' in workQueue.criteria:
         minTotalWalltime = 10*1000*1000
     else:
         minTotalWalltime = 50*1000*1000
     nWaitingLimit = 4
     nWaitingBunchLimit = 2
     nParallel = 2
     # make logger
     tmpLog = MsgWrapper(logger)
     workQueueIDs = workQueue.getIDs()
     msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
     tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
     # change threshold
     if workQueue.queue_name in ['mcore']:
         threshold = 5.0
     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         msgBody = "SKIP cloud={0} undefined".format(cloudName)
         tmpLog.warning(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
         tmpLog.warning(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     if cloudSpec['status'] in ['test']:
         if workQueue.queue_name != 'test':
             msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                               workQueue.queue_name)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             tmpLog.warning(msgHeader+" "+msgBody)
             return self.retThrottled
     # check if unthrottled
     if workQueue.queue_share is None:
         msgBody = "PASS unthrottled since share=None"
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retUnThrottled
     # count number of jobs in each status
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     for workQueueID in workQueueIDs:
         if jobStat.has_key(cloudName) and \
                jobStat[cloudName].has_key(workQueueID):
             tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
             for pState,pNumber in jobStat[cloudName][workQueueID].iteritems():
                 if pState in ['running']:
                     nRunning += pNumber
                 elif pState in ['assigned','activated','starting']:
                     nNotRun  += pNumber
                 elif pState in ['defined']:
                     nDefine  += pNumber
                 elif pState in ['waiting']:
                     nWaiting += pNumber
     # check if higher prio tasks are waiting
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks 
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
     if highestPrioWaiting is None:
         msgBody = 'failed to get the highest priority of waiting tasks'
         tmpLog.error(msgHeader+" "+msgBody)
         return self.retTmpError
     # high priority tasks are waiting
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                      nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                              highestPrioInPandaDB,
                                                                                                              nNotRunHighestPrio,
                                                                                                              highPrioQueued))
     # set maximum number of jobs to be submitted
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     else:
         if workQueue.queue_name in ['evgensimul']:
             # use higher limit for evgensimul
             if tmpRemainingSlot < nJobsInBunchMaxES:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMaxES
         else:
             if tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     nQueueLimit = nJobsInBunch*nBunch
     # use special nQueueLimit
     tmpVal = self.taskBufferIF.getConfigValue(compName, 'NQUEUELIMIT_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     if tmpVal is not None:
         nQueueLimit = tmpVal
     # use nPrestage for reprocessing   
     if workQueue.queue_name in ['reprocessing','mcore_repro']:
         # reset nJobsInBunch
         if nQueueLimit > (nNotRun+nDefine):
             tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
             if tmpRemainingSlot < nJobsInBunch:
                 pass
             elif tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     # get cap
     nRunningCap = self.taskBufferIF.getConfigValue(compName, 'NRUNNINGCAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     nQueueCap = self.taskBufferIF.getConfigValue(compName, 'NQUEUECAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     # set number of jobs to be submitted
     self.setMaxNumJobs(nJobsInBunch/nParallel)
     # get total walltime
     totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo,prodSourceLabel,workQueue,cloudName)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     limitPriority = False
     tmpStr = msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3} totWalltime:{4} nRunCap:{5} nQueueCap:{6}"
     tmpLog.debug(tmpStr.format(nQueueLimit,
                                nNotRun+nDefine,
                                nDefine,
                                nRunning,
                                totWalltime,
                                nRunningCap,
                                nQueueCap))
     # check
     if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
         limitPriority = True
         if not highPrioQueued:
             # pilot is not running or DDM has a problem
             msgBody = "SKIP no running and enough nQueued({0})>{1} totWalltime({2})>{3} ".format(nNotRun+nDefine,nQueueLimit,
                                                                                                  totWalltime,minTotalWalltime)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nRunning != 0 and float(nNotRun+nDefine)/float(nRunning) > threshold and \
             (nNotRun+nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
         limitPriority = True
         if not highPrioQueued:
             # enough jobs in Panda
             msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued({3})>{4} totWalltime({5})>{6}".format(nNotRun+nDefine,nRunning,
                                                                                                            threshold,nNotRun+nDefine,
                                                                                                            nQueueLimit,
                                                                                                            totWalltime,minTotalWalltime)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nDefine > nQueueLimit:
         limitPriority = True
         if not highPrioQueued:
             # brokerage is stuck
             msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
         limitPriority = True
         if not highPrioQueued:
             # too many waiting
             msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                           nJobsInBunch,nWaitingBunchLimit)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nRunningCap is not None and nRunning > nRunningCap:
         limitPriority = True
         if not highPrioQueued:
             # cap on running
             msgBody = "SKIP nRunning({0})>nRunningCap({1})".format(nRunning,nRunningCap)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nQueueCap is not None and nNotRun+nDefine > nQueueCap:
         limitPriority = True
         if not highPrioQueued:
             # cap on queued
             msgBody = "SKIP nQueue({0})>nQueueCap({1})".format(nNotRun+nDefine,nQueueCap)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     # get jobs from prodDB
     limitPriorityValue = None
     if limitPriority:
         limitPriorityValue = highestPrioWaiting
         self.setMinPriority(limitPriorityValue)
     else:
         # not enough jobs are queued
         if nNotRun+nDefine < max(nQueueLimit,nRunning) or (totWalltime is not None and totWalltime < minTotalWalltime):
             tmpLog.debug(msgHeader+" not enough jobs queued")
             self.notEnoughJobsQueued()
             self.setMaxNumJobs(max(self.maxNumJobs,nQueueLimit/20))
     msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
     tmpLog.debug(msgHeader+" "+msgBody)
     return self.retUnThrottled
Example #5
 def getLatestDBRelease(self):
     methodName = 'getLatestDBRelease'
     tmpLog = MsgWrapper(logger,methodName)
     tmpLog.info('trying to get the latest version number of DBR')
     # get ddo datasets
     tmpStat,ddoDatasets = self.listDatasets('ddo.*')
     if tmpStat != self.SC_SUCCEEDED or ddoDatasets == {}:
         tmpLog.error('failed to get a list of DBRelease datasets from DQ2')
         return self.SC_FAILED,None
     # reverse sort to avoid redundant lookup   
     ddoDatasets.sort()
     ddoDatasets.reverse()
     # extract version number
     latestVerMajor = 0
     latestVerMinor = 0
     latestVerBuild = 0
     latestVerRev   = 0
     latestDBR = ''
     for tmpName in ddoDatasets:
         # ignore CDRelease
         if ".CDRelease." in tmpName:
             continue
         # ignore user
         if tmpName.startswith('ddo.user'):
             continue
         # use Atlas.Ideal
         if ".Atlas.Ideal." not in tmpName:
             continue
         match = re.search(r'\.v(\d+)(_*[^\.]*)$',tmpName)
         if match is None:
             tmpLog.warning('cannot extract version number from %s' % tmpName)
             continue
         # ignore special DBRs
         if match.group(2) != '':
             continue
         # get major,minor,build,revision numbers
         tmpVerStr = match.group(1)
         tmpVerMajor = 0
         tmpVerMinor = 0
         tmpVerBuild = 0
         tmpVerRev   = 0
         try:
             tmpVerMajor = int(tmpVerStr[0:2])
         except Exception:
             pass
         try:
             tmpVerMinor = int(tmpVerStr[2:4])
         except Exception:
             pass
         try:
             tmpVerBuild = int(tmpVerStr[4:6])
         except Exception:
             pass
         try:
             tmpVerRev = int(tmpVerStr[6:])
             # use only three digit DBR
             continue
         except Exception:
             pass
         # compare
         if latestVerMajor > tmpVerMajor:
             continue
         elif latestVerMajor == tmpVerMajor:
             if latestVerMinor > tmpVerMinor:
                 continue
             elif latestVerMinor == tmpVerMinor:
                 if latestVerBuild > tmpVerBuild:
                     continue
                 elif latestVerBuild == tmpVerBuild:
                     if latestVerRev > tmpVerRev:
                         continue
         # check if well replicated
         tmpStat,ddoReplicas = self.listDatasetReplicas(tmpName)
         if len(ddoReplicas) < 10:
             continue
         # higher or equal version
         latestVerMajor = tmpVerMajor
         latestVerMinor = tmpVerMinor
         latestVerBuild = tmpVerBuild
         latestVerRev   = tmpVerRev
         latestDBR = tmpName
     # failed
     if latestDBR == '':
         tmpLog.error('failed to get the latest version of DBRelease dataset from DQ2')
         return self.SC_FAILED,None
     tmpLog.info('use {0}'.format(latestDBR))
     return self.SC_SUCCEEDED,latestDBR
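
The nested major/minor/build comparison above can also be expressed with tuple ordering. A minimal sketch under the same slicing assumptions (two digits per field, names ending in '.vNNNNNN'); the helper name and the sample dataset names are made up for illustration:

import re

def parse_dbr_version(dataset_name):
    # return (major, minor, build) for plain three-field DBR names, else None
    match = re.search(r'\.v(\d+)(_*[^\.]*)$', dataset_name)
    if match is None or match.group(2) != '':
        return None
    ver = match.group(1)
    if len(ver) > 6:
        # a fourth (revision) field is present -> skip it, as the code above does
        return None
    return (int(ver[0:2]), int(ver[2:4] or 0), int(ver[4:6] or 0))

# example: pick the latest of two candidate names by tuple comparison
names = ['ddo.000001.Atlas.Ideal.DBRelease.v220701',
         'ddo.000001.Atlas.Ideal.DBRelease.v310101']
print(max((n for n in names if parse_dbr_version(n)), key=parse_dbr_version))
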
Example #6
 def getFilesInDataset(self,datasetName,getNumEvents=False,skipDuplicate=True):
     methodName = 'getFilesInDataset'
     methodName += ' <datasetName={0}>'.format(datasetName)
     tmpLog = MsgWrapper(logger,methodName)
     try:
         # get DQ2 API            
         dq2=DQ2()
         # get file list
         tmpRet = dq2.listFilesInDataset(datasetName)
         if tmpRet == ():
             fileMap = {}
         else:
             fileMap = tmpRet[0]
             # skip duplicated files
             if skipDuplicate:
                 newFileMap = {}
                 baseLFNmap = {}
                 for tmpGUID,valMap in fileMap.iteritems():
                     # extract base LFN and attempt number
                     lfn = valMap['lfn']
                     baseLFN = re.sub(r'(\.(\d+))$','',lfn)
                     attNr = re.sub(baseLFN+r'\.*','',lfn)
                     if attNr == '':
                         # without attempt number
                         attNr = -1
                     else:
                         attNr = int(attNr)
                     # compare attempt numbers    
                     addMap = False    
                     if baseLFNmap.has_key(baseLFN):
                         # use larger attempt number
                         oldMap = baseLFNmap[baseLFN]
                         if oldMap['attNr'] < attNr:
                             del newFileMap[oldMap['guid']]
                             addMap = True
                     else:
                         addMap = True
                     # append    
                     if addMap:    
                         baseLFNmap[baseLFN] = {'guid':tmpGUID,
                                                'attNr':attNr}
                         newFileMap[tmpGUID] = valMap
                 # use new map
                 fileMap = newFileMap
             # get number of events in each file
             if getNumEvents:
                 try:
                     amiDatasetName = re.sub(r'(_tid\d+)*(_\d+)*/$','',datasetName)
                     amiclient = AMIClient()
                     for amiItem in amiquery.get_files(amiclient,amiDatasetName):
                         amiGUID = amiItem['fileGUID']
                         if fileMap.has_key(amiGUID):
                             fileMap[amiGUID]['nevents'] = long(amiItem['events'])
                 except Exception:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = '{0} AMI failed with {1} {2}'.format(methodName,errtype.__name__,errvalue)
                     tmpLog.warning(errStr)
         return self.SC_SUCCEEDED,fileMap
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         errCode = self.checkError(errtype)
         errStr = '{0} {1}'.format(errtype.__name__,errvalue)
         tmpLog.error(errStr)
         return errCode,'{0} : {1}'.format(methodName,errStr)
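
The duplicate handling in getFilesInDataset keeps, for each base LFN, only the entry with the largest attempt number. A self-contained sketch of the same idea (the input layout and sample LFNs are hypothetical, reduced to GUID -> {'lfn': ...}):

import re

def drop_duplicate_attempts(file_map):
    # keep only the highest attempt number per base LFN; LFNs may end in '.<attempt>'
    best = {}      # base LFN -> (attempt number, GUID)
    new_map = {}
    for guid, val in file_map.items():
        lfn = val['lfn']
        base = re.sub(r'\.\d+$', '', lfn)
        att = -1 if base == lfn else int(lfn[len(base) + 1:])
        if base not in best or best[base][0] < att:
            if base in best:
                # a lower attempt was kept earlier: replace it
                del new_map[best[base][1]]
            best[base] = (att, guid)
            new_map[guid] = val
    return new_map

files = {'g1': {'lfn': 'EVNT.01234._000001.pool.root.1'},
         'g2': {'lfn': 'EVNT.01234._000001.pool.root.2'},
         'g3': {'lfn': 'EVNT.01234._000002.pool.root'}}
print(sorted(drop_duplicate_attempts(files)))   # ['g2', 'g3']
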
Example #7
    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50*1000*1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        workQueueName = workQueue.queue_name

        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                            workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
                     .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader+" "+msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']
        nWaiting_gs = jobstats_map['nWaiting_gs']

        # check if higher prio tasks are waiting
        if workQueue.queue_name in non_rt_wqs:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(msgHeader,
                                                                                                          highestPrioWaiting,
                                                                                                          highestPrioInPandaDB,
                                                                                                          nNotRunHighestPrio,
                                                                                                          highPrioQueued))
        # set maximum number of jobs to be submitted
        if workQueue.queue_name in non_rt_wqs:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
                tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(msgHeader, nQueueLimit,
                                                                           configRunningCap, configQueueCap))
        tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(msgHeader,
                                                                                             nNotRun_gs + nDefine_gs,
                                                                                             nDefine_gs, nRunning_gs))
        tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(msgHeader,
                                                                                                                nNotRun_rt + nDefine_rt,
                                                                                                                nDefine_rt, nRunning_rt,
                                                                                                                totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        limitPriority = False
        if workQueue.queue_name not in non_rt_wqs \
                and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                     nQueueLimit, totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs \
                and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                     nQueueLimit, totWalltime, minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
                and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(nNotRun_rt + nDefine_rt, nRunning_rt,
                                                                                                               threshold, nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                               nQueueLimit, totWalltime,
                                                                                                               minTotalWalltime)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
                and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
                (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(nNotRun_gs + nDefine_gs, nRunning_gs,
                                                                                                               threshold, nNotRun_queuelimit + nDefine_queuelimit,
                                                                                                               nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(nWaiting_rt, nRunning_rt, nWaitingLimit,
                                                                                                    nJobsInBunch, nWaitingBunchLimit)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running
            msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap, configRunningCap)
            tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
            tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(nNotRun_queuecap + nDefine_queuecap, configQueueCap)
                tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
                tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
                return self.retMergeUnThr

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                    or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
                tmpLog.debug(msgHeader+" not enough jobs queued")
                self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit/20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader+" "+msgBody)
        return self.retUnThrottled