def runImpl(self):
    """Feed dataset contents to the JEDI contents table for queued tasks.

    Repeatedly takes a batch of up to 10 ``(jediTaskID, dsList)`` pairs from
    ``self.taskDsList`` and returns once an empty batch is received.  For
    each task it:

    * loads the task spec and the JSON task parameters,
    * for every dataset in ``dsList`` gets the dataset metadata and the file
      list from the DDM interface (or builds dummy entries for pseudo
      datasets) and passes them to
      ``taskBufferIF.insertFilesForDataset_JEDI``,
    * finally sets the task to ``tobroken``, puts it on hold, or lets the
      task buffer pick the next status, and unlocks the task.

    Any exception escaping the per-batch processing is logged and the outer
    loop proceeds with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                datasetsIdxConsistency = []
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec is None:
                    self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                # make logger; gshare may not be a string, hence the fallback
                try:
                    gshare = '_'.join(taskSpec.gshare.split(' '))
                except Exception:
                    gshare = 'Undefined'
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    taskBroken = True
                    # bind an empty map so that the lookups below neither raise
                    # NameError nor silently reuse the previous task's
                    # parameters; the task is then reported as broken below
                    taskParamMap = {}
                # renaming of parameters
                if 'nEventsPerInputFile' in taskParamMap:
                    taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                # the number of files per job
                nFilesPerJob = taskSpec.getNumFilesPerJob()
                # the number of chunks used by scout
                nChunksForScout = 10
                # load XML (not looked up when the parameters could not be decoded)
                if taskSpec.useLoadXML() and not taskBroken:
                    xmlConfig = taskParamMap['loadXML']
                else:
                    xmlConfig = None
                # skip files used by another task
                skipFilesUsedBy = taskParamMap.get('skipFilesUsedBy')
                # check no wait
                noWaitParent = False
                parentOutDatasets = set()
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == 'running':
                        noWaitParent = True
                        # get output datasets from parent task
                        tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                            taskSpec.parent_tid, ['output', 'log'])
                        # collect dataset names
                        for tmpParentOutDataset in tmpParentOutDatasets:
                            parentOutDatasets.add(tmpParentOutDataset.datasetName)
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    # remember the requested nFiles since taskParamMap['nFiles']
                    # is decremented while master datasets are processed
                    origNumFiles = taskParamMap.get('nFiles')
                    for datasetSpec in dsList:
                        tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName, datasetSpec.datasetID))
                        # index consistency
                        if datasetSpec.indexConsistent():
                            datasetsIdxConsistency.append(datasetSpec.datasetID)
                        # get dataset metadata
                        tmpLog.debug('get metadata')
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state': 'closed'}
                            # set mutable when the dataset is open and parent is running
                            # or task is configured to run until the dataset is closed
                            if (noWaitParent or taskSpec.runUntilClosed()) and \
                                    (tmpMetadata['state'] == 'open'
                                     or datasetSpec.datasetName in parentOutDatasets
                                     or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                # dummy metadata when parent is running
                                tmpMetadata = {'state': 'mutable'}
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                        errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                if not taskSpec.ignoreMissingInDS():
                                    # temporary error
                                    taskOnHold = True
                                else:
                                    # ignore missing
                                    datasetStatus = 'failed'
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                            if not taskSpec.ignoreMissingInDS():
                                allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(taskParamMap,
                                                                                              datasetSpec.datasetName)
                            # get the number of events in metadata
                            getNumEvents = 'getNumEventsInMetadata' in taskParamMap
                            # get file list from DDM
                            tmpLog.debug('get files')
                            try:
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            datasetSpec.containerName not in ['', None]:
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    # use long format for LB
                                    longFormat = False
                                    if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                        longFormat = True
                                    tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                     getNumEvents=getNumEvents,
                                                                     skipDuplicate=skipDuplicate,
                                                                     longFormat=longFormat
                                                                     )
                                    tmpLog.debug('got {0} files in {1}'.format(len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles), tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if datasetSpec.isSeqNumber():
                                        # make dummy files for seq_number
                                        if datasetSpec.getNumRecords() is not None:
                                            nPFN = datasetSpec.getNumRecords()
                                        elif origNumFiles is not None:
                                            nPFN = origNumFiles
                                            if 'nEventsPerJob' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                            elif 'nEventsPerFile' in taskParamMap and 'nEventsPerRange' in taskParamMap:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                            nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                and taskSpec.getNumFilesPerJob() is not None:
                                            nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                        else:
                                            # the default number of records for seq_number
                                            seqDefNumRecords = 10000
                                            # get nFiles of the master
                                            tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                       datasetSpec.masterID,
                                                                                                       ['nFiles'])
                                            # use nFiles of the master as the number of records if it is larger than the default
                                            if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                nPFN = tmpMasterAtt['nFiles']
                                            else:
                                                nPFN = seqDefNumRecords
                                            # check usedBy
                                            # NOTE(review): applied only on this default-count path - confirm
                                            if skipFilesUsedBy is not None:
                                                for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                    tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                        tmpJediTaskID,
                                                        {'datasetName': datasetSpec.datasetName},
                                                        ['nFiles'])
                                                    if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                        nPFN += tmpParentAtt['nFiles']
                                        tmpRet = {}
                                        # get offset
                                        tmpOffset = datasetSpec.getOffset()
                                        tmpOffset += 1
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {'lfn': iPFN + tmpOffset,
                                                                         'scope': None,
                                                                         'filesize': 0,
                                                                         'checksum': None,
                                                                         }
                                    elif not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {str(uuid.uuid4()): {'lfn': 'pseudo_lfn',
                                                                      'scope': None,
                                                                      'filesize': 0,
                                                                      'checksum': None,
                                                                      }
                                                  }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                'lfn': '{0:06d}:{1}'.format(iPFN, taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                                }
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                             errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # parameters for master input
                                respectLB = False
                                useRealNumEvents = False
                                if datasetSpec.isMaster():
                                    # respect LB boundaries
                                    respectLB = taskSpec.respectLumiblock()
                                    # use real number of events
                                    useRealNumEvents = taskSpec.useRealNumEvents()
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                tgtNumEventsPerJob = None
                                if (datasetSpec.isMaster() and ('nEventsPerFile' in taskParamMap or useRealNumEvents)) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap and not datasetSpec.isSeqNumber()):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isMaster() and datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                    if 'tgtNumEventsPerJob' in taskParamMap:
                                        tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                        # reset nEventsPerJob
                                        nEventsPerJob = None
                                # max attempts
                                maxAttempt = None
                                maxFailure = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                    # max failure
                                    if 'maxFailure' in taskParamMap:
                                        maxFailure = taskParamMap['maxFailure']
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for event-level splitting
                                        if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and \
                                        (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and 'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # ramCount
                                ramCount = 0
                                # skip short input
                                if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                        and nEventsPerFile is not None and nEventsPerJob is not None \
                                        and nEventsPerFile >= nEventsPerJob \
                                        and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                    skipShortInput = True
                                else:
                                    skipShortInput = False
                                # feed files to the contents table
                                tmpLog.debug('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                    datasetSpec, tmpRet,
                                    tmpMetadata['state'],
                                    stateUpdateTime,
                                    nEventsPerFile,
                                    nEventsPerJob,
                                    maxAttempt,
                                    firstEventNumber,
                                    nMaxFiles,
                                    nMaxEvents,
                                    useScout,
                                    fileList,
                                    useFilesWithNewAttemptNr,
                                    nFilesPerJob,
                                    nEventsPerRange,
                                    nChunksForScout,
                                    includePatt,
                                    excludePatt,
                                    xmlConfig,
                                    noWaitParent,
                                    taskSpec.parent_tid,
                                    self.pid,
                                    maxFailure,
                                    useRealNumEvents,
                                    respectLB,
                                    tgtNumEventsPerJob,
                                    skipFilesUsedBy,
                                    ramCount,
                                    taskSpec,
                                    skipShortInput)
                                if retDB == False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                     diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB is None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.debug('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList), datasetSpec.datasetName)
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {'datasetSpec': datasetSpec,
                                                                           'missingFiles': missingFileList}
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                        and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                    tmpErrStr = 'insufficient inputs are ready. '
                                    tmpErrStr += diagMap['errMsg']
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                        tmpLog.debug('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # index consistency
                if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                    self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID, datasetsIdxConsistency)
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        # go to pending state
                        if taskSpec.status not in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid,
                                                                                     setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, getTaskStatus=True, pid=self.pid,
                            useWorldCloud=taskSpec.useWorldCloud())
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                else:
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock task with {0}'.format(retUnlock))
                tmpLog.debug('done')
        except:
            # keep the worker alive: report the failure and move on to the next batch
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
    """Decide whether job generation for a work queue must be throttled.

    Reads the configured queue limit, queue cap and running cap, the job
    statistics prepared by ``__prepareJobStats`` and the priorities of
    waiting tasks, then walks through a fixed series of checks.  Queues
    listed in ``non_rt_wqs`` are evaluated with the ``*_gs`` counters, all
    other queues with the ``*_rt`` counters.

    Side effects: calls ``setMaxNumJobs``, and depending on the outcome
    ``setMinPriority`` or ``notEnoughJobsQueued``.

    Returns ``self.retUnThrottled`` when submission may go on,
    ``self.retMergeUnThr`` when one of the SKIP conditions fires, and
    ``self.retTmpError`` when the priority of waiting tasks is unavailable.
    """
    # tuning parameters
    bunchCount = 4
    queuedPerRunning = 2.0
    bunchMax = 600
    bunchMin = 500
    walltimeFloor = 50 * 1000 * 1000
    waitingPerRunning = 4
    waitingPerBunch = 2
    parallelDefault = 2
    parallelForCap = 5

    # logger and common message prefix
    log = MsgWrapper(logger)
    queueID = workQueue.getID()
    queueLabel = '_'.join(workQueue.queue_name.split(' '))
    msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
        vo, prodSourceLabel, cloudName, queueLabel, resource_name)
    log.debug('{0} start workQueueID={1}'.format(msgHeader, queueID))

    def skip(body):
        # report a SKIP decision as a warning (log + message) and give the throttled code
        line = "{0} {1}".format(msgHeader, body)
        log.warning(line)
        log.sendMsg(line, self.msgType, msgLevel='warning', escapeChar=True)
        return self.retMergeUnThr

    # central configuration values
    config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
    configQueueLimit = config_map[NQUEUELIMIT]['value']
    configQueueCap = config_map[NQUEUECAP]['value']
    configRunningCap = config_map[NRUNNINGCAP]['value']
    log.debug(
        msgHeader +
        ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
        .format(configQueueLimit, configQueueCap, configRunningCap))

    # nothing to do for unthrottled queues
    if not workQueue.throttled:
        log.info(msgHeader + " " + "PASS unthrottled since GS_throttled is False")
        return self.retUnThrottled

    # job statistics for this work queue / global share
    jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
    (nRunning_rt, nRunning_gs, nRunning_runningcap,
     nNotRun_rt, nNotRun_gs, nNotRun_queuelimit, nNotRun_queuecap,
     nDefine_rt, nDefine_gs, nDefine_queuelimit, nDefine_queuecap,
     nWaiting_rt, nWaiting_gs) = [
        jobstats_map[key] for key in (
            'nRunning_rt', 'nRunning_gs', 'nRunning_runningcap',
            'nNotRun_rt', 'nNotRun_gs', 'nNotRun_queuelimit', 'nNotRun_queuecap',
            'nDefine_rt', 'nDefine_gs', 'nDefine_queuelimit', 'nDefine_queuecap',
            'nWaiting_rt', 'nWaiting_gs')]

    # the resource type is passed on only for queues outside non_rt_wqs
    isNonRT = workQueue.queue_name in non_rt_wqs
    if isNonRT:
        rtArgs = ()
    else:
        rtArgs = (resource_name,)
    # highest priority of currently defined jobs
    _, prioStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
        'managed', cloudName, workQueue, *rtArgs)
    # highest priority of waiting tasks
    topPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
        vo, workQueue, 'managed', cloudName, *rtArgs)
    topPrioInDB = prioStat['highestPrio']
    nNotRunTopPrio = prioStat['nNotRun']
    if topPrioWaiting is None:
        log.error("{0} {1}".format(
            msgHeader, 'failed to get the highest priority of waiting tasks'))
        return self.retTmpError

    # are high priority tasks waiting?
    highPrioQueued = bool(
        topPrioWaiting > topPrioInDB
        or (topPrioWaiting == topPrioInDB and nNotRunTopPrio < bunchMin))
    log.debug(
        "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
        .format(msgHeader, topPrioWaiting, topPrioInDB, nNotRunTopPrio, highPrioQueued))

    # maximum number of jobs to be submitted
    if isNonRT:
        freeSlots = int(nRunning_gs * queuedPerRunning - nNotRun_gs)
    else:
        freeSlots = int(nRunning_rt * queuedPerRunning - nNotRun_rt)
    # the lower bound avoids creating too many _sub/_dis datasets
    nJobsInBunch = min(max(bunchMin, freeSlots), bunchMax)
    nQueueLimit = configQueueLimit if configQueueLimit is not None else nJobsInBunch * bunchCount

    # enlarge the bunch for reprocessing-like queues when the queue has room
    if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default'] \
            and nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
        freeSlots = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
        if freeSlots > nJobsInBunch:
            nJobsInBunch = min(freeSlots, bunchMax)

    # number of jobs to be submitted
    if configQueueCap is None:
        self.setMaxNumJobs(nJobsInBunch / parallelDefault)
    else:
        self.setMaxNumJobs(configQueueCap / parallelForCap)

    # total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
        vo, prodSourceLabel, workQueue, resource_name, cloudName)

    # log the current situation and limits
    log.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
        msgHeader, nQueueLimit, configRunningCap, configQueueCap))
    log.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(
        msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs, nRunning_gs))
    log.info(
        "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
        .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt, nRunning_rt, totWalltime))

    # Throttling checks.  Only the first matching check is considered.  All
    # but the running-cap check merely restrict the priority when high
    # priority tasks are waiting, and skip otherwise.
    limitPriority = False
    if not isNonRT \
            and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > walltimeFloor):
        # pilot is not running or DDM has a problem
        limitPriority = True
        if not highPrioQueued:
            return skip(
                "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, walltimeFloor))
    elif isNonRT \
            and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        # pilot is not running or DDM has a problem
        limitPriority = True
        if not highPrioQueued:
            return skip(
                "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, walltimeFloor))
    elif not isNonRT and nRunning_rt != 0 \
            and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > queuedPerRunning \
            and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > walltimeFloor):
        # enough jobs in Panda
        limitPriority = True
        if not highPrioQueued:
            return skip(
                "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nNotRun_rt + nDefine_rt, nRunning_rt, queuedPerRunning,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                    totWalltime, walltimeFloor))
    elif isNonRT and nRunning_gs != 0 \
            and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > queuedPerRunning \
            and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        # enough jobs in Panda
        limitPriority = True
        if not highPrioQueued:
            return skip(
                "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nNotRun_gs + nDefine_gs, nRunning_gs, queuedPerRunning,
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit))
    elif nDefine_queuelimit > nQueueLimit:
        # brokerage is stuck
        limitPriority = True
        if not highPrioQueued:
            return skip("SKIP too many nDefined_queuelimit({0})>{1}".format(
                nDefine_queuelimit, nQueueLimit))
    elif nWaiting_rt > max(nRunning_rt * waitingPerRunning, nJobsInBunch * waitingPerBunch):
        # too many waiting
        limitPriority = True
        if not highPrioQueued:
            return skip(
                "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                    nWaiting_rt, nRunning_rt, waitingPerRunning, nJobsInBunch, waitingPerBunch))
    elif configRunningCap and nRunning_runningcap > configRunningCap:
        # cap on running: skipped regardless of waiting priorities
        return skip("SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
            nRunning_runningcap, configRunningCap))
    elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
        # cap on queued
        limitPriority = True
        if not highPrioQueued:
            return skip("SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                nNotRun_queuecap + nDefine_queuecap, configQueueCap))

    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        limitPriorityValue = topPrioWaiting
        self.setMinPriority(limitPriorityValue)
    elif (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
            or (isNonRT and nNotRun_gs + nDefine_gs < nRunning_gs) \
            or (not isNonRT and nNotRun_rt + nDefine_rt < nRunning_rt):
        # not enough jobs are queued
        log.debug(msgHeader + " not enough jobs queued")
        if not isNonRT:
            self.notEnoughJobsQueued()
        self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

    log.info(msgHeader + " " + "PASS - priority limit={0} maxNumJobs={1}".format(
        limitPriorityValue, self.maxNumJobs))
    return self.retUnThrottled
def runImpl(self):
    """Execute queued task commands until the shared task list is drained.

    Repeatedly takes a batch of up to 10 ``(jediTaskID, commandMap)`` pairs
    from ``self.taskList`` and returns once an empty batch is received.
    ``commandMap`` must provide the keys ``command``, ``comment`` and
    ``oldStatus``.

    * ``kill`` / ``finish`` / ``reassign``: looks up the PandaIDs of the
      task (up to two passes).  If none are left the task record is updated;
      otherwise the jobs are killed, or simply waited for when the comment
      contains ``soft finish``.
    * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters are
      first patched with the JSON carried in the comment, then
      ``retryTask_JEDI`` is called.
    * anything else is logged as an unknown command.

    Any exception escaping the per-batch processing is logged together with
    its traceback and the outer loop proceeds with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site
                                    # the comment is read as "<cloud|other>:<value>:<y|...>"
                                    if commentStr is not None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID})
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                # (the comment may be None, which must not abort the whole batch)
                                if commentStr is not None and 'soft finish' in commentStr:
                                    tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['reassign', 'finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and \
                                            re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+',
                                               '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                            continue
                    # retry failed files
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            # keep the worker alive: report the failure and move on to the next batch
            errtype, errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def runImpl(self):
    """Feed the contents of input datasets into JEDI, batch by batch.

    Repeatedly takes up to ten ``(jediTaskID, dsList)`` entries from
    ``self.taskDsList`` and returns once that list is exhausted.  For each
    task the method:

    * reads the task spec and decodes the JSON task parameters,
    * for every dataset in ``dsList`` gets the dataset metadata and the
      file list (from the DDM interface, or a synthesized dummy list for
      pseudo datasets) and hands them to ``insertFilesForDataset_JEDI``,
    * finally marks the task ``tobroken``, puts it on hold, or reports that
      all datasets were updated, and unlocks the task.

    An exception escaping the per-task processing is logged with the
    module-level ``logger`` and the loop continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                    jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec is None:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                # Start from an empty map: if decoding fails the look-ups
                # below must neither hit an unbound name nor silently reuse
                # the parameters of the previously processed task.
                taskParamMap = {}
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('task param conversion from json failed with {0}:{1}'.format(
                        errtype.__name__, errvalue))
                    taskBroken = True
                    # nothing was fed, so do not report the task as fully updated
                    allUpdated = False
                # renaming of parameters
                if 'nEventsPerInputFile' in taskParamMap:
                    taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                # the number of files per job
                nFilesPerJob = None
                if 'nFilesPerJob' in taskParamMap:
                    nFilesPerJob = taskParamMap['nFilesPerJob']
                # the number of chunks used by scout
                nChunksForScout = 10
                # load XML (skipped when the parameters could not be decoded)
                if taskSpec.useLoadXML() and not taskBroken:
                    xmlConfig = taskParamMap['loadXML']
                else:
                    xmlConfig = None
                # skip files used by another task
                if 'skipFilesUsedBy' in taskParamMap:
                    skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                else:
                    skipFilesUsedBy = None
                # check no wait
                noWaitParent = False
                parentOutDatasets = set()
                if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == 'running':
                        noWaitParent = True
                        # get output datasets from parent task
                        tmpParentStat, tmpParentOutDatasets = \
                            self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                        # collect dataset names
                        for tmpParentOutDataset in tmpParentOutDatasets:
                            parentOutDatasets.add(tmpParentOutDataset.datasetName)
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    origNumFiles = None
                    if 'nFiles' in taskParamMap:
                        origNumFiles = taskParamMap['nFiles']
                    for datasetSpec in dsList:
                        tmpLog.debug('start loop for {0}(id={1})'.format(
                            datasetSpec.datasetName, datasetSpec.datasetID))
                        # get dataset metadata
                        tmpLog.debug('get metadata')
                        gotMetadata = False
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state': 'closed'}
                            # set mutable when the dataset is open and parent is running,
                            # or task is configured to run until the dataset is closed
                            if (noWaitParent or taskSpec.runUntilClosed()) and \
                                    (tmpMetadata['state'] == 'open'
                                     or datasetSpec.datasetName in parentOutDatasets
                                     or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                # dummy metadata when parent is running
                                tmpMetadata = {'state': 'mutable'}
                            gotMetadata = True
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(
                                self.__class__.__name__, errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                if not taskSpec.ignoreMissingInDS():
                                    # temporary error
                                    taskOnHold = True
                                else:
                                    # ignore missing
                                    datasetStatus = 'failed'
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(
                                datasetSpec.datasetName))
                            if not taskSpec.ignoreMissingInDS():
                                allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                taskParamMap, datasetSpec.datasetName)
                            # get the number of events in metadata
                            if 'getNumEventsInMetadata' in taskParamMap:
                                getNumEvents = True
                            else:
                                getNumEvents = False
                            # get file list from DDM
                            tmpLog.debug('get files')
                            try:
                                useInFilesWithNewAttemptNr = False
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            not datasetSpec.containerName in ['', None]:
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    # use long format for LB
                                    longFormat = False
                                    if taskSpec.respectLumiblock():
                                        longFormat = True
                                    tmpRet = ddmIF.getFilesInDataset(
                                        tmpDatasetName,
                                        getNumEvents=getNumEvents,
                                        skipDuplicate=skipDuplicate,
                                        longFormat=longFormat)
                                    tmpLog.debug('got {0} files in {1}'.format(
                                        len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.debug('found {0} lost files in {1}'.format(
                                            len(tmpLostFiles), tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                            tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if datasetSpec.isSeqNumber():
                                        # make dummy files for seq_number
                                        if datasetSpec.getNumRecords() is not None:
                                            nPFN = datasetSpec.getNumRecords()
                                        elif origNumFiles is not None:
                                            nPFN = origNumFiles
                                            if 'nEventsPerJob' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                            elif 'nEventsPerFile' in taskParamMap and 'nEventsPerRange' in taskParamMap:
                                                nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                            nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                        elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                and 'nFilesPerJob' in taskParamMap:
                                            nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskParamMap['nFilesPerJob']
                                        else:
                                            # the default number of records for seq_number
                                            seqDefNumRecords = 10000
                                            # get nFiles of the master
                                            tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                datasetSpec.jediTaskID,
                                                datasetSpec.masterID,
                                                ['nFiles'])
                                            # use nFiles of the master as the number of records if it is larger than the default
                                            if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                nPFN = tmpMasterAtt['nFiles']
                                            else:
                                                nPFN = seqDefNumRecords
                                            # check usedBy
                                            if skipFilesUsedBy is not None:
                                                for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                    tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                        tmpJediTaskID,
                                                        {'datasetName': datasetSpec.datasetName},
                                                        ['nFiles'])
                                                    if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                        nPFN += tmpParentAtt['nFiles']
                                        tmpRet = {}
                                        # get offset
                                        tmpOffset = datasetSpec.getOffset()
                                        tmpOffset += 1
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                'lfn': iPFN + tmpOffset,
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                                    elif not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {
                                            str(uuid.uuid4()): {
                                                'lfn': 'pseudo_lfn',
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                                        }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                'lfn': '{0:06d}:{1}'.format(
                                                    iPFN,
                                                    taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error('failed to get files due to {0}:{1} {2}'.format(
                                    self.__class__.__name__, errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(
                                    datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # parameters for master input
                                respectLB = False
                                useRealNumEvents = False
                                if datasetSpec.isMaster():
                                    # respect LB boundaries
                                    respectLB = taskSpec.respectLumiblock()
                                    # use real number of events
                                    useRealNumEvents = taskSpec.useRealNumEvents()
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                tgtNumEventsPerJob = None
                                if (datasetSpec.isMaster() and ('nEventsPerFile' in taskParamMap or useRealNumEvents)) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap and not datasetSpec.isSeqNumber()):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isMaster() and datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                    if 'tgtNumEventsPerJob' in taskParamMap:
                                        tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                        # reset nEventsPerJob
                                        nEventsPerJob = None
                                # max attempts
                                maxAttempt = None
                                maxFailure = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                    # max failure
                                    if 'maxFailure' in taskParamMap:
                                        maxFailure = taskParamMap['maxFailure']
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for event-level splitting
                                        if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / \
                                                        float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / \
                                                        float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and \
                                        (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and \
                                        'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # ramCount
                                ramCount = 0
                                # feed files to the contents table
                                tmpLog.debug('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = \
                                    self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec,
                                        tmpRet,
                                        tmpMetadata['state'],
                                        stateUpdateTime,
                                        nEventsPerFile,
                                        nEventsPerJob,
                                        maxAttempt,
                                        firstEventNumber,
                                        nMaxFiles,
                                        nMaxEvents,
                                        useScout,
                                        fileList,
                                        useFilesWithNewAttemptNr,
                                        nFilesPerJob,
                                        nEventsPerRange,
                                        nChunksForScout,
                                        includePatt,
                                        excludePatt,
                                        xmlConfig,
                                        noWaitParent,
                                        taskSpec.parent_tid,
                                        self.pid,
                                        maxFailure,
                                        useRealNumEvents,
                                        respectLB,
                                        tgtNumEventsPerJob,
                                        skipFilesUsedBy,
                                        ramCount)
                                if retDB == False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(
                                        datasetSpec.datasetName, diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB is None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.debug('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(
                                        len(missingFileList), datasetSpec.datasetName)
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {
                                        'datasetSpec': datasetSpec,
                                        'missingFiles': missingFileList
                                    }
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 \
                                        and not (useScout and nChunksForScout == 0) \
                                        and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                    tmpErrStr = 'insufficient inputs are ready. '
                                    tmpErrStr += diagMap['errMsg']
                                    tmpLog.debug(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                        tmpLog.debug('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                        jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        # go to pending state
                        if not taskSpec.status in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid, setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID,
                            getTaskStatus=True,
                            pid=self.pid,
                            useWorldCloud=taskSpec.useWorldCloud())
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                else:
                    # just unlock
                    retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID, self.pid)
                    tmpLog.debug('unlock task with {0}'.format(retUnlock))
                tmpLog.debug('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Refine tasks in batches and register the result in JEDI.

    Repeatedly takes up to ten ``(jediTaskID, splitRule, taskStatus,
    parent_tid)`` entries from ``self.taskList`` and returns once the list
    is exhausted.  Each task goes through these stages, and a stage is
    skipped as soon as an earlier one has failed:

    1. decode the JSON task parameters,
    2. instantiate the refiner implementation for the task's
       vo / prodSourceLabel / taskType,
    3. extract the common parameters,
    4. check the parent task (the task is put on hold and skipped when the
       parent is still running and the task must wait for it),
    5. refine,
    6. register the refined task, or mark the task ``tobroken`` on failure.

    An exception escaping the per-task processing is logged with the
    module-level ``logger`` and the loop continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # Reset per-task state.  Without this, a failure before the
                # refiner is instantiated would make the failure branch below
                # pick up the refiner (and task spec) left over from the
                # previous task and mark that other task as broken.
                impl = None
                taskParamMap = None
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(jediTaskID))
                tmpLog.info('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # read task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(
                        errtype.__name__, errvalue)
                    tmpLog.error(errStr)
                    tmpStat = Interaction.SC_FAILED
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(
                            vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(
                            vo, prodSourceLabel, taskType,
                            self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(
                                vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(
                            errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1}'.format(
                            errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                noWaitParent = False
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if not parent_tid in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            # tmpStat temporarily holds the parent's status string
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(
                                        impl.taskSpec,
                                        {'jediTaskID': impl.taskSpec.jediTaskID})
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(
                                errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # no wait for parent
                        if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            errStr = 'pending until parent produces input'
                            tmpLog.info(errStr)
                            self.taskBufferIF.updateTask_JEDI(
                                impl.taskSpec,
                                {'jediTaskID': impl.taskSpec.jediTaskID})
                            continue
                        else:
                            # errStr is what gets stored on the task; the log
                            # additionally records the underlying exception
                            errStr = 'failed to refine task'
                            tmpLog.error('{0} with {1}:{2}'.format(
                                errStr, errtype.__name__, errvalue))
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        # no refiner/spec for this task: update via a bare spec
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(
                        tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID})
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(
                                jediTaskID,
                                impl.taskSpec,
                                impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList,
                                impl.outDatasetSpecList,
                                impl.outputTemplateMap,
                                impl.jobParamsTemplate,
                                strTaskParams,
                                impl.unmergeMasterDatasetSpec,
                                impl.unmergeDatasetSpecMap,
                                uniqueTaskName)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = 'tobroken'
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(
                                    impl.taskSpec,
                                    {'jediTaskID': impl.taskSpec.jediTaskID})
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(
                                jediTaskID,
                                impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(
                            errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
def doForPriorityMassage(self):
    """Throttle heavy users and boost the priority of under-served ones.

    Runs only when the ``AtlasAnalWatchDog.doForPriorityMassage`` process
    lock can be taken.  It then works in two phases on the usage breakdown
    returned by ``getUsageBreakdown_JEDI``:

    * cap: for each (user, working group) compare the number of running
      jobs and running cores with the ``prio_mgr`` configuration caps
      (10000 when a cap is not configured); throttle the user's jobs when
      a cap is reached and release them again once both numbers are below
      90% of the caps and the user is currently throttled,
    * boost: for each (user, working group) whose run/done job count is
      below the global average, raise ``currentPriority`` of activated
      jobs at sites where the user is also below the site average.

    Failures in the cap phase are logged and do not prevent the boost
    phase; any other exception is logged and ends the method.
    """
    tmpLog = MsgWrapper(logger, ' #ATM #KV doForPriorityMassage label=user')
    tmpLog.debug('start')
    # lock
    got_lock = self.taskBufferIF.lockProcess_JEDI(
        vo=self.vo,
        prodSourceLabel=self.prodSourceLabel,
        cloud=None,
        workqueue_id=None,
        resource_name=None,
        component='AtlasAnalWatchDog.doForPriorityMassage',
        pid=self.pid,
        timeLimit=6)
    if not got_lock:
        tmpLog.debug('locked by another process. Skipped')
        return
    try:
        # get usage breakdown
        usageBreakDownPerUser, usageBreakDownPerSite = \
            self.taskBufferIF.getUsageBreakdown_JEDI(self.prodSourceLabel)
        # get total number of users and running/done jobs
        totalUsers = 0
        totalRunDone = 0
        usersTotalJobs = {}
        usersTotalCores = {}
        for prodUserName in usageBreakDownPerUser:
            wgValMap = usageBreakDownPerUser[prodUserName]
            for workingGroup in wgValMap:
                siteValMap = wgValMap[workingGroup]
                # each (user, working group) pair counts as one user
                totalUsers += 1
                for computingSite in siteValMap:
                    statValMap = siteValMap[computingSite]
                    totalRunDone += statValMap['rundone']
                    usersTotalJobs.setdefault(prodUserName, {})
                    usersTotalJobs[prodUserName].setdefault(workingGroup, 0)
                    usersTotalJobs[prodUserName][workingGroup] += statValMap['running']
                    usersTotalCores.setdefault(prodUserName, {})
                    usersTotalCores[prodUserName].setdefault(workingGroup, 0)
                    usersTotalCores[prodUserName][workingGroup] += statValMap['runcores']
        tmpLog.debug('total {0} users, {1} RunDone jobs'.format(totalUsers, totalRunDone))
        # skip if no user
        if totalUsers == 0:
            tmpLog.debug('no user. Skipped...')
            return
        # cap num of running jobs
        tmpLog.debug('cap running jobs')
        prodUserName = None
        maxNumRunPerUser = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_USER_JOBS')
        maxNumRunPerGroup = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_GROUP_JOBS')
        maxNumCorePerUser = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_USER_CORES')
        maxNumCorePerGroup = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_GROUP_CORES')
        if maxNumRunPerUser is None:
            maxNumRunPerUser = 10000
        if maxNumRunPerGroup is None:
            maxNumRunPerGroup = 10000
        if maxNumCorePerUser is None:
            maxNumCorePerUser = 10000
        if maxNumCorePerGroup is None:
            maxNumCorePerGroup = 10000
        try:
            throttledUsers = self.taskBufferIF.getThrottledUsers()
            for prodUserName in usersTotalJobs:
                for workingGroup in usersTotalJobs[prodUserName]:
                    tmpNumTotalJobs = usersTotalJobs[prodUserName][workingGroup]
                    tmpNumTotalCores = usersTotalCores[prodUserName][workingGroup]
                    # user caps apply without a working group, group caps otherwise
                    if workingGroup is None:
                        maxNumRun = maxNumRunPerUser
                        maxNumCore = maxNumCorePerUser
                    else:
                        maxNumRun = maxNumRunPerGroup
                        maxNumCore = maxNumCorePerGroup
                    if tmpNumTotalJobs >= maxNumRun or tmpNumTotalCores >= maxNumCore:
                        # throttle user
                        tmpNumJobs = self.taskBufferIF.throttleUserJobs(
                            prodUserName, workingGroup, get_dict=True)
                        if tmpNumJobs is not None:
                            for tmpJediTaskID, tmpNumJob in iteritems(tmpNumJobs):
                                # one placeholder per argument so that each value
                                # is reported under its own label
                                msg = ('throttled {} jobs in jediTaskID={} for user="{}" group={} '
                                       'since too many running jobs ({} > {}) or cores ({} > {}) ').format(
                                    tmpNumJob, tmpJediTaskID, prodUserName, workingGroup,
                                    tmpNumTotalJobs, maxNumRun, tmpNumTotalCores, maxNumCore)
                                tmpLog.debug(msg)
                                tmpLog.sendMsg(msg, 'userCap', msgLevel='warning')
                    elif tmpNumTotalJobs < maxNumRun*0.9 and tmpNumTotalCores < maxNumCore*0.9 and \
                            (prodUserName, workingGroup) in throttledUsers:
                        # unthrottle user
                        tmpNumJobs = self.taskBufferIF.unThrottleUserJobs(
                            prodUserName, workingGroup, get_dict=True)
                        if tmpNumJobs is not None:
                            for tmpJediTaskID, tmpNumJob in iteritems(tmpNumJobs):
                                msg = ('released {} jobs in jediTaskID={} for user="{}" group={} '
                                       'since number of running jobs and cores are less than {} and {}').format(
                                    tmpNumJob, tmpJediTaskID, prodUserName, workingGroup,
                                    maxNumRun, maxNumCore)
                                tmpLog.debug(msg)
                                tmpLog.sendMsg(msg, 'userCap')
        except Exception as e:
            errStr = "cap failed for %s : %s" % (prodUserName, str(e))
            # str.strip() returns a new string; keep the result
            errStr = errStr.strip()
            errStr += traceback.format_exc()
            tmpLog.error(errStr)
        # to boost
        tmpLog.debug('boost jobs')
        # global average
        globalAverageRunDone = float(totalRunDone)/float(totalUsers)
        tmpLog.debug('global average: {0}'.format(globalAverageRunDone))
        # count the number of users and run/done jobs for each site
        siteRunDone = {}
        siteUsers = {}
        for computingSite in usageBreakDownPerSite:
            userValMap = usageBreakDownPerSite[computingSite]
            for prodUserName in userValMap:
                wgValMap = userValMap[prodUserName]
                for workingGroup in wgValMap:
                    statValMap = wgValMap[workingGroup]
                    # count the number of users and running/done jobs
                    siteUsers.setdefault(computingSite, 0)
                    siteUsers[computingSite] += 1
                    siteRunDone.setdefault(computingSite, 0)
                    siteRunDone[computingSite] += statValMap['rundone']
        # get site average
        tmpLog.debug('site average')
        siteAverageRunDone = {}
        for computingSite in siteRunDone:
            nRunDone = siteRunDone[computingSite]
            siteAverageRunDone[computingSite] = float(nRunDone)/float(siteUsers[computingSite])
            tmpLog.debug(" %-25s : %s" % (computingSite,siteAverageRunDone[computingSite]))
        # check if the number of user's jobs is lower than the average
        for prodUserName in usageBreakDownPerUser:
            wgValMap = usageBreakDownPerUser[prodUserName]
            for workingGroup in wgValMap:
                tmpLog.debug("---> %s group=%s" % (prodUserName, workingGroup))
                # count the number of running/done jobs
                userTotalRunDone = 0
                for computingSite in wgValMap[workingGroup]:
                    statValMap = wgValMap[workingGroup][computingSite]
                    userTotalRunDone += statValMap['rundone']
                # no priority boost when the number of jobs is higher than the average
                if userTotalRunDone >= globalAverageRunDone:
                    tmpLog.debug("enough running %s > %s (global average)" % (userTotalRunDone,globalAverageRunDone))
                    continue
                tmpLog.debug("user total:%s global average:%s" % (userTotalRunDone,globalAverageRunDone))
                # check with site average
                toBeBoostedSites = []
                for computingSite in wgValMap[workingGroup]:
                    statValMap = wgValMap[workingGroup][computingSite]
                    # the number of running/done jobs is lower than the average and activated jobs are waiting
                    if statValMap['rundone'] >= siteAverageRunDone[computingSite]:
                        tmpLog.debug("enough running %s > %s (site average) at %s" % \
                                     (statValMap['rundone'],siteAverageRunDone[computingSite],computingSite))
                    elif statValMap['activated'] == 0:
                        tmpLog.debug("no activated jobs at %s" % computingSite)
                    else:
                        toBeBoostedSites.append(computingSite)
                # no boost is required
                if toBeBoostedSites == []:
                    tmpLog.debug("no sites to be boosted")
                    continue
                # check special prioritized site
                siteAccessForUser = {}
                varMap = {}
                varMap[':dn'] = prodUserName
                sql = "SELECT pandaSite,pOffset,status,workingGroups FROM ATLAS_PANDAMETA.siteAccess WHERE dn=:dn"
                res = self.taskBufferIF.querySQL(sql, varMap, arraySize=10000)
                if res is not None:
                    for pandaSite, pOffset, pStatus, workingGroups in res:
                        # ignore special working group for now
                        if workingGroups not in ['', None]:
                            continue
                        # only approved sites
                        if pStatus != 'approved':
                            continue
                        # no priority boost
                        if pOffset == 0:
                            continue
                        # append
                        siteAccessForUser[pandaSite] = pOffset
                # set weight
                totalW = 0
                defaultW = 100
                for computingSite in toBeBoostedSites:
                    totalW += defaultW
                    if computingSite in siteAccessForUser:
                        totalW += siteAccessForUser[computingSite]
                totalW = float(totalW)
                # the total number of jobs to be boosted
                numBoostedJobs = globalAverageRunDone - float(userTotalRunDone)
                # get quota
                quotaFactor = 1.0 + self.taskBufferIF.checkQuota(prodUserName)
                tmpLog.debug("quota factor:%s" % quotaFactor)
                # make priority boost
                nJobsPerPrioUnit = 5
                highestPrio = 1000
                for computingSite in toBeBoostedSites:
                    weight = float(defaultW)
                    if computingSite in siteAccessForUser:
                        weight += float(siteAccessForUser[computingSite])
                    weight /= totalW
                    # the number of boosted jobs at the site
                    numBoostedJobsSite = int(numBoostedJobs * weight / quotaFactor)
                    tmpLog.debug("nSite:%s nAll:%s W:%s Q:%s at %s" % (numBoostedJobsSite, numBoostedJobs, weight, quotaFactor, computingSite))
                    # Whole priority units covered by the boosted jobs.  Floor
                    # division keeps this an integer regardless of the
                    # interpreter's "/" semantics.
                    nPrioUnits = numBoostedJobsSite // nJobsPerPrioUnit
                    if nPrioUnits == 0:
                        tmpLog.debug("too small number of jobs %s to be boosted at %s" % (numBoostedJobsSite, computingSite))
                        continue
                    # get the highest prio of activated jobs at the site
                    varMap = {}
                    varMap[':jobStatus'] = 'activated'
                    varMap[':prodSourceLabel'] = self.prodSourceLabel
                    varMap[':pmerge'] = 'pmerge'
                    varMap[':prodUserName'] = prodUserName
                    varMap[':computingSite'] = computingSite
                    sql = "SELECT MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 "
                    sql += "WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus AND computingSite=:computingSite "
                    sql += "AND processingType<>:pmerge AND prodUserName=:prodUserName "
                    if workingGroup is not None:
                        varMap[':workingGroup'] = workingGroup
                        sql += "AND workingGroup=:workingGroup "
                    else:
                        sql += "AND workingGroup IS NULL "
                    res = self.taskBufferIF.querySQL(sql, varMap, arraySize=10)
                    maxPrio = None
                    if res is not None:
                        try:
                            maxPrio = res[0][0]
                        except Exception:
                            # empty result: handled by the maxPrio check below
                            pass
                    if maxPrio is None:
                        tmpLog.debug("cannot get the highest prio at %s" % computingSite)
                        continue
                    # delta for priority boost
                    prioDelta = highestPrio - maxPrio
                    # already boosted
                    if prioDelta <= 0:
                        tmpLog.debug("already boosted (prio=%s) at %s" % (maxPrio,computingSite))
                        continue
                    # lower limit
                    minPrio = maxPrio - nPrioUnits
                    # SQL for priority boost
                    varMap = {}
                    varMap[':jobStatus'] = 'activated'
                    varMap[':prodSourceLabel'] = self.prodSourceLabel
                    varMap[':prodUserName'] = prodUserName
                    varMap[':computingSite'] = computingSite
                    varMap[':prioDelta'] = prioDelta
                    varMap[':maxPrio'] = maxPrio
                    varMap[':minPrio'] = minPrio
                    varMap[':rlimit'] = numBoostedJobsSite
                    sql = "UPDATE ATLAS_PANDA.jobsActive4 SET currentPriority=currentPriority+:prioDelta "
                    sql += "WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName "
                    if workingGroup is not None:
                        varMap[':workingGroup'] = workingGroup
                        sql += "AND workingGroup=:workingGroup "
                    else:
                        sql += "AND workingGroup IS NULL "
                    sql += "AND jobStatus=:jobStatus AND computingSite=:computingSite AND currentPriority>:minPrio "
                    sql += "AND currentPriority<=:maxPrio AND rownum<=:rlimit"
                    tmpLog.debug("boost %s" % str(varMap))
                    res = self.taskBufferIF.querySQL(sql, varMap, arraySize=10)
                    tmpLog.debug(" database return : %s" % res)
        # done
        tmpLog.debug('done')
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))
def runImpl(self):
    """Choose a nucleus for every queued task and record the choice.

    Tunable thresholds are read once from the 'jedi'/'atlas' configuration
    (falling back to built-in defaults), then (taskSpec, inputChunk) pairs
    are pulled one at a time from ``self.inputList`` until it is empty.

    For each task:

    * if ``taskSpec.nucleus`` is already a known nucleus or satellite, it is
      used as is;
    * otherwise the candidate nuclei are filtered by status, transfer
      backlog, output endpoint space, ability to run jobs and input data
      locality, weighted, and one of them is drawn at random in proportion
      to its weight.

    The selected nucleus is stored with ``setCloudToTasks_JEDI`` and the
    task's RW is added to ``self.prioRW`` for every priority not higher than
    the task's current priority.

    Returns None once ``self.inputList`` yields no more entries. Any
    exception raised while handling a batch is logged and the loop goes on
    with the next batch.
    """

    def config_value(key, default):
        """Return the configured value for ``key`` or ``default`` if unset."""
        value = self.taskBufferIF.getConfigValue(self.msgType, key,
                                                 'jedi', 'atlas')
        return default if value is None else value

    def report_failure(task_log, task_spec, message):
        """Log ``message`` as an error and publish the task log."""
        task_log.error(message)
        task_spec.setErrDiag(task_log.uploadLog(task_spec.jediTaskID))
        self.sendLogMessage(task_log)

    # disk cutoff; the value is logged below as GB and divided by 1024 when
    # reported as TB
    diskThreshold = config_value(
        'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 100 * 1024)
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # thresholds for data availability check
    thrInputSize = config_value('INPUT_SIZE_THRESHOLD', 1)
    thrInputSize *= 1024 * 1024 * 1024
    thrInputNum = config_value('INPUT_NUM_THRESHOLD', 100)
    # fractions are configured as percentages
    thrInputSizeFrac = float(config_value('INPUT_SIZE_FRACTION', 10)) / 100
    thrInputNumFrac = float(config_value('INPUT_NUM_FRACTION', 10)) / 100
    cutOffRW = 50
    negWeightTape = 0.001
    minIoIntensityWithLD = config_value('MIN_IO_INTENSITY_WITH_LOCAL_DATA',
                                        200)
    minInputSizeWithLD = config_value('MIN_INPUT_SIZE_WITH_LOCAL_DATA',
                                      10000)
    maxTaskPrioWithLD = config_value('MAX_TASK_PRIO_WITH_LOCAL_DATA', 800)
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug(
                    '{0} terminating after processing {1} tasks since no more inputs '
                    .format(self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec, inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(
                    self.logger,
                    '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                    monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                tmpLog.info(
                    'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'
                    .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                            thrInputNumFrac))
                # read task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                        taskSpec.jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    report_failure(tmpLog, taskSpec,
                                   'failed to read task params')
                    continue
                # RW
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                    taskSpec.jediTaskID)
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in siteMapper.nuclei:
                    candidateNucleus = taskSpec.nucleus
                elif taskSpec.nucleus in siteMapper.satellites:
                    nucleusList = siteMapper.satellites
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.info('got {0} candidates'.format(
                        len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleusSpec.state not in ['ACTIVE']:
                            tmpLog.info(
                                ' skip nucleus={0} due to status={1} criteria=-status'
                                .format(tmpNucleus, tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed status check'.format(
                        len(nucleusList)))
                    if nucleusList == {}:
                        report_failure(tmpLog, taskSpec, 'no candidates')
                        continue
                    ######################################
                    # check status of transfer backlog
                    t1Weight = taskSpec.getT1Weight()
                    if t1Weight < 0:
                        tmpLog.info(
                            'skip transfer backlog check due to negative T1Weight'
                        )
                    else:
                        newNucleusList = {}
                        backlogged_nuclei = \
                            self.taskBufferIF.getBackloggedNuclei()
                        for tmpNucleus, tmpNucleusSpec in iteritems(
                                nucleusList):
                            if tmpNucleus in backlogged_nuclei:
                                tmpLog.info(
                                    ' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                    .format(tmpNucleus))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.info(
                            '{0} candidates passed transfer backlog check'.
                            format(len(nucleusList)))
                        if nucleusList == {}:
                            report_failure(tmpLog, taskSpec, 'no candidates')
                            continue
                    ######################################
                    # check endpoint
                    fractionFreeSpace = {}
                    newNucleusList = {}
                    tmpStat, tmpDatasetSpecList = \
                        self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                            taskSpec.jediTaskID, ['output', 'log'])
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(
                                    tmpDatasetSpec.storageToken
                            ) is not None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                tmpDatasetSpec.storageToken)
                            if tmpEP is None:
                                tmpLog.info(
                                    ' skip nucleus={0} since no endpoint with {1} criteria=-match'
                                    .format(tmpNucleus,
                                            tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # NOTE: the endpoint state is not checked here;
                            # the original carried that check only as a
                            # disabled (string-quoted) snippet
                            # check space
                            tmpSpaceSize = (tmpEP['space_free'] +
                                            tmpEP['space_expired'])
                            tmpSpaceToUse = 0
                            if tmpNucleus in self.fullRW:
                                # 0.25GB per cpuTime/corePower/day
                                tmpSpaceToUse = long(
                                    self.fullRW[tmpNucleus] / 10 / 24 / 3600
                                    * 0.25)
                            if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                tmpLog.info(
                                    ' skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                    .format(tmpNucleus, tmpSpaceSize,
                                            tmpSpaceToUse, diskThreshold,
                                            tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                            # keep fraction of free space; the smallest
                            # fraction over all endpoints is retained
                            if tmpNucleus not in fractionFreeSpace:
                                fractionFreeSpace[tmpNucleus] = {
                                    'total': 0,
                                    'free': 0
                                }
                            try:
                                tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                # e.g. zero total on the first endpoint
                                tmpOld = None
                            try:
                                tmpNew = float(tmpSpaceSize - tmpSpaceToUse) / \
                                    float(tmpEP['space_total'])
                            except Exception:
                                tmpNew = None
                            if tmpNew is not None and (tmpOld is None
                                                       or tmpNew < tmpOld):
                                fractionFreeSpace[tmpNucleus] = {
                                    'total': tmpEP['space_total'],
                                    'free': tmpSpaceSize - tmpSpaceToUse
                                }
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info(
                        '{0} candidates passed endpoint check {1} TB'.format(
                            len(nucleusList), diskThreshold / 1024))
                    if nucleusList == {}:
                        report_failure(tmpLog, taskSpec, 'no candidates')
                        continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                   self.taskBufferIF)
                    tmpSt, tmpRet = jobBroker.doBrokerage(
                        taskSpec, taskSpec.cloud, inputChunk, None, True,
                        tmpSiteList, tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        report_failure(tmpLog, taskSpec,
                                       'no sites can run jobs')
                        continue
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.info(
                                ' skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                .format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed job check'.format(
                        len(nucleusList)))
                    if nucleusList == {}:
                        report_failure(tmpLog, taskSpec, 'no candidates')
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(
                                datasetSpec.datasetName
                        ) in datasetTypeToSkipCheck:
                            continue
                        # primary only
                        if taskParamMap.get('taskBrokerOnMaster') is True \
                                and not datasetSpec.isMaster():
                            continue
                        # use deep scan for primary dataset unless data carousel
                        deepScan = bool(datasetSpec.isMaster()
                                        and not taskSpec.inputPreStaging())
                        # get nuclei where data is available
                        tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                            siteMapper, self.ddmIF, datasetSpec.datasetName,
                            list(nucleusList.keys()), deepScan)
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            report_failure(
                                tmpLog, taskSpec,
                                'failed to get nuclei where data is available, since {0}'
                                .format(tmpRet))
                            toSkip = True
                            break
                        # sum
                        for tmpNucleus, tmpVals in iteritems(tmpRet):
                            if tmpNucleus not in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict(
                                    (k, v + tmpVals[k]) for (k, v) in
                                    iteritems(availableData[tmpNucleus]))
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        skipMsgList = []
                        for tmpNucleus, tmpNucleusSpec in iteritems(
                                nucleusList):
                            tmpAva = availableData[tmpNucleus]
                            if taskSpec.inputPreStaging() and \
                                    tmpAva['ava_num_any'] > 0:
                                # use incomplete replicas for data carousel
                                # since the completeness is guaranteed
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                            elif tmpAva['tot_size'] > thrInputSize and \
                                    tmpAva['ava_size_any'] < tmpAva['tot_size'] * thrInputSizeFrac:
                                tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                    tmpNucleus, tmpAva['ava_size_any'],
                                    tmpAva['tot_size'], thrInputSizeFrac)
                                skipMsgList.append(tmpMsg)
                            elif tmpAva['tot_num'] > thrInputNum and \
                                    tmpAva['ava_num_any'] < tmpAva['tot_num'] * thrInputNumFrac:
                                tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                    tmpNucleus, tmpAva['ava_num_any'],
                                    tmpAva['tot_num'], thrInputNumFrac)
                                skipMsgList.append(tmpMsg)
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        totInputSize = list(availableData.values(
                        ))[0]['tot_size'] / 1024 / 1024 / 1024
                        data_locality_check_str = (
                            '(ioIntensity ({0}) is None or less than {1} kBPerS '
                            'and input size ({2} GB) is less than {3}) '
                            'or task.currentPriority ({4}) is higher than or equal to {5}'
                        ).format(taskSpec.ioIntensity, minIoIntensityWithLD,
                                 int(totInputSize), minInputSizeWithLD,
                                 taskSpec.currentPriority, maxTaskPrioWithLD)
                        if len(newNucleusList) > 0:
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                        elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                              and totInputSize <= minInputSizeWithLD) \
                                or taskSpec.currentPriority >= maxTaskPrioWithLD:
                            # nothing has the data, but the task is light or
                            # urgent enough to run anywhere
                            availableData = {}
                            tmpLog.info(
                                ' disable data locality check since no nucleus has input data, {}'
                                .format(data_locality_check_str))
                        else:
                            # no candidate + unavoidable data locality check
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                            tmpLog.info(
                                ' the following conditions required to disable data locality check: {}'
                                .format(data_locality_check_str))
                        tmpLog.info('{0} candidates passed data check'.format(
                            len(nucleusList)))
                        if nucleusList == {}:
                            report_failure(tmpLog, taskSpec, 'no candidates')
                            continue
                    ######################################
                    # weight
                    # release the lock even when the priority lookup raises
                    self.prioRW.acquire()
                    try:
                        nucleusRW = self.prioRW[taskSpec.currentPriority]
                    finally:
                        self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleus not in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and \
                                nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/( RW={0} )'.format(
                                nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(
                                nucleusRW[tmpNucleus], cutOffRW)
                        # with data
                        if availableData != {}:
                            tmpAva = availableData[tmpNucleus]
                            if tmpAva['tot_size'] > 0:
                                weight *= float(tmpAva['ava_size_any'])
                                weight /= float(tmpAva['tot_size'])
                                wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                    tmpAva['ava_size_any'])
                                wStr += '/ ( total_input_size={0} )'.format(
                                    tmpAva['tot_size'])
                            # negative weight for tape
                            if tmpAva['ava_size_any'] > tmpAva['ava_size_disk']:
                                weight *= negWeightTape
                                wStr += '*( weight_TAPE={0} )'.format(
                                    negWeightTape)
                        # fraction of free space
                        if tmpNucleus in fractionFreeSpace:
                            try:
                                tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                                weight *= tmpFrac
                                wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                    fractionFreeSpace[tmpNucleus]['free'],
                                    fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                # best effort: leave the weight untouched
                                # when the fraction cannot be computed
                                pass
                        tmpLog.info(
                            ' use nucleus={0} weight={1} {2} criteria=+use'
                            .format(tmpNucleus, weight, wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus, weight))
                    tmpLog.info('final {0} candidates'.format(
                        len(nucleusList)))
                    ######################################
                    # final selection
                    tgtWeight = random.uniform(0, totalWeight)
                    candidateNucleus = None
                    for tmpNucleus, weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus is None:
                        # rounding left a tiny remainder; take the last one
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = \
                    self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                        taskSpec.jediTaskID, ['output', 'log'])
                # get destinations
                retMap = {
                    taskSpec.jediTaskID:
                    AtlasBrokerUtils.getDictToSetNucleus(
                        nucleusSpec, tmpDatasetSpecs)
                }
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(
                    candidateNucleus, tmpRet))
                self.sendLogMessage(tmpLog)
                if tmpRet:
                    tmpMsg = 'set task_status=ready'
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                # update RW table; the lock is released even if the update
                # fails part-way
                self.prioRW.acquire()
                try:
                    for prio, rwMap in iteritems(self.prioRW):
                        if prio > taskSpec.currentPriority:
                            continue
                        if candidateNucleus in rwMap:
                            rwMap[candidateNucleus] += taskRW
                        else:
                            rwMap[candidateNucleus] = taskRW
                finally:
                    self.prioRW.release()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                self.__class__.__name__, errtype.__name__, errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
def runImpl(self):
    """Execute queued task commands in batches until the queue is empty.

    Batches of up to 10 ``(jediTaskID, commandMap)`` entries are taken from
    ``self.taskList``. ``commandMap`` supplies 'command', 'comment' and
    'oldStatus'.

    * 'kill', 'finish', 'reassign': look up the task's jobs (at most twice,
      to see the immediate result of a kill); when none are left the task
      record is updated, otherwise the jobs are killed.
    * 'retry', 'incexec': for 'incexec' the stored task parameters are first
      merged with the JSON carried in the comment, then the task is retried.
      The comment keywords 'sole ', 'discard ' and 'staged ' steer the retry.
    * anything else is logged as an unknown command.

    Returns None once ``self.taskList`` yields no more entries. Any
    exception raised while handling a batch is logged and the loop goes on
    with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug(
                    '{0} terminating since no more items'.format(
                        self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(
                    self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None \
                                and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                jediTaskID)
                        elif commandStr == 'reassign' and commentStr is not None \
                                and 'nokill reassign' in commentStr:
                            pandaIDs = []
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(
                                jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error(
                                'failed to get PandaIDs for jediTaskID={0}'
                                .format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment
                                    # since it is reset when actually
                                    # reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site; the comment is
                                    # split on ':' into kind, value and a
                                    # 'y' flag
                                    if commentStr is not None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        elif tmpItems[0] == 'nucleus':
                                            tmpTaskSpec.nucleus = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(
                                            tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate(
                                                'oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if commandStr == 'finish':
                                    # update datasets
                                    tmpLog.info('updating datasets to finish')
                                    tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(
                                        jediTaskID, self.pid)
                                    if not tmpStat:
                                        tmpLog.info(
                                            'wait until datasets are updated to finish'
                                        )
                                    # ignore failGoalUnreached when manually
                                    # finished
                                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                                        jediTaskID)
                                    tmpTaskSpec.splitRule = taskSpec.splitRule
                                    tmpTaskSpec.unsetFailGoalUnreached()
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap(
                                    )[commandStr]['done']
                                tmpMsg = 'set task_status={0}'.format(
                                    tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(
                                    tmpTaskSpec, {'jediTaskID': jediTaskID},
                                    setOldModTime=True)
                                tmpLog.info('done with {0}'.format(
                                    str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs
                                if commentStr and 'soft finish' in commentStr:
                                    # only queued jobs are killed; the rest
                                    # are left to complete
                                    queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                        jediTaskID)
                                    tmpMsg = "trying to kill {0} queued jobs for soft finish".format(
                                        len(queuedPandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = self.taskBufferIF.killJobs(
                                        queuedPandaIDs, commentStr, '52',
                                        True)
                                    tmpMsg = "wating {0} jobs for soft finish".format(
                                        len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(
                                        str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(
                                        len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(
                                            pandaIDs, commentStr, '52', True)
                                    elif commandStr in ['reassign']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(
                                            pandaIDs, commentStr, '51', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(
                                            pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(
                                        str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                                jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except Exception:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in iteritems(newParamMap):
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and \
                                            re.search('^-a [^ ]+$',
                                                      tmpParam['value']) is not None:
                                        tmpParam['value'] = '-a {0}'.format(
                                            taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = \
                                        taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+',
                                               '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(
                                taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(
                                jediTaskID, strTaskParams)
                            if tmpRet is not True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception as e:
                            tmpLog.error(
                                'failed to change task params with {} {}'.
                                format(str(e), traceback.format_exc()))
                            continue
                    # A missing comment means "no keywords": treat it as an
                    # empty string so the membership tests below cannot raise
                    # and abort the rest of the batch.
                    retryComment = commentStr if commentStr is not None else ''
                    # retry child tasks
                    retryChildTasks = 'sole ' not in retryComment
                    # discard events
                    discardEvents = 'discard ' in retryComment
                    # release un-staged files
                    releaseUnstaged = 'staged ' in retryComment
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(
                        jediTaskID,
                        commandStr,
                        retryChildTasks=retryChildTasks,
                        discardEvents=discardEvents,
                        release_unstaged=releaseUnstaged)
                    if tmpRet is True:
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception as e:
            errStr = '{} failed in runImpl() with {} {} '.format(
                self.__class__.__name__, str(e), traceback.format_exc())
            logger.error(errStr)
def runImpl(self):
    """Execute queued task commands in batches until the queue is empty.

    Batches of up to 10 ``(jediTaskID, commandMap)`` entries are taken from
    ``self.taskList``. ``commandMap`` supplies 'command', 'comment' and
    'oldStatus'.

    * 'kill', 'finish', 'reassign': look up the task's jobs (at most twice,
      to see the immediate result of a kill); when none are left the task
      record is updated, otherwise the jobs are killed.
    * 'retry', 'incexec': for 'incexec' the stored task parameters are first
      merged with the JSON carried in the comment, then the task is retried.
      The comment keywords 'sole ' and 'discard ' steer the retry.
    * anything else is logged as an unknown command.

    Returns None once ``self.taskList`` yields no more entries. Any
    exception raised while handling a batch is logged and the loop goes on
    with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None \
                                and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        elif commandStr == 'reassign' and commentStr is not None \
                                and 'nokill reassign' in commentStr:
                            pandaIDs = []
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment
                                    # since it is reset when actually
                                    # reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site; the comment is
                                    # split on ':' into kind, value and a
                                    # 'y' flag
                                    if commentStr is not None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        elif tmpItems[0] == 'nucleus':
                                            tmpTaskSpec.nucleus = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if commandStr == 'finish':
                                    # update datasets
                                    tmpLog.info('updating datasets to finish')
                                    tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                    if not tmpStat:
                                        tmpLog.info('wait until datasets are updated to finish')
                                    # ignore failGoalUnreached when manually
                                    # finished
                                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                    tmpTaskSpec.splitRule = taskSpec.splitRule
                                    tmpTaskSpec.unsetFailGoalUnreached()
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID},
                                                                           setOldModTime=True)
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs; a command without a
                                # comment is never a soft finish
                                if commentStr is not None and 'soft finish' in commentStr:
                                    queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                    tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs, commentStr, '52', True)
                                    tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                                    elif commandStr in ['reassign']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '51', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except Exception:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params; items() works on both Python 2
                            # and Python 3 dictionaries
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and \
                                            re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        # format the sandbox name into the
                                        # option; the original omitted the
                                        # .format call and always raised
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+',
                                               '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet is not True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,
                                                                                           errvalue))
                            continue
                    # A missing comment means "no keywords": treat it as an
                    # empty string so the membership tests below cannot raise
                    # and abort the rest of the batch.
                    retryComment = commentStr if commentStr is not None else ''
                    # retry child tasks
                    retryChildTasks = 'sole ' not in retryComment
                    # discard events
                    discardEvents = 'discard ' in retryComment
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents)
                    if tmpRet is True:
                        tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception:
            # Exception (not a bare except) so that SystemExit and
            # KeyboardInterrupt can still stop this endless loop
            errtype, errvalue = sys.exc_info()[:2]
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,
                                                                    errtype.__name__, errvalue)
            errStr += traceback.format_exc()
            logger.error(errStr)
def runImpl(self):
    """Refine queued tasks and register the outcome through the task buffer.

    Tasks are taken from ``self.taskList`` ten at a time until the list is
    empty, at which point the method returns ``None``.  For every task the
    method decodes the task parameters, instantiates a refiner through
    ``self.implFactory``, extracts the common parameters, checks attribute
    lengths and the parent task state, calls ``doRefine`` and finally
    registers (status ``registered``) or updates the task.

    A task that fails one of these steps is set to ``tobroken``.  A task that
    has to wait for its parent, for input staging or for a temporary error
    is put on hold and revisited later.  Any exception escaping a batch is
    logged with the module-level ``logger`` and the loop goes on with the
    next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # reset the refiner for every task: without this a failure
                # before instantiation would leave the previous task's refiner
                # (and its taskSpec) bound, and the wrong task would be marked
                # as broken in the register step below
                impl = None
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    # the task is skipped without changing its status
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                                                                self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initalize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                                                              traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                      oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) \
                                or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus],
                                                              insertUnknown=impl.unknownDatasetList,
                                                              setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        # no usable spec from the refiner: build a minimal one
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID},
                                                      oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                  oldStatus=[taskStatus])
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if previous attempt didn't use it
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus])
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Feed the file contents of input datasets into JEDI for queued tasks.

    Task/dataset-list pairs are taken from ``self.taskDsList`` ten at a time
    until the list is empty, at which point the method returns ``None``.
    For every task the method reads the task spec and parameters, then for
    each dataset fetches the metadata and the file list (a dummy list for
    pseudo datasets) and inserts the files through
    ``insertFilesForDataset_JEDI``.  Depending on the outcome the task is
    set to ``tobroken``, put on hold (optionally generating parent tasks via
    ``TaskGenerator``) or its status is refreshed through
    ``updateTaskStatusByContFeeder_JEDI``.

    Any exception escaping a batch is logged with the module-level
    ``logger`` and the loop goes on with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec == None:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    taskBroken = True
                    # bind an empty map so that the lookups below neither hit
                    # an unbound name nor reuse the previous task's parameters
                    taskParamMap = {}
                # renaming of parameters
                if 'nEventsPerInputFile' in taskParamMap:
                    taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                # the number of files per job
                nFilesPerJob = None
                if 'nFilesPerJob' in taskParamMap:
                    nFilesPerJob = taskParamMap['nFilesPerJob']
                # the number of chunks used by scout
                nChunksForScout = 10
                # load XML (skipped for a broken task, whose parameter map is
                # empty and whose datasets are not processed anyway)
                if not taskBroken and taskSpec.useLoadXML():
                    xmlConfig = taskParamMap['loadXML']
                else:
                    xmlConfig = None
                # check no wait
                noWaitParent = False
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == 'running':
                        noWaitParent = True
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    origNumFiles = None
                    if 'nFiles' in taskParamMap:
                        origNumFiles = taskParamMap['nFiles']
                    for datasetSpec in dsList:
                        tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName, datasetSpec.datasetID))
                        # get dataset metadata
                        tmpLog.info('get metadata')
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state': 'closed'}
                            # set mutable when parent is running and the dataset is open
                            if noWaitParent and tmpMetadata['state'] == 'open':
                                # dummy metadata when parent is running
                                tmpMetadata = {'state': 'mutable'}
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                        errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                # temporary error
                                taskOnHold = True
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                            allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(taskParamMap,
                                                                                              datasetSpec.datasetName)
                            # get the number of events in metadata
                            if 'getNumEventsInMetadata' in taskParamMap:
                                getNumEvents = True
                            else:
                                getNumEvents = False
                            # get file list from DDM
                            tmpLog.info('get files')
                            try:
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            datasetSpec.containerName not in ['', None]:
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                     getNumEvents=getNumEvents,
                                                                     skipDuplicate=skipDuplicate
                                                                     )
                                    tmpLog.info('got {0} files in {1}'.format(len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles), tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                            tmpLog.info('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {str(uuid.uuid4()): {'lfn': 'pseudo_lfn',
                                                                      'scope': None,
                                                                      'filesize': 0,
                                                                      'checksum': None,
                                                                      }
                                                  }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {'lfn': '{0:06d}:{1}'.format(iPFN, taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                         'scope': None,
                                                                         'filesize': 0,
                                                                         'checksum': None,
                                                                         }
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                # two placeholders, two arguments: the error
                                # value used to be dropped from this message
                                tmpLog.error('failed to get files due to {0}:{1}'.format(errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                if (datasetSpec.isMaster() and 'nEventsPerFile' in taskParamMap) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                # max attempts
                                maxAttempt = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for event-level splitting
                                        if nMaxFiles != None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and 'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # feed files to the contents table
                                tmpLog.info('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec, tmpRet,
                                                                                                                             tmpMetadata['state'],
                                                                                                                             stateUpdateTime,
                                                                                                                             nEventsPerFile,
                                                                                                                             nEventsPerJob,
                                                                                                                             maxAttempt,
                                                                                                                             firstEventNumber,
                                                                                                                             nMaxFiles,
                                                                                                                             nMaxEvents,
                                                                                                                             useScout,
                                                                                                                             fileList,
                                                                                                                             useFilesWithNewAttemptNr,
                                                                                                                             nFilesPerJob,
                                                                                                                             nEventsPerRange,
                                                                                                                             nChunksForScout,
                                                                                                                             includePatt,
                                                                                                                             excludePatt,
                                                                                                                             xmlConfig,
                                                                                                                             noWaitParent,
                                                                                                                             taskSpec.parent_tid,
                                                                                                                             self.pid)
                                if retDB == False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                     diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB == None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.info('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList), datasetSpec.datasetName)
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {'datasetSpec': datasetSpec,
                                                                           'missingFiles': missingFileList}
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0):
                                    tmpErrStr = 'insufficient inputs are ready'
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                        tmpLog.info('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        if not noWaitParent:
                            # initialize task generator
                            taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                            tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                   self.ddmIF.getInterface(taskSpec.vo))
                            if not tmpStat:
                                tmpErrStr = 'failed to initialize TaskGenerator'
                                tmpLog.error(tmpErrStr)
                                taskSpec.status = 'tobroken'
                                taskSpec.setErrDiag(tmpErrStr)
                            else:
                                # make parent tasks if necessary
                                tmpLog.info('make parent tasks with {0} (if necessary)'.format(taskGenerator.getClassName(taskSpec.vo,
                                                                                                                          taskSpec.prodSourceLabel)))
                                tmpStat = taskGenerator.doGenerate(taskSpec, taskParamMap, missingFilesMap=missingMap)
                                if tmpStat == Interaction.SC_FATAL:
                                    # failed to make parent tasks
                                    taskSpec.status = 'tobroken'
                                    tmpLog.error('failed to make parent tasks')
                        # go to pending state
                        if taskSpec.status not in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid, setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, getTaskStatus=True,
                                                                                                    pid=self.pid)
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                tmpLog.info('done')
        except:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def runImpl(self):
    """Choose a nucleus for each queued task and record the assignment.

    Thresholds are read once through ``self.taskBufferIF.getConfigValue``
    (with built-in defaults when the value is ``None``).  Task/input-chunk
    pairs are then taken from ``self.inputList`` one at a time until the
    list is empty, at which point the method returns ``None``.

    If the task already names a known nucleus it is used as is.  Otherwise
    the candidate nuclei are filtered successively by status, transfer
    backlog (skipped for a negative T1 weight), output endpoint and free
    space, ability to run jobs, and input data locality; the survivors are
    weighted and one is drawn at random in proportion to its weight.  The
    choice is stored with ``setCloudToTasks_JEDI`` and the RW table in
    ``self.prioRW`` is updated.

    ``self.prioRW`` is accessed between ``acquire()`` and ``release()``; the
    release happens in ``finally`` so an error inside the critical section
    cannot leave the lock held.  Any exception escaping an iteration is
    logged with the module-level ``logger`` together with the last task ID.
    """
    # cutoff for disk (default 100*1024; reported below as diskThreshold/1024 TB)
    diskThreshold = self.taskBufferIF.getConfigValue(self.msgType,
                                                     'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                     'jedi', 'atlas')
    if diskThreshold is None:
        diskThreshold = 100 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    # thresholds for data availability check
    thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
    if thrInputSize is None:
        thrInputSize = 1
    thrInputSize *= 1024*1024*1024
    thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
    if thrInputNum is None:
        thrInputNum = 100
    thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
    if thrInputSizeFrac is None:
        thrInputSizeFrac = 10
    # fractions are configured in percent
    thrInputSizeFrac = float(thrInputSizeFrac) / 100
    thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
    if thrInputNumFrac is None:
        thrInputNumFrac = 10
    thrInputNumFrac = float(thrInputNumFrac) / 100
    cutOffRW = 50
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                return
            # loop over all tasks
            for taskSpec, inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize,
                                                                                                                thrInputNum,
                                                                                                                thrInputSizeFrac,
                                                                                                                thrInputNumFrac))
                # RW
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleusSpec.state not in ['ACTIVE']:
                            tmpLog.info(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                      tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check status of transfer backlog
                    t1Weight = taskSpec.getT1Weight()
                    if t1Weight < 0:
                        tmpLog.info('skip transfer backlog check due to negative T1Weight')
                    else:
                        newNucleusList = {}
                        backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                        for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                            if tmpNucleus in backlogged_nuclei:
                                tmpLog.info(' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                            format(tmpNucleus))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # check endpoint
                    fractionFreeSpace = {}
                    newNucleusList = {}
                    tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output', 'log'])
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP == None:
                                tmpLog.info(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                  tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # check state: the filter on the endpoint state
                            # (skip unless tmpEP['state'] is ACTIVE,
                            # criteria=-epstatus) is currently disabled
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            tmpSpaceToUse = 0
                            if tmpNucleus in self.fullRW:
                                # 0.25GB per cpuTime/corePower/day
                                tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                            if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                tmpLog.info(' skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                                tmpSpaceSize,
                                                                                                                                                                tmpSpaceToUse,
                                                                                                                                                                diskThreshold,
                                                                                                                                                                tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                            # keep fraction of free space (the smallest one
                            # over the endpoints of this nucleus)
                            if tmpNucleus not in fractionFreeSpace:
                                fractionFreeSpace[tmpNucleus] = {'total': 0, 'free': 0}
                            try:
                                tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                            except:
                                # e.g. division by the initial total of 0
                                tmpOld = None
                            try:
                                tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                            except:
                                tmpNew = None
                            if tmpNew is not None and (tmpOld is None or tmpNew < tmpOld):
                                fractionFreeSpace[tmpNucleus] = {'total': tmpEP['space_total'],
                                                                 'free': tmpSpaceSize-tmpSpaceToUse}
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList), diskThreshold/1024))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF)
                    tmpSt, tmpRet = jobBroker.doBrokerage(taskSpec, taskSpec.cloud, inputChunk, None, True,
                                                          tmpSiteList, tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.error('no sites can run jobs')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.info(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # use deep scan for primary dataset
                        if datasetSpec.isMaster():
                            deepScan = True
                        else:
                            deepScan = False
                        # get nuclei where data is available
                        tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper, self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys(),
                                                                           deepScan)
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum
                        for tmpNucleus, tmpVals in tmpRet.iteritems():
                            if tmpNucleus not in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k, v+tmpVals[k]) for (k, v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        skipMsgList = []
                        for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                            if len(nucleusList) == 1:
                                tmpLog.info(' disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                            elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                    availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                    availableData[tmpNucleus]['tot_size'],
                                                                                                                                    thrInputSizeFrac)
                                skipMsgList.append(tmpMsg)
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                      availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                      availableData[tmpNucleus]['tot_num'],
                                                                                                                                      thrInputNumFrac)
                                skipMsgList.append(tmpMsg)
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        # the filter is applied only when something survives it
                        if len(newNucleusList) > 0:
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                        else:
                            tmpLog.info(' disable data locality check since no nucleus has input data')
                        tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # weight
                    self.prioRW.acquire()
                    try:
                        nucleusRW = self.prioRW[taskSpec.currentPriority]
                    finally:
                        # always release, even if the priority is unknown
                        self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus not in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus], cutOffRW)
                        # with data
                        if availableData != {}:
                            if availableData[tmpNucleus]['tot_size'] > 0:
                                weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                weight /= float(availableData[tmpNucleus]['tot_size'])
                                wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                # negative weight for tape
                                if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                    weight *= negWeightTape
                                    wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                        # fraction of free space
                        if tmpNucleus in fractionFreeSpace:
                            try:
                                tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                                weight *= tmpFrac
                                wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                         fractionFreeSpace[tmpNucleus]['total'])
                            except:
                                # best effort: leave the weight untouched when
                                # the fraction cannot be computed
                                pass
                        tmpLog.info(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus, weight, wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus, weight))
                    tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection: weighted random draw
                    tgtWeight = random.uniform(0, totalWeight)
                    candidateNucleus = None
                    for tmpNucleus, weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus is None:
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output', 'log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec, tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus, tmpRet))
                self.sendLogMessage(tmpLog)
                if tmpRet:
                    tmpMsg = 'set task.status=ready'
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                # update RW table
                self.prioRW.acquire()
                try:
                    for prio, rwMap in self.prioRW.iteritems():
                        if prio > taskSpec.currentPriority:
                            continue
                        if candidateNucleus in rwMap:
                            rwMap[candidateNucleus] += taskRW
                        else:
                            rwMap[candidateNucleus] = taskRW
                finally:
                    self.prioRW.release()
        except:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__, errtype.__name__, errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
def runImpl(self):
    """Feed the contents of input datasets into JEDI for queued tasks.

    Repeatedly takes up to 10 ``(jediTaskID, dsList)`` items from
    ``self.taskDsList`` and returns once the list is empty.  For each task:

    * the task spec and the JSON task parameters are read through
      ``self.taskBufferIF``;
    * for every dataset, metadata and the file list are taken from the DDM
      interface (or fabricated for pseudo datasets / PFN lists) and passed
      to ``insertFilesForDataset_JEDI``;
    * the task is finally marked ``tobroken``, put on hold, or updated via
      ``updateTaskStatusByContFeeder_JEDI`` depending on what happened.

    Any exception escaping the per-batch processing is logged and the loop
    continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug("%s terminating since no more items" % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10)
                if not tmpStat or taskSpec is None:
                    tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID))
                    continue
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error(
                        "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue)
                    )
                    taskBroken = True
                    # use an empty map so that the code below neither hits an
                    # undefined name nor reuses the parameters of the previous task
                    taskParamMap = {}
                # renaming of parameters
                if "nEventsPerInputFile" in taskParamMap:
                    taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"]
                # the number of files per job
                nFilesPerJob = taskParamMap.get("nFilesPerJob")
                # the number of files used by scout
                if nFilesPerJob is not None:
                    nFilesForScout = 10 * nFilesPerJob
                else:
                    nFilesForScout = 10
                # load XML
                xmlConfig = None
                if taskSpec.useLoadXML():
                    try:
                        loadXML = taskParamMap["loadXML"]
                        xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue))
                        taskBroken = True
                # check no wait
                noWaitParent = False
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == "running":
                        noWaitParent = True
                # loop over all datasets
                nFilesMaster = 0
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    # keep the original nFiles since taskParamMap["nFiles"] is
                    # decremented below as master files are consumed
                    origNumFiles = taskParamMap.get("nFiles")
                    for datasetSpec in dsList:
                        tmpLog.info(
                            "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID)
                        )
                        # get dataset metadata
                        tmpLog.info("get metadata")
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {"state": "closed"}
                            # set mutable when parent is running and the dataset is open
                            if noWaitParent and tmpMetadata["state"] == "open":
                                # dummy metadata when parent is running
                                tmpMetadata = {"state": "mutable"}
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error(
                                "{0} failed to get metadata to {1}:{2}".format(
                                    self.__class__.__name__, errtype.__name__, errvalue
                                )
                            )
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = "broken"
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                # temporary error
                                taskOnHold = True
                            taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName))
                            allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                taskParamMap, datasetSpec.datasetName
                            )
                            # get the number of events in metadata
                            getNumEvents = "getNumEventsInMetadata" in taskParamMap
                            # get file list from DDM
                            tmpLog.info("get files")
                            try:
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if (
                                        fileList != []
                                        and "useInFilesInContainer" in taskParamMap
                                        and datasetSpec.containerName not in ["", None]
                                    ):
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    tmpRet = ddmIF.getFilesInDataset(
                                        tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate
                                    )
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.info(
                                            "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName)
                                        )
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.info("removed {0}".format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {
                                            str(uuid.uuid4()): {
                                                "lfn": "pseudo_lfn",
                                                "scope": None,
                                                "filesize": 0,
                                                "checksum": None,
                                            }
                                        }
                                    else:
                                        # make dummy file list for PFN list
                                        if "nFiles" in taskParamMap:
                                            nPFN = taskParamMap["nFiles"]
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                "lfn": "{0:06d}:{1}".format(
                                                    iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1]
                                                ),
                                                "scope": None,
                                                "filesize": 0,
                                                "checksum": None,
                                            }
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                # three placeholders for three arguments so that
                                # the error value is actually reported
                                tmpLog.error(
                                    "{0} failed to get files due to {1}:{2}".format(
                                        self.__class__.__name__, errtype.__name__, errvalue
                                    )
                                )
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = "broken"
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                if (datasetSpec.isMaster() and "nEventsPerFile" in taskParamMap) or (
                                    datasetSpec.isPseudo() and "nEvents" in taskParamMap
                                ):
                                    if "nEventsPerFile" in taskParamMap:
                                        nEventsPerFile = taskParamMap["nEventsPerFile"]
                                    elif datasetSpec.isPseudo() and "nEvents" in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap["nEvents"]
                                    if "nEventsPerJob" in taskParamMap:
                                        nEventsPerJob = taskParamMap["nEventsPerJob"]
                                    elif "nEventsPerRange" in taskParamMap:
                                        nEventsPerRange = taskParamMap["nEventsPerRange"]
                                # max attempts and first event number
                                maxAttempt = None
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    # max attempts
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif "maxAttempt" in taskParamMap:
                                        maxAttempt = taskParamMap["maxAttempt"]
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                    # first event number
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and "nEvents" in taskParamMap:
                                    nMaxEvents = taskParamMap["nEvents"]
                                # nMaxFiles
                                nMaxFiles = None
                                if "nFiles" in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap["nFiles"]
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for event-level splitting
                                        if nMaxFiles is not None and "nEventsPerFile" in taskParamMap:
                                            if "nEventsPerJob" in taskParamMap:
                                                if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerJob"]:
                                                    nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                        taskParamMap["nEventsPerJob"]
                                                    )
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif "nEventsPerRange" in taskParamMap:
                                                if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]:
                                                    nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                        taskParamMap["nEventsPerRange"]
                                                    )
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout():
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if (
                                    not datasetSpec.isPseudo()
                                    and fileList != []
                                    and "useInFilesWithNewAttemptNr" in taskParamMap
                                ):
                                    useFilesWithNewAttemptNr = True
                                # feed files to the contents table
                                tmpLog.info("update contents")
                                retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                    datasetSpec,
                                    tmpRet,
                                    tmpMetadata["state"],
                                    stateUpdateTime,
                                    nEventsPerFile,
                                    nEventsPerJob,
                                    maxAttempt,
                                    firstEventNumber,
                                    nMaxFiles,
                                    nMaxEvents,
                                    useScout,
                                    fileList,
                                    useFilesWithNewAttemptNr,
                                    nFilesPerJob,
                                    nEventsPerRange,
                                    nFilesForScout,
                                    includePatt,
                                    excludePatt,
                                    xmlConfig,
                                    noWaitParent,
                                    taskSpec.parent_tid,
                                )
                                if retDB == False:
                                    taskSpec.setErrDiag(
                                        "failed to insert files for {0}. {1}".format(
                                            datasetSpec.datasetName, diagMap["errMsg"]
                                        )
                                    )
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB == None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = "{0} files missing in {1}".format(
                                        len(missingFileList), datasetSpec.datasetName
                                    )
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {
                                        "datasetSpec": datasetSpec,
                                        "missingFiles": missingFileList,
                                    }
                                else:
                                    # reduce the number of files to be read
                                    if "nFiles" in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap["nFiles"] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nFilesForScout = diagMap["nFilesForScout"]
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap["isRunningTask"]:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap["nActivatedPending"] == 0:
                                    tmpErrStr = "insufficient inputs are ready"
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                        tmpLog.info("end loop")
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0:
                    tmpErrStr = "no master input files. input dataset is empty"
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = "tobroken"
                    tmpMsg = "set task.status={0}".format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        if not noWaitParent:
                            # initialize task generator
                            taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                            tmpStat = taskGenerator.initializeMods(
                                self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo)
                            )
                            if not tmpStat:
                                tmpErrStr = "failed to initialize TaskGenerator"
                                tmpLog.error(tmpErrStr)
                                taskSpec.status = "tobroken"
                                taskSpec.setErrDiag(tmpErrStr)
                            else:
                                # make parent tasks if necessary
                                tmpLog.info(
                                    "make parent tasks with {0} (if necessary)".format(
                                        taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)
                                    )
                                )
                                tmpStat = taskGenerator.doGenerate(
                                    taskSpec, taskParamMap, missingFilesMap=missingMap
                                )
                                if tmpStat == Interaction.SC_FATAL:
                                    # failed to make parent tasks
                                    taskSpec.status = "tobroken"
                                    tmpLog.error("failed to make parent tasks")
                        # go to pending state
                        if taskSpec.status not in ["broken", "tobroken"]:
                            taskSpec.setOnHold()
                        tmpMsg = "set task.status={0}".format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, getTaskStatus=True
                        )
                        tmpMsg = "set task.status={0}".format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                tmpLog.info("done")
        except:
            # keep the worker alive: log the failure and go on with the next batch
            errtype, errvalue = sys.exc_info()[:2]
            logger.error(
                "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue)
            )
def runImpl(self):
    """Refine queued tasks and register them in the JEDI tables.

    Repeatedly takes up to 10 ``(jediTaskID, splitRule, taskStatus,
    parent_tid)`` items from ``self.taskList`` and returns once the list is
    empty.  Each task goes through these stages, each of which runs only
    when the previous ones left ``tmpStat`` at ``Interaction.SC_SUCCEEDED``:
    decode task parameters, instantiate the refiner implementation, extract
    common parameters, check attribute lengths, handle staging, check the
    parent task, and refine.  On failure the task is set to ``tobroken``;
    otherwise it is registered (full registration for ``registered`` /
    ``staged`` tasks, an update plus dataset append for the others).

    Any exception escaping the per-batch processing is logged and the loop
    continues with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # reset per task so that a failure before instantiation can
                # neither hit an undefined name nor reuse the refiner (and
                # task spec) left over from the previous task
                impl = None
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    # skip the task without changing its status
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                                                                self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__, errvalue,
                                                                                               traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # staging
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if 'toStaging' in taskParamMap and taskStatus != 'staged':
                        errStr = 'wait until staging is done'
                        impl.taskSpec.status = 'staging'
                        impl.taskSpec.oldStatus = taskStatus
                        impl.taskSpec.setErrDiag(errStr)
                        # not to update some task attributes
                        impl.taskSpec.resetRefinedAttrs()
                        tmpLog.info(errStr)
                        self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                          oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                        continue
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    # not to update some task attributes
                                    impl.taskSpec.resetRefinedAttrs()
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                      oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            # not to update some task attributes
                            impl.taskSpec.resetRefinedAttrs()
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus],
                                                              insertUnknown=impl.unknownDatasetList,
                                                              setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        # no refiner available: build a minimal spec for this task
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID},
                                                      oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus in ['registered', 'staged']:
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID, impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                  oldStatus=[taskStatus])
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if previous attempt didn't use it
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # disallow to reset some attributes
                            for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                impl.taskSpec.resetChangedAttr(attName)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus])
                            # appending for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except:
            # keep the worker alive: log the failure and go on with the next batch
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
    """Decide whether job generation for a cloud/work queue must be throttled.

    :param vo: virtual organization name (used for logging and DB lookups)
    :param prodSourceLabel: source label (used for logging)
    :param cloudName: name of the cloud being checked
    :param workQueue: work queue object; ``getIDs()``, ``queue_name`` and
                      ``queue_share`` are read
    :param jobStat: nested mapping ``cloud -> workQueueID -> job status ->
                    number of jobs``
    :return: ``self.retThrottled`` when the cloud is undefined, offline, or
             in test mode for a non-test queue; ``self.retTmpError`` when
             the priority of waiting tasks cannot be obtained;
             ``self.retMergeUnThr`` when one of the queue-depth checks
             fires; ``self.retUnThrottled`` otherwise.

    As a side effect ``self.setMaxNumJobs`` is called with the bunch size
    divided among ``nParallel`` workers.
    """
    # params
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 500
    nJobsInBunchMin = 300
    nJobsInBunchMaxES = 1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 8
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueIDs = workQueue.getIDs()
    msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
    tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
    # check cloud status
    if not self.siteMapper.checkCloud(cloudName):
        msgBody = "SKIP cloud={0} undefined".format(cloudName)
        tmpLog.debug(msgHeader+" "+msgBody)
        tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
        return self.retThrottled
    cloudSpec = self.siteMapper.getCloud(cloudName)
    if cloudSpec['status'] in ['offline']:
        msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
        tmpLog.debug(msgHeader+" "+msgBody)
        tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
        return self.retThrottled
    if cloudSpec['status'] in ['test']:
        if workQueue.queue_name != 'test':
            msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                              workQueue.queue_name)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
            tmpLog.debug(msgHeader+" "+msgBody)
            return self.retThrottled
    # check if unthrottled
    if workQueue.queue_share is None:
        msgBody = "PASS unthrottled since share=None"
        tmpLog.debug(msgHeader+" "+msgBody)
        return self.retUnThrottled
    # count number of jobs in each status
    nRunning = 0
    nNotRun = 0
    nDefine = 0
    nWaiting = 0
    for workQueueID in workQueueIDs:
        if cloudName in jobStat and workQueueID in jobStat[cloudName]:
            tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
            for pState,pNumber in jobStat[cloudName][workQueueID].items():
                if pState in ['running']:
                    nRunning += pNumber
                elif pState in ['assigned','activated','starting']:
                    nNotRun += pNumber
                elif pState in ['defined']:
                    nDefine += pNumber
                elif pState in ['waiting']:
                    nWaiting += pNumber
    # check if higher prio tasks are waiting
    # NOTE(review): tmpStat is not checked before highestPrioJobStat is indexed - confirm the call cannot fail
    tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    # the highest priority of waiting tasks
    highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                     'managed',cloudName)
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error(msgHeader+" "+msgBody)
        return self.retTmpError
    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                     nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                            highestPrioInPandaDB,
                                                                                                            nNotRunHighestPrio,
                                                                                                            highPrioQueued))
    # set maximum number of jobs to be submitted
    tmpRemainingSlot = int(nRunning*threshold-nNotRun)
    if tmpRemainingSlot < nJobsInBunchMin:
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = nJobsInBunchMin
    else:
        if workQueue.queue_name in ['evgensimul']:
            # use higher limit for evgensimul
            if tmpRemainingSlot < nJobsInBunchMaxES:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMaxES
        else:
            if tmpRemainingSlot < nJobsInBunchMax:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMax
    nQueueLimit = nJobsInBunch*nBunch
    # use special limit for CERN
    if cloudName == 'CERN':
        nQueueLimit = 2000
    # use nPrestage for reprocessing
    if workQueue.queue_name in ['reprocessing']:
        if 'nprestage' in cloudSpec and cloudSpec['nprestage'] > 0:
            nQueueLimit = cloudSpec['nprestage']
            # reset nJobsInBunch
            if nQueueLimit > (nNotRun+nDefine):
                tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
                if tmpRemainingSlot < nJobsInBunch:
                    pass
                elif tmpRemainingSlot < nJobsInBunchMax:
                    nJobsInBunch = tmpRemainingSlot
                else:
                    nJobsInBunch = nJobsInBunchMax
    # set number of jobs to be submitted
    # floor division keeps the job count an integer on any Python version
    self.setMaxNumJobs(nJobsInBunch//nParallel)
    # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    # NOTE(review): limitPriority is only set on branches that return right
    # away, so it is still False when the end of the method is reached
    limitPriority = False
    tmpLog.debug(msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3}".format(nQueueLimit,
                                                                                          nNotRun+nDefine,
                                                                                          nDefine,
                                                                                          nRunning))
    # check when high prio tasks are not waiting
    if not highPrioQueued:
        if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit:
            limitPriority = True
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued({0})>{1}".format(nNotRun+nDefine,nQueueLimit)
            tmpLog.debug(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
            return self.retMergeUnThr
        elif nRunning != 0 and float(nNotRun)/float(nRunning) > threshold and (nNotRun+nDefine) > nQueueLimit:
            limitPriority = True
            # enough jobs in Panda
            msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued+Defined({3})>{4}".format(nNotRun,nRunning,
                                                                                             threshold,nNotRun+nDefine,
                                                                                             nQueueLimit)
            tmpLog.debug(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
            return self.retMergeUnThr
        elif nDefine > nQueueLimit:
            limitPriority = True
            # brokerage is stuck
            msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
            tmpLog.debug(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
            return self.retMergeUnThr
        elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
            limitPriority = True
            # too many waiting
            msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                         nJobsInBunch,nWaitingBunchLimit)
            tmpLog.debug(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
            return self.retMergeUnThr
    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        limitPriorityValue = highestPrioInPandaDB
        self.setMinPriority(limitPriorityValue)
    msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
    tmpLog.debug(msgHeader+" "+msgBody)
    return self.retUnThrottled
def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
    """Decide whether job generation for a cloud/work queue must be throttled.

    :param vo: virtual organization name (used for logging and DB lookups)
    :param prodSourceLabel: source label (used for logging and DB lookups)
    :param cloudName: name of the cloud being checked
    :param workQueue: work queue object; ``getIDs()``, ``queue_name``,
                      ``queue_share`` and ``criteria`` are read
    :param jobStat: nested mapping ``cloud -> workQueueID -> job status ->
                    number of jobs``
    :return: ``self.retThrottled`` when the cloud is undefined, offline, or
             in test mode for a non-test queue; ``self.retTmpError`` when
             the priority of waiting tasks cannot be obtained;
             ``self.retMergeUnThr`` when a limit is exceeded and no
             high-priority task is waiting; ``self.retUnThrottled``
             otherwise.

    Side effects: ``self.setMaxNumJobs`` is always called with the bunch
    size divided among ``nParallel`` workers.  When a limit is exceeded
    while high-priority tasks are waiting, ``self.setMinPriority`` is
    called with the highest waiting priority.  When no limit is exceeded
    and too few jobs are queued, ``self.notEnoughJobsQueued`` is called and
    the maximum number of jobs is raised.
    """
    # component name
    compName = 'prod_job_throttler'
    # params
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    nJobsInBunchMaxES = 1000
    if workQueue.criteria is not None and 'site' in workQueue.criteria:
        minTotalWalltime = 10*1000*1000
    else:
        minTotalWalltime = 50*1000*1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueIDs = workQueue.getIDs()
    msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
    tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
    # change threshold
    if workQueue.queue_name in ['mcore']:
        threshold = 5.0
    # check cloud status
    if not self.siteMapper.checkCloud(cloudName):
        msgBody = "SKIP cloud={0} undefined".format(cloudName)
        tmpLog.warning(msgHeader+" "+msgBody)
        tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
        return self.retThrottled
    cloudSpec = self.siteMapper.getCloud(cloudName)
    if cloudSpec['status'] in ['offline']:
        msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
        tmpLog.warning(msgHeader+" "+msgBody)
        tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
        return self.retThrottled
    if cloudSpec['status'] in ['test']:
        if workQueue.queue_name != 'test':
            msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                              workQueue.queue_name)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
            tmpLog.warning(msgHeader+" "+msgBody)
            return self.retThrottled
    # check if unthrottled
    if workQueue.queue_share is None:
        msgBody = "PASS unthrottled since share=None"
        tmpLog.debug(msgHeader+" "+msgBody)
        return self.retUnThrottled
    # count number of jobs in each status
    nRunning = 0
    nNotRun = 0
    nDefine = 0
    nWaiting = 0
    for workQueueID in workQueueIDs:
        if cloudName in jobStat and workQueueID in jobStat[cloudName]:
            tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
            for pState,pNumber in jobStat[cloudName][workQueueID].items():
                if pState in ['running']:
                    nRunning += pNumber
                elif pState in ['assigned','activated','starting']:
                    nNotRun += pNumber
                elif pState in ['defined']:
                    nDefine += pNumber
                elif pState in ['waiting']:
                    nWaiting += pNumber
    # check if higher prio tasks are waiting
    # NOTE(review): tmpStat is not checked before highestPrioJobStat is indexed - confirm the call cannot fail
    tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    # the highest priority of waiting tasks
    highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                     'managed',cloudName)
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error(msgHeader+" "+msgBody)
        return self.retTmpError
    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                     nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                            highestPrioInPandaDB,
                                                                                                            nNotRunHighestPrio,
                                                                                                            highPrioQueued))
    # set maximum number of jobs to be submitted
    tmpRemainingSlot = int(nRunning*threshold-nNotRun)
    if tmpRemainingSlot < nJobsInBunchMin:
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = nJobsInBunchMin
    else:
        if workQueue.queue_name in ['evgensimul']:
            # use higher limit for evgensimul
            if tmpRemainingSlot < nJobsInBunchMaxES:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMaxES
        else:
            if tmpRemainingSlot < nJobsInBunchMax:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMax
    nQueueLimit = nJobsInBunch*nBunch
    # use special nQueueLimit
    tmpVal = self.taskBufferIF.getConfigValue(compName, 'NQUEUELIMIT_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
    if tmpVal is not None:
        nQueueLimit = tmpVal
    # use nPrestage for reprocessing
    if workQueue.queue_name in ['reprocessing','mcore_repro']:
        # reset nJobsInBunch
        if nQueueLimit > (nNotRun+nDefine):
            tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
            if tmpRemainingSlot < nJobsInBunch:
                pass
            elif tmpRemainingSlot < nJobsInBunchMax:
                nJobsInBunch = tmpRemainingSlot
            else:
                nJobsInBunch = nJobsInBunchMax
    # get cap
    nRunningCap = self.taskBufferIF.getConfigValue(compName, 'NRUNNINGCAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
    nQueueCap = self.taskBufferIF.getConfigValue(compName, 'NQUEUECAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
    # set number of jobs to be submitted
    # floor division keeps the job count an integer on any Python version
    self.setMaxNumJobs(nJobsInBunch//nParallel)
    # get total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo,prodSourceLabel,workQueue,cloudName)
    # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    limitPriority = False
    tmpStr = msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3} totWalltime:{4} nRunCap:{5} nQueueCap:{6}"
    tmpLog.debug(tmpStr.format(nQueueLimit,
                               nNotRun+nDefine,
                               nDefine,
                               nRunning,
                               totWalltime,
                               nRunningCap,
                               nQueueCap))
    # check
    # each branch flags the priority limit; the method only returns early
    # when no high-priority task is waiting
    if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued({0})>{1} totWalltime({2})>{3} ".format(nNotRun+nDefine,nQueueLimit,
                                                                                                 totWalltime,minTotalWalltime)
            tmpLog.warning(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
            return self.retMergeUnThr
    elif nRunning != 0 and float(nNotRun+nDefine)/float(nRunning) > threshold and \
            (nNotRun+nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued({3})>{4} totWalltime({5})>{6}".format(nNotRun+nDefine,nRunning,
                                                                                                           threshold,nNotRun+nDefine,
                                                                                                           nQueueLimit,
                                                                                                           totWalltime,minTotalWalltime)
            tmpLog.warning(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
            return self.retMergeUnThr
    elif nDefine > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
            tmpLog.warning(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
            return self.retMergeUnThr
    elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                         nJobsInBunch,nWaitingBunchLimit)
            tmpLog.warning(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
            return self.retMergeUnThr
    elif nRunningCap is not None and nRunning > nRunningCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on running
            msgBody = "SKIP nRunning({0})>nRunningCap({1})".format(nRunning,nRunningCap)
            tmpLog.warning(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
            return self.retMergeUnThr
    elif nQueueCap is not None and nNotRun+nDefine > nQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            msgBody = "SKIP nQueue({0})>nQueueCap({1})".format(nNotRun+nDefine,nQueueCap)
            tmpLog.warning(msgHeader+" "+msgBody)
            tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
            return self.retMergeUnThr
    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        # a limit was exceeded but high-priority tasks are waiting:
        # let only those through
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    else:
        # not enough jobs are queued
        if nNotRun+nDefine < max(nQueueLimit,nRunning) or (totWalltime is not None and totWalltime < minTotalWalltime):
            tmpLog.debug(msgHeader+" not enough jobs queued")
            self.notEnoughJobsQueued()
            # NOTE(review): assumes nQueueLimit is an integer (also when it comes from the config table) - confirm
            self.setMaxNumJobs(max(self.maxNumJobs,nQueueLimit//20))
    msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
    tmpLog.debug(msgHeader+" "+msgBody)
    return self.retUnThrottled
def runImpl(self):
    """Process pending task commands until the shared task list is empty.

    Repeatedly fetches up to 10 ``(jediTaskID, commandMap)`` pairs from
    ``self.taskList`` and handles each according to ``commandMap['command']``:

    * ``kill`` / ``finish`` / ``reassign``: look up the PandaIDs of the task.
      When there are none the task record is updated directly; otherwise a
      kill request is sent for those jobs.  For ``reassign`` the comment is
      parsed as ``<'cloud' or other>:<destination>:<'y' to restore oldStatus>``.
    * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters are
      first merged with the JSON-encoded overrides carried in the comment
      (a ``None`` value deletes the key), then the task is retried.
    * anything else is logged as an unknown command.

    Returns ``None`` once ``self.taskList.get()`` yields no more items.  Any
    exception raised while handling a batch is logged and the loop continues
    with the next batch.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more tasks
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    # get active PandaIDs to be killed
                    pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                    if pandaIDs is None:
                        tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                        tmpStat = Interaction.SC_FAILED
                    # kill jobs or update task
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        if pandaIDs == []:
                            # done since no active jobs
                            tmpLog.info('completed the command')
                            tmpTaskSpec = JediTaskSpec()
                            tmpTaskSpec.jediTaskID = jediTaskID
                            updateTaskStatus = True
                            if commandStr != 'reassign':
                                # oldStatus is force-updated here; for reassignment it is
                                # kept since it is reset when actually reassigned
                                tmpTaskSpec.forceUpdate('oldStatus')
                            else:
                                # extract cloud or site
                                tmpItems = commentStr.split(':')
                                if tmpItems[0] == 'cloud':
                                    tmpTaskSpec.cloud = tmpItems[1]
                                else:
                                    tmpTaskSpec.site = tmpItems[1]
                                # back to oldStatus if necessary
                                if tmpItems[2] == 'y':
                                    tmpTaskSpec.status = oldStatus
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                    updateTaskStatus = False
                            if updateTaskStatus:
                                tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                            tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID})
                        else:
                            tmpLog.info('sending kill command')
                            tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                        tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params; they come back only if the new params set them
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    taskParamMap.pop(newKey, None)
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and \
                                            re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        # the original lacked .format() here and raised
                                        # AttributeError on the string literal
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+',
                                               '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except:
                            # skip the retry for this task when its params could not be changed
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__, errvalue))
                            continue
                    # retry failed files
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except:
            # log and go on with the next batch
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
    """Decide whether job submission for a work queue must be throttled.

    Compares the job statistics prepared for the work queue / resource type
    with the configured queue limit, queue cap and running cap, and with the
    priorities of waiting tasks.

    Returns ``self.retUnThrottled`` when submission may proceed,
    ``self.retMergeUnThr`` when one of the limits is exceeded, or
    ``self.retTmpError`` when the highest priority of waiting tasks could not
    be obtained.  Along the way it calls ``self.setMaxNumJobs``,
    ``self.setMinPriority`` and ``self.notEnoughJobsQueued`` as applicable.
    """
    # tuning constants
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    minTotalWalltime = 50*1000*1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    nParallelCap = 5

    # logger and common message prefix (spaces in the queue name become underscores)
    tmpLog = MsgWrapper(logger)
    workQueueID = workQueue.getID()
    queueName = workQueue.queue_name
    msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
        vo, prodSourceLabel, cloudName, queueName.replace(' ', '_'), resource_name)
    tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))

    def _throttled(msgBody):
        # report the SKIP reason to the log and the message channel
        fullMsg = "{0} {1}".format(msgHeader, msgBody)
        tmpLog.warning(fullMsg)
        tmpLog.sendMsg(fullMsg, self.msgType, msgLevel='warning', escapeChar=True)
        return self.retMergeUnThr

    # central configuration values
    config_map = self.__getConfiguration(vo, queueName, resource_name)
    configQueueLimit = config_map[NQUEUELIMIT]['value']
    configQueueCap = config_map[NQUEUECAP]['value']
    configRunningCap = config_map[NRUNNINGCAP]['value']
    tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'.format(
        configQueueLimit, configQueueCap, configRunningCap))

    # nothing to do for unthrottled queues
    if not workQueue.throttled:
        tmpLog.info(msgHeader + " " + "PASS unthrottled since GS_throttled is False")
        return self.retUnThrottled

    # job statistics for our wq/gs, read in one go
    jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
    statKeys = ('nRunning_rt', 'nRunning_gs', 'nRunning_runningcap',
                'nNotRun_rt', 'nNotRun_gs', 'nNotRun_queuelimit', 'nNotRun_queuecap',
                'nDefine_rt', 'nDefine_gs', 'nDefine_queuelimit', 'nDefine_queuecap',
                'nWaiting_rt', 'nWaiting_gs')
    (nRunning_rt, nRunning_gs, nRunning_runningcap,
     nNotRun_rt, nNotRun_gs, nNotRun_queuelimit, nNotRun_queuecap,
     nDefine_rt, nDefine_gs, nDefine_queuelimit, nDefine_queuecap,
     nWaiting_rt, nWaiting_gs) = [jobstats_map[statKey] for statKey in statKeys]

    # queues listed in non_rt_wqs are evaluated with the *_gs figures and
    # without the resource name; all others with the *_rt figures
    isNonRt = queueName in non_rt_wqs
    extraArgs = () if isNonRt else (resource_name,)

    # highest priority of currently defined jobs
    tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
        'managed', cloudName, workQueue, *extraArgs)
    # highest priority of waiting tasks
    highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
        vo, workQueue, 'managed', cloudName, *extraArgs)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    if highestPrioWaiting is None:
        tmpLog.error("{0} {1}".format(msgHeader, 'failed to get the highest priority of waiting tasks'))
        return self.retTmpError

    # are high priority tasks waiting?
    highPrioQueued = highestPrioWaiting > highestPrioInPandaDB \
        or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin)
    tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(
        msgHeader, highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued))

    # figures at the level (global share or resource type) relevant for this queue
    if isNonRt:
        nRunningScope = nRunning_gs
        nNotRunScope = nNotRun_gs
        nQueuedScope = nNotRun_gs + nDefine_gs
    else:
        nRunningScope = nRunning_rt
        nNotRunScope = nNotRun_rt
        nQueuedScope = nNotRun_rt + nDefine_rt

    # maximum number of jobs to be submitted;
    # the lower limit avoids creating too many _sub/_dis datasets
    nJobsInBunch = min(max(nJobsInBunchMin, int(nRunningScope * threshold - nNotRunScope)),
                       nJobsInBunchMax)
    nQueueLimit = configQueueLimit if configQueueLimit is not None else nJobsInBunch * nBunch
    nQueuedForLimit = nNotRun_queuelimit + nDefine_queuelimit

    # use nPrestage for reprocessing: enlarge the bunch up to the free room below the limit
    if queueName in ['Heavy Ion', 'Reprocessing default'] and nQueueLimit > nQueuedForLimit:
        freeSlots = nQueueLimit - nQueuedForLimit
        if freeSlots > nJobsInBunch:
            nJobsInBunch = min(freeSlots, nJobsInBunchMax)

    # number of jobs to be submitted
    if configQueueCap is None:
        self.setMaxNumJobs(nJobsInBunch / nParallel)
    else:
        self.setMaxNumJobs(configQueueCap / nParallelCap)

    # total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)

    # log the current situation and limits
    tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
        msgHeader, nQueueLimit, configRunningCap, configQueueCap))
    tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(
        msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs, nRunning_gs))
    tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(
        msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt, nRunning_rt, totWalltime))

    overQueueLimit = nQueuedForLimit > nQueueLimit
    # the walltime condition is applied only to queues outside non_rt_wqs
    walltimeGate = isNonRt or totWalltime is None or totWalltime > minTotalWalltime

    # check number of jobs when high priority jobs are not waiting.
    # only the first limit that trips is considered
    limitPriority = False
    if nRunningScope == 0 and overQueueLimit and walltimeGate:
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            return _throttled("SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nQueuedForLimit, nQueueLimit, totWalltime, minTotalWalltime))
    elif nRunningScope != 0 and float(nQueuedScope) / float(nRunningScope) > threshold \
            and overQueueLimit and walltimeGate:
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            if isNonRt:
                skipBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nQueuedScope, nRunningScope, threshold, nQueuedForLimit, nQueueLimit)
            else:
                skipBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nQueuedScope, nRunningScope, threshold, nQueuedForLimit, nQueueLimit,
                    totWalltime, minTotalWalltime)
            return _throttled(skipBody)
    elif nDefine_queuelimit > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            return _throttled("SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit))
    elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            return _throttled("SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit))
    elif configRunningCap and nRunning_runningcap > configRunningCap:
        # cap on running: applied even when high priority tasks are waiting
        return _throttled("SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
            nRunning_runningcap, configRunningCap))
    elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            return _throttled("SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                nNotRun_queuecap + nDefine_queuecap, configQueueCap))

    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        # a limit tripped while high priority tasks wait: let only those through
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    elif nQueuedForLimit < nQueueLimit * 0.9 or nQueuedScope < nRunningScope:
        # not enough jobs are queued
        tmpLog.debug(msgHeader+" not enough jobs queued")
        self.notEnoughJobsQueued()
        self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit/20))

    msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
    tmpLog.info(msgHeader+" "+msgBody)
    return self.retUnThrottled