Python MsgWrapper.sendMsg примеры использования

Язык программирования: Python

Пространство имен/Пакет: pandajedi.jedicore.MsgWrapper

Класс/Тип: MsgWrapper

Метод/Функция: sendMsg

Примеров на hotexamples.com: 18

Python MsgWrapper.sendMsg — найдено 18 примеров. Это лучшие примеры кода на Python для pandajedi.jedicore.MsgWrapper.MsgWrapper.sendMsg, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

MsgWrapper(30)

debug(30)

error(30)

info(23)

sendMsg(8)

uploadLog(7)

warning(3)

Пример #1

Показать файл

Файл: ContentsFeeder.py Проект: PanDAWMS/panda-jedi

    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID,dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    datasetsIdxConsistency = []

                    # get task
                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                    if not tmpStat or taskSpec == None:
                        self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                        continue

                    # make logger
                    try:
                        gshare = '_'.join(taskSpec.gshare.split(' '))
                    except:
                        gshare = 'Undefined'
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))

                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = taskSpec.getNumFilesPerJob()
                    # the number of chunks used by scout 
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                                  ['output','log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                            # index consistency
                            if datasetSpec.indexConsistent():
                                datasetsIdxConsistency.append(datasetSpec.datasetID)
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()                    
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state':'closed'}
                                # set mutable when the dataset is open and either the parent is running or the task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state':'mutable'}
                                gotMetadata = True
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                            errtype.__name__,errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status    
                                    self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing 
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                         getNumEvents=getNumEvents,
                                                                         skipDuplicate=skipDuplicate,
                                                                         longFormat=longFormat
                                                                         )
                                        tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                            for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskSpec.getNumFilesPerJob() is not None:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy 
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                          {'datasetName':datasetSpec.datasetName},
                                                                                                                          ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                except:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                                 errtype.__name__,errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status    
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile  = None
                                    nEventsPerJob   = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts 
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry 
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None 
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                            # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False    
                                    if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers    
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0
                                    # skip short input
                                    if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                            and nEventsPerFile is not None and nEventsPerJob is not None \
                                            and nEventsPerFile >= nEventsPerJob \
                                            and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                        skipShortInput = True
                                    else:
                                        skipShortInput = False
                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                              tmpMetadata['state'],
                                                                                                                              stateUpdateTime,
                                                                                                                              nEventsPerFile,
                                                                                                                              nEventsPerJob,
                                                                                                                              maxAttempt,
                                                                                                                              firstEventNumber,
                                                                                                                              nMaxFiles,
                                                                                                                              nMaxEvents,
                                                                                                                              useScout,
                                                                                                                              fileList,
                                                                                                                              useFilesWithNewAttemptNr,
                                                                                                                              nFilesPerJob,
                                                                                                                              nEventsPerRange,
                                                                                                                              nChunksForScout,
                                                                                                                              includePatt,
                                                                                                                              excludePatt,
                                                                                                                              xmlConfig,
                                                                                                                              noWaitParent,
                                                                                                                              taskSpec.parent_tid,
                                                                                                                              self.pid,
                                                                                                                              maxFailure,
                                                                                                                              useRealNumEvents,
                                                                                                                              respectLB,
                                                                                                                              tgtNumEventsPerJob,
                                                                                                                              skipFilesUsedBy,
                                                                                                                              ramCount,
                                                                                                                              taskSpec,
                                                                                                                              skipShortInput)
                                    if retDB == False:
                                        taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                         diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug('escape since task or dataset is locked')
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                               'missingFiles':missingFileList} 
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no mater input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr,None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # index consistency
                    if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                        self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency)
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken','tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid,
                                                                                                       useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #2

Показать файл

    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue,
                      resource_name):
        """Decide whether job generation for a work queue must be throttled.

        The queued/defined/running/waiting job counts returned by
        ``__prepareJobStats`` are compared with the configured limits and
        caps returned by ``__getConfiguration``.

        :param vo: virtual organisation, used in log lines and DB lookups
        :param prodSourceLabel: source label, used in log lines and for the
            total walltime lookup
        :param cloudName: cloud name passed to the task-buffer queries
        :param workQueue: work queue object; ``getID()``, ``queue_name`` and
            ``throttled`` are read
        :param resource_name: resource type name
        :return: ``self.retUnThrottled`` when submission may proceed,
            ``self.retMergeUnThr`` when one of the SKIP conditions is met,
            ``self.retTmpError`` when the highest priority of waiting tasks
            could not be obtained

        Side effects: may call ``setMaxNumJobs``, ``setMinPriority`` and
        ``notEnoughJobsQueued`` on ``self``.
        """
        # params
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50 * 1000 * 1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        # blanks are replaced so that the queue name is a single token in logs
        workQueueName = '_'.join(workQueue.queue_name.split(' '))
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
            vo, prodSourceLabel, cloudName, workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(
            msgHeader, workQueueID))

        def skip(msgBody):
            # report a SKIP decision to the local log and via sendMsg,
            # and hand back the verdict to be returned by the caller
            tmpMsg = "{0} {1}".format(msgHeader, msgBody)
            tmpLog.warning(tmpMsg)
            tmpLog.sendMsg(tmpMsg,
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr

        # get central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                             resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        tmpLog.debug(
            msgHeader +
            ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
            .format(configQueueLimit, configQueueCap, configRunningCap))

        # check if unthrottled
        if not workQueue.throttled:
            msgBody = "PASS unthrottled since GS_throttled is False"
            tmpLog.info(msgHeader + " " + msgBody)
            return self.retUnThrottled

        # get the jobs statistics for our wq/gs and expand the stats map
        jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                              config_map)
        nRunning_rt = jobstats_map['nRunning_rt']
        nRunning_gs = jobstats_map['nRunning_gs']
        nRunning_runningcap = jobstats_map['nRunning_runningcap']
        nNotRun_rt = jobstats_map['nNotRun_rt']
        nNotRun_gs = jobstats_map['nNotRun_gs']
        nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
        nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
        nDefine_rt = jobstats_map['nDefine_rt']
        nDefine_gs = jobstats_map['nDefine_gs']
        nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
        nDefine_queuecap = jobstats_map['nDefine_queuecap']
        nWaiting_rt = jobstats_map['nWaiting_rt']

        # queued = not-running + defined, at each accounting level
        nQueued_rt = nNotRun_rt + nDefine_rt
        nQueued_gs = nNotRun_gs + nDefine_gs
        nQueued_queuelimit = nNotRun_queuelimit + nDefine_queuelimit
        nQueued_queuecap = nNotRun_queuecap + nDefine_queuecap

        # queues in non_rt_wqs are evaluated with the *_gs counters,
        # all the others with the *_rt counters
        isNonRT = workQueue.queue_name in non_rt_wqs

        # check if higher prio tasks are waiting
        if isNonRT:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName)
        else:
            # find highest priority of currently defined jobs
            tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
                'managed', cloudName, workQueue, resource_name)
            # the highest priority of waiting tasks
            highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
                vo, workQueue, 'managed', cloudName, resource_name)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            msgBody = 'failed to get the highest priority of waiting tasks'
            tmpLog.error("{0} {1}".format(msgHeader, msgBody))
            return self.retTmpError

        # high priority tasks are waiting
        highPrioQueued = False
        if highestPrioWaiting > highestPrioInPandaDB \
                or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
            highPrioQueued = True
        tmpLog.debug(
            "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
            .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                    nNotRunHighestPrio, highPrioQueued))
        # set maximum number of jobs to be submitted
        if isNonRT:
            tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
        else:
            tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
        # use the lower limit to avoid creating too many _sub/_dis datasets
        nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                           nJobsInBunchMax)

        if configQueueLimit is not None:
            nQueueLimit = configQueueLimit
        else:
            nQueueLimit = nJobsInBunch * nBunch

        # use nPrestage for reprocessing
        if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
            # reset nJobsInBunch
            if nQueueLimit > nQueued_queuelimit:
                tmpRemainingSlot = nQueueLimit - nQueued_queuelimit
                if tmpRemainingSlot > nJobsInBunch:
                    nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)

        # get cap
        # set number of jobs to be submitted
        # NOTE(review): `/` gives an int only with Python 2 integer division;
        # confirm the expected type of maxNumJobs before running on Python 3
        if configQueueCap is None:
            self.setMaxNumJobs(nJobsInBunch / nParallel)
        else:
            self.setMaxNumJobs(configQueueCap / nParallelCap)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
            vo, prodSourceLabel, workQueue, resource_name, cloudName)
        # an unknown walltime (None) is treated like a large one
        enoughWalltime = totWalltime is None or totWalltime > minTotalWalltime

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        tmpLog.info(
            "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
            format(msgHeader, nQueued_gs, nDefine_gs, nRunning_gs))
        tmpLog.info(
            "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
            .format(msgHeader, nQueued_rt, nDefine_rt, nRunning_rt,
                    totWalltime))

        # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
        # Only the first matching condition is considered. Except for the
        # running cap, a match only limits the priority when high priority
        # tasks are waiting, and otherwise skips the submission.
        limitPriority = False
        if not isNonRT and nRunning_rt == 0 and nQueued_queuelimit > nQueueLimit \
                and enoughWalltime:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                return skip(
                    "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif isNonRT and nRunning_gs == 0 and nQueued_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # pilot is not running or DDM has a problem
                return skip(
                    "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif not isNonRT and nRunning_rt != 0 \
                and float(nQueued_rt) / float(nRunning_rt) > threshold \
                and nQueued_queuelimit > nQueueLimit and enoughWalltime:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                return skip(
                    "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                        nQueued_rt, nRunning_rt, threshold,
                        nQueued_queuelimit, nQueueLimit,
                        totWalltime, minTotalWalltime))

        elif isNonRT and nRunning_gs != 0 \
                and float(nQueued_gs) / float(nRunning_gs) > threshold \
                and nQueued_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # enough jobs in Panda
                return skip(
                    "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                        nQueued_gs, nRunning_gs, threshold,
                        nQueued_queuelimit, nQueueLimit))

        elif nDefine_queuelimit > nQueueLimit:
            limitPriority = True
            if not highPrioQueued:
                # brokerage is stuck
                return skip(
                    "SKIP too many nDefined_queuelimit({0})>{1}".format(
                        nDefine_queuelimit, nQueueLimit))

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                               nJobsInBunch * nWaitingBunchLimit):
            limitPriority = True
            if not highPrioQueued:
                # too many waiting
                return skip(
                    "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                        nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                        nWaitingBunchLimit))

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running: skipped even when high priority tasks are waiting
            return skip(
                "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
                    nRunning_runningcap, configRunningCap))

        elif configQueueCap and nQueued_queuecap > configQueueCap:
            limitPriority = True
            if not highPrioQueued:
                # cap on queued
                return skip(
                    "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                        nQueued_queuecap, configQueueCap))

        # get jobs from prodDB
        limitPriorityValue = None
        if limitPriority:
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            # not enough jobs are queued
            if (nQueued_queuelimit < nQueueLimit * 0.9) \
                    or (isNonRT and nQueued_gs < nRunning_gs) \
                    or (not isNonRT and nQueued_rt < nRunning_rt):
                tmpLog.debug(msgHeader + " not enough jobs queued")
                if not isNonRT:
                    self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))

        msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
            limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled

Пример #3

Показать файл

Файл: TaskCommando.py Проект: tertychnyy/panda-jedi

 def runImpl(self):
     """Execute pending task commands in batches until none are left.

     Up to 10 ``(jediTaskID, commandMap)`` pairs are taken from
     ``self.taskList`` per iteration; the method returns when an empty
     batch is received. ``commandMap`` must provide the keys ``command``,
     ``comment`` and ``oldStatus``.

     * ``kill`` / ``finish`` / ``reassign``: the active jobs of the task are
       killed (or, for a "soft finish", only waited for) and the task is
       updated once no active job is left.
     * ``retry`` / ``incexec``: for ``incexec`` the task parameters are
       first rewritten from the JSON in the comment, then the task is
       retried.

     Any exception raised while processing a batch is logged with its
     traceback and the loop goes on with the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs is None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     # presumably the comment looks like "<cloud|site>:<name>:<y|n>" - verify against the sender
                                     if commentStr is not None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 # the comment may be None, which must not break the substring test
                                 if commentStr is not None and 'soft finish' in commentStr:
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['reassign','finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except KeyError:
                                     # the parameter was not set for this task
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) is not None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             # any failure while rewriting the params skips the retry of this task
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             # keep the worker alive: log the failure and go on with the next batch
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)

Пример #4

Показать файл

    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' %
                                      self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID, dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    # make logger
                    tmpLog = MsgWrapper(
                        self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                    # get task
                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                        jediTaskID, False, True, self.pid, 10)
                    if not tmpStat or taskSpec == None:
                        tmpLog.error(
                            'failed to get taskSpec for jediTaskID={0}'.format(
                                jediTaskID))
                        continue
                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                            jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error(
                            'task param conversion from json failed with {0}:{1}'
                            .format(errtype.__name__, errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap[
                            'nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = None
                    if taskParamMap.has_key('nFilesPerJob'):
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # the number of chunks used by scout
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [
                            None, taskSpec.jediTaskID
                    ]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(
                            taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(
                                    tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(
                                datasetSpec.datasetName,
                                datasetSpec.datasetID))
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(
                                        datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state': 'closed'}
                                # set mutable when the dataset is open and parent is running or task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state': 'mutable'}
                                gotMetadata = True
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error(
                                    '{0} failed to get metadata to {1}:{2}'.
                                    format(self.__class__.__name__,
                                           errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(
                                        datasetSpec, datasetStatus, tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                taskSpec.setErrDiag(
                                    'failed to get metadata for {0}'.format(
                                        datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                    taskParamMap, datasetSpec.datasetName)
                                # get the number of events in metadata
                                if taskParamMap.has_key(
                                        'getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles(
                                    )
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(
                                            tmpDatasetName,
                                            getNumEvents=getNumEvents,
                                            skipDuplicate=skipDuplicate,
                                            longFormat=longFormat)
                                        tmpLog.debug(
                                            'got {0} files in {1}'.format(
                                                len(tmpRet), tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(
                                            tmpDatasetName, tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug(
                                                'found {0} lost files in {1}'.
                                                format(len(tmpLostFiles),
                                                       tmpDatasetName))
                                            for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems(
                                            ):
                                                tmpLog.debug(
                                                    'removed {0}'.format(
                                                        tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords(
                                            ) != None:
                                                nPFN = datasetSpec.getNumRecords(
                                                )
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerJob']
                                                elif taskParamMap.has_key(
                                                        'nEventsPerFile'
                                                ) and taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and 'nFilesPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nFilesPerJob']
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                    datasetSpec.jediTaskID,
                                                    datasetSpec.masterID,
                                                    ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt[
                                                        'nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt[
                                                        'nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(
                                                            skipFilesUsedBy
                                                    ).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID, {
                                                                'datasetName':
                                                                datasetSpec.
                                                                datasetName
                                                            }, ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt[
                                                                'nFiles']:
                                                            nPFN += tmpParentAtt[
                                                                'nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn': iPFN + tmpOffset,
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {
                                                str(uuid.uuid4()): {
                                                    'lfn': 'pseudo_lfn',
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                            }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn':
                                                    '{0:06d}:{1}'.format(
                                                        iPFN,
                                                        taskParamMap['pfnList']
                                                        [iPFN].split('/')[-1]),
                                                    'scope':
                                                    None,
                                                    'filesize':
                                                    0,
                                                    'checksum':
                                                    None,
                                                }
                                except:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    tmpLog.error(
                                        'failed to get files due to {0}:{1} {2}'
                                        .format(self.__class__.__name__,
                                                errtype.__name__, errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag(
                                        'failed to get files for {0}'.format(
                                            datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents(
                                        )
                                    # the number of events per file
                                    nEventsPerFile = None
                                    nEventsPerJob = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key(
                                                'nEventsPerFile'):
                                            nEventsPerFile = taskParamMap[
                                                'nEventsPerFile']
                                        elif datasetSpec.isMaster(
                                        ) and datasetSpec.isPseudo(
                                        ) and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap[
                                                'nEvents']
                                        if taskParamMap.has_key(
                                                'nEventsPerJob'):
                                            nEventsPerJob = taskParamMap[
                                                'nEventsPerJob']
                                        elif taskParamMap.has_key(
                                                'nEventsPerRange'):
                                            nEventsPerRange = taskParamMap[
                                                'nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap[
                                                'tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster(
                                    ) or datasetSpec.toKeepTrack():
                                        # max attempts
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry
                                            maxAttempt = 1
                                        elif taskParamMap.has_key(
                                                'maxAttempt'):
                                            maxAttempt = taskParamMap[
                                                'maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap[
                                                'maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset(
                                        )
                                    # nMaxEvents
                                    nMaxEvents = None
                                    if datasetSpec.isMaster(
                                    ) and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(
                                                origNumFiles)
                                            # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key(
                                                    'nEventsPerFile'):
                                                if taskParamMap.has_key(
                                                        'nEventsPerJob'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerJob']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerJob'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                                elif taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerRange']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerRange'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                    # use scout
                                    useScout = False
                                    if datasetSpec.isMaster(
                                    ) and taskSpec.useScout() and (
                                            datasetSpec.status != 'toupdate'
                                            or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo(
                                    ) and fileList != [] and taskParamMap.has_key(
                                            'useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    #ramCount
                                    ramCount = 0

                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet,
                                        tmpMetadata['state'], stateUpdateTime,
                                        nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber,
                                        nMaxFiles, nMaxEvents, useScout,
                                        fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange,
                                        nChunksForScout, includePatt,
                                        excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid,
                                        maxFailure, useRealNumEvents,
                                        respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
                                    if retDB == False:
                                        taskSpec.setErrDiag(
                                            'failed to insert files for {0}. {1}'
                                            .format(datasetSpec.datasetName,
                                                    diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug(
                                            'escape since task or dataset is locked'
                                        )
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(
                                            len(missingFileList),
                                            datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {
                                            'datasetSpec': datasetSpec,
                                            'missingFiles': missingFileList
                                        }
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap[
                                                    'nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap[
                                                'nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr, None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken', 'tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task.status={0}'.format(
                                taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                taskSpec,
                                pid=self.pid,
                                setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                getTaskStatus=True,
                                pid=self.pid,
                                useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task.status={0}'.format(
                                newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(
                            retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype, errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))

Пример #5

Показать файл

 def runImpl(self):
     """Refine tasks taken from ``self.taskList`` until the list is empty.

     Tasks are fetched in batches of 10. For every task the following steps
     run in order, each one only when all previous steps succeeded:

     1. read and decode the task parameters,
     2. instantiate the refiner implementation for vo/sourceLabel/taskType,
     3. initialize the refiner and extract common parameters,
     4. check the parent task (the task is put on hold while the parent is
        running, unless the task spec says not to wait for the parent),
     5. refine the task,
     6. register the task in one shot (``taskStatus == 'registered'``) or
        append datasets to the existing task otherwise.

     When any step fails the task status is set to 'tobroken' and the error
     diagnostic, if any, is stored through ``updateTask_JEDI``.

     The method returns ``None`` once ``self.taskList.get`` yields an empty
     batch. Exceptions escaping a batch are logged and the loop continues.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # reset per task: without this a failure before the refiner
                 # is instantiated would either hit an unbound name or reuse
                 # the refiner (and task spec) of the previously processed task
                 impl = None
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.error(errStr)
                     tmpStat = Interaction.SC_FAILED
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl is None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if parent_tid not in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running: keep the incoming status and put the task on hold
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except Exception:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         # no wait for parent
                         if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             errStr = 'pending until parent produces input'
                             tmpLog.info(errStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             continue
                         else:
                             # keep the exception details so the diagnostic is actionable
                             errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl is None or impl.taskSpec is None:
                         # no usable spec from the refiner: build a minimal one for this task
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID})
                 else:
                     tmpLog.info('registering')
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         # (dict.get instead of has_key, which is gone in Python 3)
                         if taskParamMap.get('uniqueTaskName') and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams is not None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(
                                 jediTaskID,impl.taskSpec,
                                 impl.inMasterDatasetSpec,
                                 impl.inSecDatasetSpecList,
                                 impl.outDatasetSpecList,
                                 impl.outputTemplateMap,
                                 impl.jobParamsTemplate,
                                 strTaskParams,
                                 impl.unmergeMasterDatasetSpec,
                                 impl.unmergeDatasetSpecMap,
                                 uniqueTaskName)
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = 'tobroken'
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # appending for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except Exception:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #6

Показать файл

 def doForPriorityMassage(self):
     """Cap running user jobs and boost priorities of under-served users.

     The method first takes the 'AtlasAnalWatchDog.doForPriorityMassage'
     process lock and returns immediately when the lock is not granted.
     It then works in two phases on the usage breakdown returned by
     ``getUsageBreakdown_JEDI``:

     * cap: for every (user, working group) pair, throttle the jobs when the
       number of running jobs or cores reaches the configured cap, and
       release them again once both figures fall below 90% of the cap for a
       pair that is currently throttled. One message per affected task is
       sent with ``sendMsg``. Failures in this phase are logged and do not
       stop the boost phase.
     * boost: for every (user, working group) pair whose run/done job count
       is below the global average, pick the sites where the pair is also
       below the site average and has activated jobs, and raise the priority
       of a weighted share of those activated jobs up to ``highestPrio``.

     Returns ``None``. Any exception outside the cap phase is logged together
     with its traceback and swallowed.
     """
     tmpLog = MsgWrapper(logger, ' #ATM #KV doForPriorityMassage label=user')
     tmpLog.debug('start')
     # lock
     got_lock = self.taskBufferIF.lockProcess_JEDI(vo=self.vo, prodSourceLabel=self.prodSourceLabel,
                                                   cloud=None, workqueue_id=None, resource_name=None,
                                                   component='AtlasAnalWatchDog.doForPriorityMassage',
                                                   pid=self.pid, timeLimit=6)
     if not got_lock:
         tmpLog.debug('locked by another process. Skipped')
         return
     try:
         # get usage breakdown
         usageBreakDownPerUser, usageBreakDownPerSite = self.taskBufferIF.getUsageBreakdown_JEDI(self.prodSourceLabel)
         # get total number of users and running/done jobs
         # (totalUsers counts (user, working group) pairs, not distinct user names)
         totalUsers = 0
         totalRunDone = 0
         usersTotalJobs = {}
         usersTotalCores = {}
         for prodUserName in usageBreakDownPerUser:
             wgValMap = usageBreakDownPerUser[prodUserName]
             for workingGroup in wgValMap:
                 siteValMap = wgValMap[workingGroup]
                 totalUsers += 1
                 for computingSite in siteValMap:
                     statValMap = siteValMap[computingSite]
                     totalRunDone += statValMap['rundone']
                     usersTotalJobs.setdefault(prodUserName, {})
                     usersTotalJobs[prodUserName].setdefault(workingGroup, 0)
                     usersTotalJobs[prodUserName][workingGroup] += statValMap['running']
                     usersTotalCores.setdefault(prodUserName, {})
                     usersTotalCores[prodUserName].setdefault(workingGroup, 0)
                     usersTotalCores[prodUserName][workingGroup] += statValMap['runcores']
         tmpLog.debug('total {0} users, {1} RunDone jobs'.format(totalUsers, totalRunDone))
         # skip if no user
         if totalUsers == 0:
             tmpLog.debug('no user. Skipped...')
             return
         # cap num of running jobs
         tmpLog.debug('cap running jobs')
         # pre-set so the error message below is well defined even if the
         # failure happens before the loop binds a user name
         prodUserName = None
         maxNumRunPerUser = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_USER_JOBS')
         maxNumRunPerGroup = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_GROUP_JOBS')
         maxNumCorePerUser = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_USER_CORES')
         maxNumCorePerGroup = self.taskBufferIF.getConfigValue('prio_mgr', 'CAP_RUNNING_GROUP_CORES')
         # fall back to 10000 for every cap that is not configured
         if maxNumRunPerUser is None:
             maxNumRunPerUser = 10000
         if maxNumRunPerGroup is None:
             maxNumRunPerGroup = 10000
         if maxNumCorePerUser is None:
             maxNumCorePerUser = 10000
         if maxNumCorePerGroup is None:
             maxNumCorePerGroup = 10000
         try:
             throttledUsers = self.taskBufferIF.getThrottledUsers()
             for prodUserName in usersTotalJobs:
                 for workingGroup in usersTotalJobs[prodUserName]:
                     tmpNumTotalJobs = usersTotalJobs[prodUserName][workingGroup]
                     tmpNumTotalCores = usersTotalCores[prodUserName][workingGroup]
                     # user caps apply without a working group, group caps otherwise
                     if workingGroup is None:
                         maxNumRun = maxNumRunPerUser
                         maxNumCore = maxNumCorePerUser
                     else:
                         maxNumRun = maxNumRunPerGroup
                         maxNumCore = maxNumCorePerGroup
                     if tmpNumTotalJobs >= maxNumRun or tmpNumTotalCores >= maxNumCore:
                         # throttle user
                         tmpNumJobs = self.taskBufferIF.throttleUserJobs(prodUserName, workingGroup, get_dict=True)
                         if tmpNumJobs is not None:
                             for tmpJediTaskID, tmpNumJob in iteritems(tmpNumJobs):
                                 # one placeholder per argument; a missing user
                                 # placeholder would shift every later value
                                 msg = ('throttled {} jobs in jediTaskID={} for user="{}" group={} '
                                        'since too many running jobs ({} > {}) or cores ({} > {}) ').format(
                                        tmpNumJob, tmpJediTaskID, prodUserName, workingGroup, tmpNumTotalJobs,
                                        maxNumRun, tmpNumTotalCores, maxNumCore)
                                 tmpLog.debug(msg)
                                 tmpLog.sendMsg(msg, 'userCap', msgLevel='warning')
                     elif tmpNumTotalJobs < maxNumRun*0.9 and tmpNumTotalCores < maxNumCore*0.9 and \
                             (prodUserName, workingGroup) in throttledUsers:
                         # unthrottle user
                         tmpNumJobs = self.taskBufferIF.unThrottleUserJobs(prodUserName, workingGroup, get_dict=True)
                         if tmpNumJobs is not None:
                             for tmpJediTaskID, tmpNumJob in iteritems(tmpNumJobs):
                                 msg = ('released {} jobs in jediTaskID={} for user="{}" group={} '
                                        'since number of running jobs and cores are less than {} and {}').format(
                                        tmpNumJob, tmpJediTaskID, prodUserName, workingGroup, maxNumRun, maxNumCore)
                                 tmpLog.debug(msg)
                                 tmpLog.sendMsg(msg, 'userCap')
         except Exception as e:
             errStr = "cap failed for %s : %s" % (prodUserName, str(e))
             # str.strip returns a new string, so the result has to be kept
             errStr = errStr.strip()
             errStr += ' ' + traceback.format_exc()
             tmpLog.error(errStr)
         # to boost
         tmpLog.debug('boost jobs')
         # global average
         globalAverageRunDone = float(totalRunDone)/float(totalUsers)
         tmpLog.debug('global average: {0}'.format(globalAverageRunDone))
         # count the number of users and run/done jobs for each site
         siteRunDone = {}
         siteUsers = {}
         for computingSite in usageBreakDownPerSite:
             userValMap = usageBreakDownPerSite[computingSite]
             for prodUserName in userValMap:
                 wgValMap = userValMap[prodUserName]
                 for workingGroup in wgValMap:
                     statValMap = wgValMap[workingGroup]
                     # count the number of users and running/done jobs
                     siteUsers.setdefault(computingSite, 0)
                     siteUsers[computingSite] += 1
                     siteRunDone.setdefault(computingSite, 0)
                     siteRunDone[computingSite] += statValMap['rundone']
         # get site average
         tmpLog.debug('site average')
         siteAverageRunDone = {}
         for computingSite in siteRunDone:
             nRunDone = siteRunDone[computingSite]
             siteAverageRunDone[computingSite] = float(nRunDone)/float(siteUsers[computingSite])
             tmpLog.debug(" %-25s : %s" % (computingSite,siteAverageRunDone[computingSite]))
         # check if the number of user's jobs is lower than the average
         for prodUserName in usageBreakDownPerUser:
             wgValMap = usageBreakDownPerUser[prodUserName]
             for workingGroup in wgValMap:
                 tmpLog.debug("---> %s group=%s" % (prodUserName, workingGroup))
                 # count the number of running/done jobs
                 userTotalRunDone = 0
                 for computingSite in wgValMap[workingGroup]:
                     statValMap = wgValMap[workingGroup][computingSite]
                     userTotalRunDone += statValMap['rundone']
                 # no priority boost when the number of jobs is higher than the average
                 if userTotalRunDone >= globalAverageRunDone:
                     tmpLog.debug("enough running %s > %s (global average)" % (userTotalRunDone,globalAverageRunDone))
                     continue
                 tmpLog.debug("user total:%s global average:%s" % (userTotalRunDone,globalAverageRunDone))
                 # check with site average
                 toBeBoostedSites = []
                 for computingSite in wgValMap[workingGroup]:
                     statValMap = wgValMap[workingGroup][computingSite]
                     # the number of running/done jobs is lower than the average and activated jobs are waiting
                     if statValMap['rundone'] >= siteAverageRunDone[computingSite]:
                         tmpLog.debug("enough running %s > %s (site average) at %s" % \
                                       (statValMap['rundone'],siteAverageRunDone[computingSite],computingSite))
                     elif statValMap['activated'] == 0:
                         tmpLog.debug("no activated jobs at %s" % computingSite)
                     else:
                         toBeBoostedSites.append(computingSite)
                 # no boost is required
                 if not toBeBoostedSites:
                     tmpLog.debug("no sites to be boosted")
                     continue
                 # check special prioritized site
                 siteAccessForUser = {}
                 varMap = {}
                 varMap[':dn'] = prodUserName
                 sql = "SELECT pandaSite,pOffset,status,workingGroups FROM ATLAS_PANDAMETA.siteAccess WHERE dn=:dn"
                 res = self.taskBufferIF.querySQL(sql, varMap, arraySize=10000)
                 if res is not None:
                     for pandaSite, pOffset, pStatus, workingGroups in res:
                         # ignore special working group for now
                         if workingGroups not in ['', None]:
                             continue
                         # only approved sites
                         if pStatus != 'approved':
                             continue
                         # no priority boost
                         if pOffset == 0:
                             continue
                         # append
                         siteAccessForUser[pandaSite] = pOffset
                 # set weight: every site gets defaultW plus its per-user offset, if any
                 totalW = 0
                 defaultW = 100
                 for computingSite in toBeBoostedSites:
                     totalW += defaultW
                     if computingSite in siteAccessForUser:
                         totalW += siteAccessForUser[computingSite]
                 totalW = float(totalW)
                 # the total number of jobs to be boosted
                 numBoostedJobs = globalAverageRunDone - float(userTotalRunDone)
                 # get quota
                 quotaFactor = 1.0 + self.taskBufferIF.checkQuota(prodUserName)
                 tmpLog.debug("quota factor:%s" % quotaFactor)
                 # make priority boost
                 nJobsPerPrioUnit = 5
                 highestPrio = 1000
                 for computingSite in toBeBoostedSites:
                     weight = float(defaultW)
                     if computingSite in siteAccessForUser:
                         weight += float(siteAccessForUser[computingSite])
                     weight /= totalW
                     # the number of boosted jobs at the site
                     numBoostedJobsSite = int(numBoostedJobs * weight / quotaFactor)
                     tmpLog.debug("nSite:%s nAll:%s W:%s Q:%s at %s" % (numBoostedJobsSite, numBoostedJobs, weight, quotaFactor, computingSite))
                     # floor division keeps the integer semantics on Python 3 as well
                     if numBoostedJobsSite // nJobsPerPrioUnit == 0:
                         tmpLog.debug("too small number of jobs %s to be boosted at %s" % (numBoostedJobsSite, computingSite))
                         continue
                     # get the highest prio of activated jobs at the site
                     varMap = {}
                     varMap[':jobStatus'] = 'activated'
                     varMap[':prodSourceLabel'] = self.prodSourceLabel
                     varMap[':pmerge'] = 'pmerge'
                     varMap[':prodUserName'] = prodUserName
                     varMap[':computingSite'] = computingSite
                     sql  = "SELECT MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 "
                     sql += "WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus AND computingSite=:computingSite "
                     sql += "AND processingType<>:pmerge AND prodUserName=:prodUserName "
                     if workingGroup is not None:
                         varMap[':workingGroup'] = workingGroup
                         sql += "AND workingGroup=:workingGroup "
                     else:
                         sql += "AND workingGroup IS NULL "
                     res = self.taskBufferIF.querySQL(sql, varMap, arraySize=10)
                     maxPrio = None
                     if res is not None:
                         try:
                             maxPrio = res[0][0]
                         except Exception:
                             # empty or malformed result: handled as "no priority" below
                             pass
                     if maxPrio is None:
                         tmpLog.debug("cannot get the highest prio at %s" % computingSite)
                         continue
                     # delta for priority boost
                     prioDelta = highestPrio - maxPrio
                     # already boosted
                     if prioDelta <= 0:
                         tmpLog.debug("already boosted (prio=%s) at %s" % (maxPrio,computingSite))
                         continue
                     # lower limit
                     minPrio = maxPrio - numBoostedJobsSite // nJobsPerPrioUnit
                     # SQL for priority boost
                     varMap = {}
                     varMap[':jobStatus'] = 'activated'
                     varMap[':prodSourceLabel'] = self.prodSourceLabel
                     varMap[':prodUserName'] = prodUserName
                     varMap[':computingSite'] = computingSite
                     varMap[':prioDelta'] = prioDelta
                     varMap[':maxPrio'] = maxPrio
                     varMap[':minPrio'] = minPrio
                     varMap[':rlimit'] = numBoostedJobsSite
                     sql  = "UPDATE ATLAS_PANDA.jobsActive4 SET currentPriority=currentPriority+:prioDelta "
                     sql += "WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName "
                     if workingGroup is not None:
                         varMap[':workingGroup'] = workingGroup
                         sql += "AND workingGroup=:workingGroup "
                     else:
                         sql += "AND workingGroup IS NULL "
                     sql += "AND jobStatus=:jobStatus AND computingSite=:computingSite AND currentPriority>:minPrio "
                     sql += "AND currentPriority<=:maxPrio AND rownum<=:rlimit"
                     tmpLog.debug("boost %s" % str(varMap))
                     res = self.taskBufferIF.querySQL(sql, varMap, arraySize=10)
                     tmpLog.debug("   database return : %s" % res)
         # done
         tmpLog.debug('done')
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))

Пример #7

Показать файл

 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                     'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei(
                         )
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(
                                     tmpDatasetSpec.storageToken
                             ) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP[
                                 'space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(
                                     self.fullRW[tmpNucleus] / 10 / 24 /
                                     3600 * 0.25)
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check {1} TB'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName
                         ) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                         if taskParamMap.get(
                                 'taskBrokerOnMaster'
                         ) is True and not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                         if datasetSpec.isMaster(
                         ) and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if taskSpec.inputPreStaging(
                             ) and availableData[tmpNucleus][
                                     'ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_size_any'],
                                     availableData[tmpNucleus]['tot_size'],
                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_num_any'],
                                     availableData[tmpNucleus]['tot_num'],
                                     thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         totInputSize = list(availableData.values(
                         ))[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[
                                 tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(
                                 nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(
                                 nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]
                                                 ['ava_size_any'])
                                 weight /= float(
                                     availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                     availableData[tmpNucleus]
                                     ['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(
                                     availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus][
                                         'ava_size_any'] > availableData[
                                             tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(
                                         negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)

Пример #8

Показать файл

Файл: TaskCommando.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID, commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill', 'finish', 'reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                 jediTaskID)
                         elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(
                                 jediTaskID, True)
                         if pandaIDs is None:
                             tmpLog.error(
                                 'failed to get PandaIDs for jediTaskID={0}'
                                 .format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr is not None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[
                                                 1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(
                                             tmpItems[0], tmpItems[1])
                                         tmpLog.sendMsg(
                                             tmpMsg, self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate(
                                                 'oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info(
                                         'updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(
                                         jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info(
                                             'wait until datasets are updated to finish'
                                         )
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                                         jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap(
                                     )[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(
                                     tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(
                                     tmpTaskSpec,
                                     {'jediTaskID': jediTaskID},
                                     setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(
                                     str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 if commentStr and 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                         jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(
                                         len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(
                                         queuedPandaIDs, commentStr, '52',
                                         True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg, self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '52',
                                             True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '51',
                                             True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '50',
                                             True)
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                 elif commandStr in ['retry', 'incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                                 jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(
                                 taskParam)
                             # remove some params
                             for newKey in ['nFiles', 'fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except Exception:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(
                                 commentStr)
                             # change params
                             for newKey, newVal in iteritems(newParamMap):
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap[
                                         'jobParameters']:
                                     if tmpParam[
                                             'type'] == 'constant' and re.search(
                                                 '^-a [^ ]+$',
                                                 tmpParam['value']
                                             ) is not None:
                                         tmpParam['value'] = '-a {0}'.format(
                                             taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec'][
                                         'archiveName'] = taskParamMap[
                                             'fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(
                                 taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(
                                 jediTaskID, strTaskParams)
                             if tmpRet is not True:
                                 tmpLog.error(
                                     'failed to update task params')
                                 continue
                         except Exception as e:
                             tmpLog.error(
                                 'failed to change task params with {} {}'.
                                 format(str(e), traceback.format_exc()))
                             continue
                     # retry child tasks
                     if 'sole ' in commentStr:
                         retryChildTasks = False
                     else:
                         retryChildTasks = True
                     # discard events
                     if 'discard ' in commentStr:
                         discardEvents = True
                     else:
                         discardEvents = False
                     # release un-staged files
                     if 'staged ' in commentStr:
                         releaseUnstaged = True
                     else:
                         releaseUnstaged = False
                     tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(
                         jediTaskID,
                         commandStr,
                         retryChildTasks=retryChildTasks,
                         discardEvents=discardEvents,
                         release_unstaged=releaseUnstaged)
                     if tmpRet is True:
                         tmpMsg = 'set task_status={0}'.format(
                             newTaskStatus)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception as e:
             errStr = '{} failed in runImpl() with {} {} '.format(
                 self.__class__.__name__, str(e), traceback.format_exc())
             logger.error(errStr)

Пример #9

Показать файл

Файл: TaskCommando.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     """Execute queued task commands until ``self.taskList`` is drained.

     Batches of up to 10 ``(jediTaskID, commandMap)`` pairs are pulled from
     ``self.taskList``.  Each ``commandMap`` provides the keys ``command``,
     ``comment`` and ``oldStatus``.

     * ``kill`` / ``finish`` / ``reassign``: the active PandaIDs of the task
       are looked up up to two times.  When none are left the task record
       is updated via ``updateTask_JEDI``; otherwise, on the first pass
       only, the jobs are killed through ``killJobs`` with code ``'50'``
       (kill), ``'51'`` (reassign) or ``'52'`` (finish / soft finish).
     * ``retry`` / ``incexec``: for ``incexec`` the stored task parameters
       are patched with the JSON carried in the comment, then
       ``retryTask_JEDI`` is called.
     * anything else is logged as an unknown command.

     Returns ``None`` once ``self.taskList.get`` yields an empty batch.
     Exceptions raised while handling a batch are logged through the
     module-level ``logger`` and the loop moves on to the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs is None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 elif commentStr is not None:
                                     # extract cloud or site
                                     # NOTE(review): assumes the comment has at least three
                                     # ':'-separated fields (target kind, value, y/n) - confirm with the sender
                                     tmpItems = commentStr.split(':')
                                     if tmpItems[0] == 'cloud':
                                         tmpTaskSpec.cloud = tmpItems[1]
                                     elif tmpItems[0] == 'nucleus':
                                         tmpTaskSpec.nucleus = tmpItems[1]
                                     else:
                                         tmpTaskSpec.site = tmpItems[1]
                                     tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     tmpLog.info(tmpMsg)
                                     # back to oldStatus if necessary
                                     if tmpItems[2] == 'y':
                                         tmpTaskSpec.status = oldStatus
                                         tmpTaskSpec.forceUpdate('oldStatus')
                                         updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info('updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info('wait until datasets are updated to finish')
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID},
                                                                            setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 # the comment may be None, so guard before the substring test
                                 if commentStr is not None and 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs,commentStr,'52',True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'51',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 taskParamMap.pop(newKey,None)
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.items():
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) is not None:
                                         # the original line lacked .format() and raised AttributeError here
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except Exception:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # a missing comment means no option keyword was given
                     tmpComment = commentStr if commentStr is not None else ''
                     # retry child tasks
                     retryChildTasks = 'sole ' not in tmpComment
                     # discard events
                     discardEvents = 'discard ' in tmpComment
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents)
                     if tmpRet == True:
                         tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception:
             # keep the worker alive: report the failure and go on with the next batch
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)

Пример #10

Показать файл

Файл: TaskRefiner.py Проект: pavlo-svirin/panda-jedi

 def runImpl(self):
     """Refine queued tasks until ``self.taskList`` is drained.

     Batches of up to 10 ``(jediTaskID, splitRule, taskStatus, parent_tid)``
     tuples are pulled from ``self.taskList``.  For every task the stages
     below run in order, each one only if the previous ones succeeded:

     1. read and decode the task parameters (on failure the task is
        skipped without touching its status),
     2. instantiate a refiner via ``self.implFactory.instantiateImpl``,
     3. extract common parameters and check attribute lengths,
     4. check the parent task when ``parent_tid`` refers to another task,
     5. call the refiner's ``doRefine``,
     6. register the result, or set the task to ``tobroken`` when any
        stage failed.

     Tasks that have to wait (parent still running, unknown input dataset,
     temporary error) are put on hold and picked up again later.

     Returns ``None`` once ``self.taskList.get`` yields an empty batch.
     Exceptions raised while handling a batch are logged through the
     module-level ``logger`` and the loop moves on to the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # reset per task: without this a failure before instantiation would
                 # hit the refiner (and taskSpec) left over from the previous task
                 impl = None
                 # read task parameters
                 taskParam = None
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     # skip the task; its status is left untouched
                     continue
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl is None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initalize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                         # set parent tid
                         if parent_tid not in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if parent_tid not in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             parentState = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             if parentState == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif parentState == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except Exception:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime = False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime = True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime = True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl is None or impl.taskSpec is None:
                         # no refiner state for this task: build a minimal spec just to flag it
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.get('uniqueTaskName') and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams is not None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus)
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                             # appending for incremetnal execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except Exception:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except Exception:
             # keep the worker alive: report the failure and go on with the next batch
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #11

Показать файл

Файл: ContentsFeeder.py Проект: lukewayne123/panda-jedi

 def runImpl(self):
     """Feed input-dataset contents into the contents table, task by task.

     Repeatedly pulls a batch of ``(jediTaskID, dsList)`` pairs from
     ``self.taskDsList``.  For every dataset of every task it fetches the
     metadata and the file list through the DDM interface, inserts the
     files with ``taskBufferIF.insertFilesForDataset_JEDI`` and finally
     updates the task status (broken / on hold / all OK).

     Returns ``None`` as soon as ``self.taskDsList`` yields an empty batch.
     Any exception escaping the processing of one batch is logged through
     the module-level ``logger`` and the loop moves on to the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID,dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 # get task
                 tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                 if not tmpStat or taskSpec is None:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     taskBroken = True
                     # bind an empty map so that the lookups below neither raise
                     # NameError nor reuse the parameters of the previous task;
                     # the task is then marked broken further down
                     taskParamMap = {}
                 # renaming of parameters
                 if 'nEventsPerInputFile' in taskParamMap:
                     taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                 # the number of files per job
                 nFilesPerJob = None
                 if 'nFilesPerJob' in taskParamMap:
                     nFilesPerJob = taskParamMap['nFilesPerJob']
                 # the number of chunks used by scout
                 nChunksForScout = 10
                 # load XML (skipped when the parameters could not be decoded)
                 if not taskBroken and taskSpec.useLoadXML():
                     xmlConfig = taskParamMap['loadXML']
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None,taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == 'running':
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 checkedMaster = False
                 setFrozenTime = True
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                     origNumFiles = None
                     if 'nFiles' in taskParamMap:
                         origNumFiles = taskParamMap['nFiles']
                     for datasetSpec in dsList:
                         tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                         # get dataset metadata
                         tmpLog.info('get metadata')
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {'state':'closed'}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata['state'] == 'open':
                                 # dummy metadata when parent is running
                                 tmpMetadata = {'state':'mutable'}
                             gotMetadata = True
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                         errtype.__name__,errvalue))
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = 'broken'
                                 taskBroken = True
                                 # update dataset status
                                 self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)
                             # get the number of events in metadata
                             if 'getNumEventsInMetadata' in taskParamMap:
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info('get files')
                             try:
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                             datasetSpec.containerName not in ['',None]:
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                      getNumEvents=getNumEvents,
                                                                      skipDuplicate=skipDuplicate
                                                                      )
                                     tmpLog.info('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                         for tmpListGUID,tmpLostLFN in tmpLostFiles.items():
                                             tmpLog.info('removed {0}'.format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                      'scope':None,
                                                                      'filesize':0,
                                                                      'checksum':None,
                                                                      }
                                                   }
                                     else:
                                         # make dummy file list for PFN list
                                         if 'nFiles' in taskParamMap:
                                             nPFN = taskParamMap['nFiles']
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                          'scope':None,
                                                                          'filesize':0,
                                                                          'checksum':None,
                                                                          }
                             except:
                                 errtype,errvalue = sys.exc_info()[:2]
                                 # three placeholders for three arguments so that the
                                 # exception value is actually reported
                                 tmpLog.error('{0} failed to get files due to {1}:{2}'.format(self.__class__.__name__,
                                                                                              errtype.__name__,errvalue))
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = 'broken'
                                     taskBroken = True
                                     # update dataset status
                                     self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile  = None
                                 nEventsPerJob   = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and 'nEventsPerFile' in taskParamMap) or \
                                         (datasetSpec.isPseudo() and 'nEvents' in taskParamMap):
                                     if 'nEventsPerFile' in taskParamMap:
                                         nEventsPerFile = taskParamMap['nEventsPerFile']
                                     elif datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap['nEvents']
                                     if 'nEventsPerJob' in taskParamMap:
                                         nEventsPerJob = taskParamMap['nEventsPerJob']
                                     elif 'nEventsPerRange' in taskParamMap:
                                         nEventsPerRange = taskParamMap['nEventsPerRange']
                                 # max attempts
                                 maxAttempt = None
                                 if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                     # max attempts
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry
                                         maxAttempt = 1
                                     elif 'maxAttempt' in taskParamMap:
                                         maxAttempt = taskParamMap['maxAttempt']
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                 # first event number
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None
                                 if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                     nMaxEvents = taskParamMap['nEvents']
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if 'nFiles' in taskParamMap:
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap['nFiles']
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multiplied by the number of jobs per file for event-level splitting
                                         if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                             if 'nEventsPerJob' in taskParamMap:
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif 'nEventsPerRange' in taskParamMap:
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False
                                 if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                     useScout = True
                                 # use files with new attempt numbers
                                 useFilesWithNewAttemptNr = False
                                 if not datasetSpec.isPseudo() and fileList != [] and 'useInFilesWithNewAttemptNr' in taskParamMap:
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info('update contents')
                                 retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                     datasetSpec,tmpRet,
                                     tmpMetadata['state'],
                                     stateUpdateTime,
                                     nEventsPerFile,
                                     nEventsPerJob,
                                     maxAttempt,
                                     firstEventNumber,
                                     nMaxFiles,
                                     nMaxEvents,
                                     useScout,
                                     fileList,
                                     useFilesWithNewAttemptNr,
                                     nFilesPerJob,
                                     nEventsPerRange,
                                     nChunksForScout,
                                     includePatt,
                                     excludePatt,
                                     xmlConfig,
                                     noWaitParent,
                                     taskSpec.parent_tid,
                                     self.pid)
                                 if retDB == False:
                                     taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                      diagMap['errMsg']))
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB is None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                     tmpLog.info('escape since task or dataset is locked')
                                     break
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                            'missingFiles':missingFileList}
                                 else:
                                     # reduce the number of files to be read
                                     if 'nFiles' in taskParamMap:
                                         if datasetSpec.isMaster():
                                             taskParamMap['nFiles'] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nChunksForScout = diagMap['nChunksForScout']
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         checkedMaster = True
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap['isRunningTask']:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0):
                                     tmpErrStr = 'insufficient inputs are ready'
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                                     setFrozenTime = False
                                     break
                         tmpLog.info('end loop')
                 # no master input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                     tmpErrStr = 'no master input files. input dataset is empty'
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr,None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = 'tobroken'
                     tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo,taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                    self.ddmIF.getInterface(taskSpec.vo))
                             if not tmpStat:
                                 tmpErrStr = 'failed to initialize TaskGenerator'
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = 'tobroken'
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info('make parent tasks with {0} (if necessary)'.format(taskGenerator.getClassName(taskSpec.vo,
                                                                                                                           taskSpec.prodSourceLabel)))
                                 tmpStat = taskGenerator.doGenerate(taskSpec,taskParamMap,missingFilesMap=missingMap)
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = 'tobroken'
                                     tmpLog.error('failed to make parent tasks')
                         # go to pending state
                         if taskSpec.status not in ['broken','tobroken']:
                             taskSpec.setOnHold()
                         tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                     elif allUpdated:
                         # all OK
                         allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,
                                                                                                    pid=self.pid)
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                 tmpLog.info('done')
         except:
             # keep the worker alive: report the failure and fetch the next batch
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #12

Показать файл

Файл: AtlasProdTaskBroker.py Проект: RRCKI/panda-jedi

 def runImpl(self):
     # cutoff for disk in TB
     diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                      'jedi', 'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024*1024*1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more datasets
             if len(taskInputList) == 0:
                 self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                             self.numTasks))
                 return
             # loop over all tasks
             for taskSpec,inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize,
                                                                                                                 thrInputNum,
                                                                                                                 thrInputSizeFrac,
                                                                                                                 thrInputNumFrac))
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in nucleusList:
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleusSpec.state in ['ACTIVE']:
                             tmpLog.info('  skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                         tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info('skip transfer backlog check due to negative T1Weight')
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                         for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems():
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info('  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.
                                              format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                   ['output','log'])
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                             if tmpEP == None:
                                 tmpLog.info('  skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                     tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if not tmpEP['state'] in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """    
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25)
                             if tmpSpaceSize-tmpSpaceToUse < diskThreshold:
                                 tmpLog.info('  skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus,
                                                                                                                                                                  tmpSpaceSize,
                                                                                                                                                                  tmpSpaceToUse,
                                                                                                                                                                  diskThreshold,
                                                                                                                                                                  tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if not tmpNucleus in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {'total':0,'free':0}
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total'])
                             except:
                                 tmpNew = None
                             if tmpNew != None and (tmpOld == None or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'],
                                                                  'free':tmpSpaceSize-tmpSpaceToUse}
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                     tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                          tmpSiteList,tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info('  skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ###################################### 
                     # data locality
                     toSkip = False
                     availableData = {}
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                             continue
                         # use deep scan for primary dataset
                         if datasetSpec.isMaster():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           nucleusList.keys(),
                                                                           deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus,tmpVals in tmpRet.iteritems():
                             if not tmpNucleus in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                             if len(nucleusList) == 1:
                                 tmpLog.info('  disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus))
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                     availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                     availableData[tmpNucleus]['tot_size'],
                                                                                                                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                       availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                       availableData[tmpNucleus]['tot_num'],
                                                                                                                                       thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         else:
                             tmpLog.info('  disable data locality check since no nucleus has input data')
                         tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ###################################### 
                     # weight
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                         if not tmpNucleus in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                 weight /= float(availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                             # fraction of free space
                             if tmpNucleus in fractionFreeSpace:
                                 try:
                                     tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                         float(fractionFreeSpace[tmpNucleus]['total'])
                                     weight *= tmpFrac
                                     wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'],
                                                                                          fractionFreeSpace[tmpNucleus]['total'])
                                 except:
                                     pass
                         tmpLog.info('  use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus,weight))
                     tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                     ###################################### 
                     # final selection
                     tgtWeight = random.uniform(0,totalWeight)
                     candidateNucleus = None
                     for tmpNucleus,weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus == None:
                         candidateNucleus = nucleusweights[-1][0]
                 ###################################### 
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                            ['output','log'])
                 # get destinations
                 retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info('  set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task.status=ready'
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                 # update RW table
                 self.prioRW.acquire()
                 for prio,rwMap in self.prioRW.iteritems():
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errMsg  = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)

Пример #13

Показать файл

 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug("%s terminating since no more items" % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID, dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID))
                 # get task
                 tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error(
                         "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue)
                     )
                     taskBroken = True
                 # renaming of parameters
                 if taskParamMap.has_key("nEventsPerInputFile"):
                     taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"]
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key("nFilesPerJob"):
                     nFilesPerJob = taskParamMap["nFilesPerJob"]
                 # the number of files used by scout
                 nFilesForScout = 0
                 if nFilesPerJob != None:
                     nFilesForScout = 10 * nFilesPerJob
                 else:
                     nFilesForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     try:
                         loadXML = taskParamMap["loadXML"]
                         xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML)
                     except:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue))
                         taskBroken = True
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == "running":
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                     origNumFiles = None
                     if taskParamMap.has_key("nFiles"):
                         origNumFiles = taskParamMap["nFiles"]
                     for datasetSpec in dsList:
                         tmpLog.info(
                             "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID)
                         )
                         # get dataset metadata
                         tmpLog.info("get metadata")
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {"state": "closed"}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata["state"] == "open":
                                 # dummy metadata when parent is running
                                 tmpMetadata = {"state": "mutable"}
                             gotMetadata = True
                         except:
                             errtype, errvalue = sys.exc_info()[:2]
                             tmpLog.error(
                                 "{0} failed to get metadata to {1}:{2}".format(
                                     self.__class__.__name__, errtype.__name__, errvalue
                                 )
                             )
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = "broken"
                                 taskBroken = True
                                 # update dataset status
                                 self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                 taskParamMap, datasetSpec.datasetName
                             )
                             # get the number of events in metadata
                             if taskParamMap.has_key("getNumEventsInMetadata"):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info("get files")
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if (
                                         fileList != []
                                         and taskParamMap.has_key("useInFilesInContainer")
                                         and not datasetSpec.containerName in ["", None]
                                     ):
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(
                                         tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate
                                     )
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info(
                                             "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName)
                                         )
                                         for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info("removed {0}".format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {
                                             str(uuid.uuid4()): {
                                                 "lfn": "pseudo_lfn",
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                                         }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key("nFiles"):
                                             nPFN = taskParamMap["nFiles"]
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {
                                                 "lfn": "{0:06d}:{1}".format(
                                                     iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1]
                                                 ),
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                             except:
                                 errtype, errvalue = sys.exc_info()[:2]
                                 tmpLog.error(
                                     "failed to get files due to {0}:{1}".format(
                                         self.__class__.__name__, errtype.__name__, errvalue
                                     )
                                 )
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = "broken"
                                     taskBroken = True
                                     # update dataset status
                                     self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile = None
                                 nEventsPerJob = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key("nEventsPerFile")) or (
                                     datasetSpec.isPseudo() and taskParamMap.has_key("nEvents")
                                 ):
                                     if taskParamMap.has_key("nEventsPerFile"):
                                         nEventsPerFile = taskParamMap["nEventsPerFile"]
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key("nEvents"):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap["nEvents"]
                                     if taskParamMap.has_key("nEventsPerJob"):
                                         nEventsPerJob = taskParamMap["nEventsPerJob"]
                                     elif taskParamMap.has_key("nEventsPerRange"):
                                         nEventsPerRange = taskParamMap["nEventsPerRange"]
                                 # max attempts and first event number
                                 maxAttempt = None
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # max attempts
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry
                                         maxAttempt = 1
                                     elif taskParamMap.has_key("maxAttempt"):
                                         maxAttempt = taskParamMap["maxAttempt"]
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None
                                 if datasetSpec.isMaster() and taskParamMap.has_key("nEvents"):
                                     nMaxEvents = taskParamMap["nEvents"]
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if taskParamMap.has_key("nFiles"):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap["nFiles"]
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multipled by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key("nEventsPerFile"):
                                             if taskParamMap.has_key("nEventsPerJob"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerJob"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerJob"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key("nEventsPerRange"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerRange"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False
                                 if datasetSpec.isMaster() and taskSpec.useScout():
                                     useScout = True
                                 # use files with new attempt numbers
                                 useFilesWithNewAttemptNr = False
                                 if (
                                     not datasetSpec.isPseudo()
                                     and fileList != []
                                     and taskParamMap.has_key("useInFilesWithNewAttemptNr")
                                 ):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info("update contents")
                                 retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                     datasetSpec,
                                     tmpRet,
                                     tmpMetadata["state"],
                                     stateUpdateTime,
                                     nEventsPerFile,
                                     nEventsPerJob,
                                     maxAttempt,
                                     firstEventNumber,
                                     nMaxFiles,
                                     nMaxEvents,
                                     useScout,
                                     fileList,
                                     useFilesWithNewAttemptNr,
                                     nFilesPerJob,
                                     nEventsPerRange,
                                     nFilesForScout,
                                     includePatt,
                                     excludePatt,
                                     xmlConfig,
                                     noWaitParent,
                                     taskSpec.parent_tid,
                                 )
                                 if retDB == False:
                                     taskSpec.setErrDiag(
                                         "failed to insert files for {0}. {1}".format(
                                             datasetSpec.datasetName, diagMap["errMsg"]
                                         )
                                     )
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = "{0} files missing in {1}".format(
                                         len(missingFileList), datasetSpec.datasetName
                                     )
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {
                                         "datasetSpec": datasetSpec,
                                         "missingFiles": missingFileList,
                                     }
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key("nFiles"):
                                         if datasetSpec.isMaster():
                                             taskParamMap["nFiles"] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nFilesForScout = diagMap["nFilesForScout"]
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap["isRunningTask"]:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap["nActivatedPending"] == 0:
                                     tmpErrStr = "insufficient inputs are ready"
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                         tmpLog.info("end loop")
                 # no mater input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0:
                     tmpErrStr = "no master input files. input dataset is empty"
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr, None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = "tobroken"
                     tmpMsg = "set task.status={0}".format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(
                                 self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo)
                             )
                             if not tmpStat:
                                 tmpErrStr = "failed to initialize TaskGenerator"
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = "tobroken"
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info(
                                     "make parent tasks with {0} (if necessary)".format(
                                         taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)
                                     )
                                 )
                                 tmpStat = taskGenerator.doGenerate(
                                     taskSpec, taskParamMap, missingFilesMap=missingMap
                                 )
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = "tobroken"
                                     tmpLog.error("failed to make parent tasks")
                         # go to pending state
                         if not taskSpec.status in ["broken", "tobroken"]:
                             taskSpec.setOnHold()
                         tmpMsg = "set task.status={0}".format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                     elif allUpdated:
                         # all OK
                         allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                             jediTaskID, getTaskStatus=True
                         )
                         tmpMsg = "set task.status={0}".format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                 tmpLog.info("done")
         except:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error(
                 "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue)
             )

Пример #14

Показать файл

Файл: TaskRefiner.py Проект: PanDAWMS/panda-jedi

 def runImpl(self):
     """Refine tasks pulled in batches from ``self.taskList`` until it is empty.

     For each ``(jediTaskID, splitRule, taskStatus, parent_tid)`` entry the
     task parameters are decoded, a refiner implementation is obtained from
     ``self.implFactory``, common parameters are extracted, the parent task
     is checked, the task is refined and finally registered (or updated)
     through ``self.taskBufferIF``.

     * A task whose parameters cannot be read/decoded is logged and skipped.
     * A task that has to wait (staging, running parent, missing input or a
       temporary error) is updated with a diagnostic message and skipped.
     * A task that fails any other step is set to ``tobroken``.

     Returns None once ``self.taskList.get()`` yields no more entries.
     Exceptions escaping a batch are logged and the loop carries on.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # reset per task so that a failure before instantiation can never
                 # pick up the refiner (and taskSpec) left over from the previous task
                 impl = None
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     # skip the task without touching its status
                     continue
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl is None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                         # set parent tid
                         if parent_tid not in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # staging
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if 'toStaging' in taskParamMap and taskStatus != 'staged':
                         errStr = 'wait until staging is done'
                         impl.taskSpec.status = 'staging'
                         impl.taskSpec.oldStatus = taskStatus
                         impl.taskSpec.setErrDiag(errStr)
                         # not to update some task attributes
                         impl.taskSpec.resetRefinedAttrs()
                         tmpLog.info(errStr)
                         self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID},
                                                           oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                         continue
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if parent_tid not in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     # not to update some task attributes
                                     impl.taskSpec.resetRefinedAttrs()
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             # not to update some task attributes
                             impl.taskSpec.resetRefinedAttrs()
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     # fall back to a bare spec when no refiner (or no spec) exists for this task
                     if impl is None or impl.taskSpec is None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams is not None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus in ['registered', 'staged']:
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus)
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # disallow to reset some attributes
                             for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                 impl.taskSpec.resetChangedAttr(attName)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                             # appending for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             # keep the worker alive: report the failure and move on to the next batch
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #15

Показать файл

Файл: AtlasProdJobThrottler.py Проект: lukewayne123/panda-jedi

 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
     """Decide whether job submission for a cloud / work queue must be throttled.

     Parameters:
         vo, prodSourceLabel: used for the log header; ``vo`` is also passed
             to ``checkWaitingTaskPrio_JEDI``.
         cloudName: name of the cloud, looked up through ``self.siteMapper``.
         workQueue: object providing ``getIDs()``, ``queue_name`` and
             ``queue_share``.
         jobStat: nested mapping ``cloud -> workQueueID -> {status: count}``.

     Returns one of ``self.retThrottled`` (cloud undefined/offline/test-only),
     ``self.retUnThrottled`` (share is None or limits not reached),
     ``self.retTmpError`` (priority of waiting tasks unavailable) or
     ``self.retMergeUnThr`` (queue limits exceeded).

     Side effect: calls ``self.setMaxNumJobs()`` with the bunch size divided
     by the number of parallel submitters before the limit checks.
     """
     # params
     nBunch = 4
     threshold = 2.0
     nJobsInBunchMax = 500
     nJobsInBunchMin = 300
     nJobsInBunchMaxES = 1000
     nWaitingLimit = 4
     nWaitingBunchLimit = 2
     nParallel = 8
     # make logger
     tmpLog = MsgWrapper(logger)
     workQueueIDs = workQueue.getIDs()
     msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
     tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))

     def reportSkip(msgBody):
         # log the reason for skipping and publish it as a warning message
         tmpLog.debug(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')

     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         reportSkip("SKIP cloud={0} undefined".format(cloudName))
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         reportSkip("SKIP cloud.status={0}".format(cloudSpec['status']))
         return self.retThrottled
     if cloudSpec['status'] in ['test']:
         if workQueue.queue_name != 'test':
             reportSkip("SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                                workQueue.queue_name))
             return self.retThrottled
     # check if unthrottled
     if workQueue.queue_share is None:
         msgBody = "PASS unthrottled since share=None"
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retUnThrottled
     # count number of jobs in each status
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     for workQueueID in workQueueIDs:
         if cloudName in jobStat and workQueueID in jobStat[cloudName]:
             tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
             for pState,pNumber in jobStat[cloudName][workQueueID].items():
                 if pState in ['running']:
                     nRunning += pNumber
                 elif pState in ['assigned','activated','starting']:
                     nNotRun  += pNumber
                 elif pState in ['defined']:
                     nDefine  += pNumber
                 elif pState in ['waiting']:
                     nWaiting += pNumber
     # check if higher prio tasks are waiting
     # NOTE(review): tmpStat is not inspected; presumably highestPrioJobStat is
     # always a dict with these keys - confirm against the task buffer interface
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
     if highestPrioWaiting is None:
         msgBody = 'failed to get the highest priority of waiting tasks'
         tmpLog.error(msgHeader+" "+msgBody)
         return self.retTmpError
     # high priority tasks are waiting
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                      nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                              highestPrioInPandaDB,
                                                                                                              nNotRunHighestPrio,
                                                                                                              highPrioQueued))
     # set maximum number of jobs to be submitted
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     elif workQueue.queue_name in ['evgensimul']:
         # use higher limit for evgensimul
         nJobsInBunch = min(tmpRemainingSlot,nJobsInBunchMaxES)
     else:
         nJobsInBunch = min(tmpRemainingSlot,nJobsInBunchMax)
     nQueueLimit = nJobsInBunch*nBunch
     # use special limit for CERN
     if cloudName == 'CERN':
         nQueueLimit = 2000
     # use nPrestage for reprocessing
     if workQueue.queue_name in ['reprocessing']:
         if 'nprestage' in cloudSpec and cloudSpec['nprestage'] > 0:
             nQueueLimit = cloudSpec['nprestage']
             # reset nJobsInBunch
             if nQueueLimit > (nNotRun+nDefine):
                 tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
                 # never shrink the bunch; grow it up to the regular maximum
                 if tmpRemainingSlot >= nJobsInBunch:
                     nJobsInBunch = min(tmpRemainingSlot,nJobsInBunchMax)
     # set number of jobs to be submitted
     # floor division keeps the job count integral regardless of the Python version
     self.setMaxNumJobs(nJobsInBunch//nParallel)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     limitPriority = False
     tmpLog.debug(msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3}".format(nQueueLimit,
                                                                                           nNotRun+nDefine,
                                                                                           nDefine,
                                                                                           nRunning))
     # check when high prio tasks are not waiting
     if not highPrioQueued:
         if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit:
             limitPriority = True
             # pilot is not running or DDM has a problem
             reportSkip("SKIP no running and enough nQueued({0})>{1}".format(nNotRun+nDefine,nQueueLimit))
             return self.retMergeUnThr
         elif nRunning != 0 and float(nNotRun)/float(nRunning) > threshold and (nNotRun+nDefine) > nQueueLimit:
             limitPriority = True
             # enough jobs in Panda
             reportSkip("SKIP nQueued({0})/nRunning({1})>{2} & nQueued+Defined({3})>{4}".format(nNotRun,nRunning,
                                                                                               threshold,nNotRun+nDefine,
                                                                                               nQueueLimit))
             return self.retMergeUnThr
         elif nDefine > nQueueLimit:
             limitPriority = True
             # brokerage is stuck
             reportSkip("SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit))
             return self.retMergeUnThr
         elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
             limitPriority = True
             # too many waiting
             reportSkip("SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                           nJobsInBunch,nWaitingBunchLimit))
             return self.retMergeUnThr
     # get jobs from prodDB
     # NOTE(review): every branch that sets limitPriority returns immediately, so
     # limitPriority is always False here and setMinPriority() is never reached
     limitPriorityValue = None
     if limitPriority:
         limitPriorityValue = highestPrioInPandaDB
         self.setMinPriority(limitPriorityValue)
     msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
     tmpLog.debug(msgHeader+" "+msgBody)
     return self.retUnThrottled

Пример #16

Показать файл

 def toBeThrottled(self,vo,prodSourceLabel,cloudName,workQueue,jobStat):
     """Decide whether job generation for one work queue in one cloud must be throttled.

     The cloud status is checked first. Then the per-status job counts found
     in jobStat are compared with limits derived from the number of running
     jobs and from central configuration values.

     Args:
         vo: virtual organization; used in the log header and passed to the
             task buffer queries.
         prodSourceLabel: used in the log header and for the walltime query.
         cloudName: name of the cloud to check.
         workQueue: work queue object; criteria, queue_name, queue_share and
             getIDs() are read from it.
         jobStat: nested mapping cloudName -> workQueueID -> {job status: count}.

     Returns:
         self.retThrottled if the cloud is undefined, offline, or in 'test'
         status while the queue is not the 'test' queue;
         self.retTmpError if the highest priority of waiting tasks could not
         be obtained;
         self.retMergeUnThr if one of the limits is exceeded while
         highPrioQueued is False;
         self.retUnThrottled otherwise.

     Side effects:
         Always calls self.setMaxNumJobs() once the counts are known, and
         before returning retUnThrottled either self.setMinPriority() or
         (when too few jobs are queued) self.notEnoughJobsQueued() plus a
         second self.setMaxNumJobs().
     """
     # component name, used as the key for central configuration lookups
     compName = 'prod_job_throttler'
     # params
     nBunch = 4
     threshold = 2.0
     nJobsInBunchMax = 600
     nJobsInBunchMin = 500
     nJobsInBunchMaxES = 1000
     # queues with a 'site' criterion use a lower walltime floor
     if workQueue.criteria is not None and 'site' in workQueue.criteria:
         minTotalWalltime = 10*1000*1000
     else:
         minTotalWalltime = 50*1000*1000
     nWaitingLimit = 4
     nWaitingBunchLimit = 2
     nParallel = 2
     # make logger
     tmpLog = MsgWrapper(logger)
     workQueueIDs = workQueue.getIDs()
     msgHeader = '{0}:{1} cloud={2} queue={3}:'.format(vo,prodSourceLabel,cloudName,workQueue.queue_name)
     tmpLog.debug(msgHeader+' start workQueueID={0}'.format(str(workQueueIDs)))
     # change threshold
     if workQueue.queue_name in ['mcore']:
         threshold = 5.0
     # check cloud status
     if not self.siteMapper.checkCloud(cloudName):
         msgBody = "SKIP cloud={0} undefined".format(cloudName)
         tmpLog.warning(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     cloudSpec = self.siteMapper.getCloud(cloudName)
     if cloudSpec['status'] in ['offline']:
         msgBody = "SKIP cloud.status={0}".format(cloudSpec['status'])
         tmpLog.warning(msgHeader+" "+msgBody)
         tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
         return self.retThrottled
     if cloudSpec['status'] in ['test']:
         if workQueue.queue_name != 'test':
             msgBody = "SKIP cloud.status={0} for non test queue ({1})".format(cloudSpec['status'],
                                                                               workQueue.queue_name)
             # log locally first, then send, like every other branch
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning')
             return self.retThrottled
     # check if unthrottled
     if workQueue.queue_share is None:
         msgBody = "PASS unthrottled since share=None"
         tmpLog.debug(msgHeader+" "+msgBody)
         return self.retUnThrottled
     # count number of jobs in each status, summed over all IDs of the queue
     nRunning = 0
     nNotRun  = 0
     nDefine  = 0
     nWaiting = 0
     for workQueueID in workQueueIDs:
         if cloudName in jobStat and workQueueID in jobStat[cloudName]:
             tmpLog.debug(msgHeader+" "+str(jobStat[cloudName][workQueueID]))
             for pState,pNumber in jobStat[cloudName][workQueueID].items():
                 if pState in ['running']:
                     nRunning += pNumber
                 elif pState in ['assigned','activated','starting']:
                     nNotRun  += pNumber
                 elif pState in ['defined']:
                     nDefine  += pNumber
                 elif pState in ['waiting']:
                     nWaiting += pNumber
     # check if higher prio tasks are waiting
     # NOTE(review): tmpStat is never checked; presumably highestPrioJobStat is
     # always a mapping -- confirm against getHighestPrioJobStat_JEDI
     tmpStat,highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed',cloudName,workQueue)
     highestPrioInPandaDB = highestPrioJobStat['highestPrio']
     nNotRunHighestPrio   = highestPrioJobStat['nNotRun']
     # the highest priority of waiting tasks
     highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo,workQueue,
                                                                      'managed',cloudName)
     if highestPrioWaiting is None:
         msgBody = 'failed to get the highest priority of waiting tasks'
         tmpLog.error(msgHeader+" "+msgBody)
         return self.retTmpError
     # high priority tasks are waiting: either strictly higher than what is in
     # the DB, or equal while fewer than nJobsInBunchMin such jobs are not run
     highPrioQueued = False
     if highestPrioWaiting > highestPrioInPandaDB or (highestPrioWaiting == highestPrioInPandaDB and \
                                                      nNotRunHighestPrio < nJobsInBunchMin):
         highPrioQueued = True
     tmpLog.debug(msgHeader+" highestPrio waiting:{0} inPanda:{1} numNotRun:{2} -> highPrioQueued={3}".format(highestPrioWaiting,
                                                                                                              highestPrioInPandaDB,
                                                                                                              nNotRunHighestPrio,
                                                                                                              highPrioQueued))
     # set maximum number of jobs to be submitted
     tmpRemainingSlot = int(nRunning*threshold-nNotRun)
     if tmpRemainingSlot < nJobsInBunchMin:
         # use the lower limit to avoid creating too many _sub/_dis datasets
         nJobsInBunch = nJobsInBunchMin
     else:
         if workQueue.queue_name in ['evgensimul']:
             # use higher limit for evgensimul
             if tmpRemainingSlot < nJobsInBunchMaxES:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMaxES
         else:
             if tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     nQueueLimit = nJobsInBunch*nBunch
     # a per-queue value from central configuration overrides the computed limit
     tmpVal = self.taskBufferIF.getConfigValue(compName, 'NQUEUELIMIT_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     if tmpVal is not None:
         nQueueLimit = tmpVal
     # reprocessing queues: let the bunch grow into the room left below
     # nQueueLimit, never shrinking it and never exceeding nJobsInBunchMax
     if workQueue.queue_name in ['reprocessing','mcore_repro']:
         # reset nJobsInBunch
         if nQueueLimit > (nNotRun+nDefine):
             tmpRemainingSlot = nQueueLimit - (nNotRun+nDefine)
             if tmpRemainingSlot < nJobsInBunch:
                 pass
             elif tmpRemainingSlot < nJobsInBunchMax:
                 nJobsInBunch = tmpRemainingSlot
             else:
                 nJobsInBunch = nJobsInBunchMax
     # get cap
     nRunningCap = self.taskBufferIF.getConfigValue(compName, 'NRUNNINGCAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     nQueueCap = self.taskBufferIF.getConfigValue(compName, 'NQUEUECAP_{0}'.format(workQueue.queue_name), 'jedi', 'atlas')
     # set number of jobs to be submitted
     # NOTE(review): '/' on two ints floors only under Python 2 -- revisit when porting
     self.setMaxNumJobs(nJobsInBunch/nParallel)
     # get total walltime
     totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo,prodSourceLabel,workQueue,cloudName)
     # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
     limitPriority = False
     tmpStr = msgHeader+" nQueueLimit:{0} nQueued:{1} nDefine:{2} nRunning:{3} totWalltime:{4} nRunCap:{5} nQueueCap:{6}"
     tmpLog.debug(tmpStr.format(nQueueLimit,
                                nNotRun+nDefine,
                                nDefine,
                                nRunning,
                                totWalltime,
                                nRunningCap,
                                nQueueCap))
     # check: the first matching condition sets limitPriority; the queue is
     # skipped only when no high priority task is waiting
     if nRunning == 0 and (nNotRun+nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
         limitPriority = True
         if not highPrioQueued:
             # pilot is not running or DDM has a problem
             msgBody = "SKIP no running and enough nQueued({0})>{1} totWalltime({2})>{3} ".format(nNotRun+nDefine,nQueueLimit,
                                                                                                  totWalltime,minTotalWalltime)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nRunning != 0 and float(nNotRun+nDefine)/float(nRunning) > threshold and \
             (nNotRun+nDefine) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
         limitPriority = True
         if not highPrioQueued:
             # enough jobs in Panda
             msgBody = "SKIP nQueued({0})/nRunning({1})>{2} & nQueued({3})>{4} totWalltime({5})>{6}".format(nNotRun+nDefine,nRunning,
                                                                                                            threshold,nNotRun+nDefine,
                                                                                                            nQueueLimit,
                                                                                                            totWalltime,minTotalWalltime)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nDefine > nQueueLimit:
         limitPriority = True
         if not highPrioQueued:
             # brokerage is stuck
             msgBody = "SKIP too many nDefined({0})>{1}".format(nDefine,nQueueLimit)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nWaiting > nRunning*nWaitingLimit and nWaiting > nJobsInBunch*nWaitingBunchLimit:
         limitPriority = True
         if not highPrioQueued:
             # too many waiting
             msgBody = "SKIP too many nWaiting({0})>max(nRunning({1})x{2},{3}x{4})".format(nWaiting,nRunning,nWaitingLimit,
                                                                                           nJobsInBunch,nWaitingBunchLimit)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nRunningCap is not None and nRunning > nRunningCap:
         limitPriority = True
         if not highPrioQueued:
             # cap on running
             msgBody = "SKIP nRunning({0})>nRunningCap({1})".format(nRunning,nRunningCap)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     elif nQueueCap is not None and nNotRun+nDefine > nQueueCap:
         limitPriority = True
         if not highPrioQueued:
             # cap on queued
             msgBody = "SKIP nQueue({0})>nQueueCap({1})".format(nNotRun+nDefine,nQueueCap)
             tmpLog.warning(msgHeader+" "+msgBody)
             tmpLog.sendMsg(msgHeader+' '+msgBody,self.msgType,msgLevel='warning',escapeChar=True)
             return self.retMergeUnThr
     # get jobs from prodDB
     limitPriorityValue = None
     if limitPriority:
         # a limit was hit but high priority tasks are waiting: let through
         # only tasks at or above the highest waiting priority
         limitPriorityValue = highestPrioWaiting
         self.setMinPriority(limitPriorityValue)
     else:
         # not enough jobs are queued
         if nNotRun+nDefine < max(nQueueLimit,nRunning) or (totWalltime is not None and totWalltime < minTotalWalltime):
             tmpLog.debug(msgHeader+" not enough jobs queued")
             self.notEnoughJobsQueued()
             self.setMaxNumJobs(max(self.maxNumJobs,nQueueLimit/20))
     msgBody = "PASS - priority limit={0}".format(limitPriorityValue)
     tmpLog.debug(msgHeader+" "+msgBody)
     return self.retUnThrottled

Пример #17

Показать файл

Файл: TaskCommando.py Проект: lukewayne123/panda-jedi

 def runImpl(self):
     """Process task commands taken from self.taskList until the list is empty.

     Batches of up to 10 (jediTaskID, commandMap) pairs are fetched in a
     loop; the method returns when an empty batch comes back. commandMap
     must provide 'command', 'comment' and 'oldStatus'.

     Commands:
         kill / finish / reassign: fetch the PandaIDs of the task. With none
             left the task record is updated (for reassign the comment is
             parsed as '<cloud|site>:<name>:<y|n>'); otherwise a kill is
             sent for those PandaIDs.
         retry / incexec: for incexec the stored task parameters are first
             rewritten from the JSON in the comment; then the task is retried.
         anything else: logged as an unknown command.

     Any exception raised while handling a batch is logged and the loop
     goes on with the next batch.
     """
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     # get active PandaIDs to be killed
                     pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                     if pandaIDs is None:
                         tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                         tmpStat = Interaction.SC_FAILED
                     # kill jobs or update task
                     if tmpStat == Interaction.SC_SUCCEEDED:
                         if pandaIDs == []:
                             # done since no active jobs
                             tmpLog.info('completed the command')
                             tmpTaskSpec = JediTaskSpec()
                             tmpTaskSpec.jediTaskID = jediTaskID
                             updateTaskStatus = True
                             if commandStr != 'reassign':
                                 # keep oldStatus for task reassignment since it is reset when actually reassigned
                                 tmpTaskSpec.forceUpdate('oldStatus')
                             else:
                                 # extract cloud or site from '<cloud|site>:<name>:<y|n>'
                                 tmpItems = commentStr.split(':')
                                 if tmpItems[0] == 'cloud':
                                     tmpTaskSpec.cloud = tmpItems[1]
                                 else:
                                     tmpTaskSpec.site = tmpItems[1]
                                 # back to oldStatus if necessary
                                 if tmpItems[2] == 'y':
                                     tmpTaskSpec.status = oldStatus
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                     updateTaskStatus = False
                             if updateTaskStatus:
                                 tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                             tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                         else:
                             tmpLog.info('sending kill command')
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                         tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params; they only come back if the
                             # new parameters below set them again
                             for newKey in ['nFiles','fixedSandbox']:
                                 taskParamMap.pop(newKey,None)
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.items():
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild: replace the '-a <archive>' constant parameter
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) is not None:
                                         # str.format() call; the original accessed an
                                         # attribute on the str and raised AttributeError
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             # broad on purpose, presumably so that one malformed
                             # task does not abort the rest of the batch
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             # report and keep looping over the remaining batches
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))

Пример #18

Показать файл

Файл: AtlasProdJobThrottler.py Проект: PanDAWMS/panda-jedi

    def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
        """Tell whether job generation for a work queue / resource type must be throttled.

        Job counters come in two flavours, matching the log lines below:
        '*_gs' (global share level) and '*_rt' (resource type level). Queues
        listed in non_rt_wqs are judged on the '_gs' counters and queried
        without resource_name; all other queues use the '_rt' counters.

        Args:
            vo: virtual organization; used for the log header, the
                configuration lookup and the task buffer queries.
            prodSourceLabel: used for the log header and the walltime query.
            cloudName: cloud name passed to the task buffer queries.
            workQueue: work queue object; getID(), queue_name and throttled
                are read from it.
            resource_name: resource type name.

        Returns:
            self.retUnThrottled when the queue is not throttled or no limit
            holds it back; self.retTmpError when the highest priority of
            waiting tasks is unavailable; self.retMergeUnThr when a limit is
            exceeded and highPrioQueued is False, or when the running cap is
            exceeded.

        Side effects:
            Calls self.setMaxNumJobs(); before returning retUnThrottled it
            calls either self.setMinPriority() or, when too few jobs are
            queued, self.notEnoughJobsQueued() and self.setMaxNumJobs() again.
        """
        # tuning constants
        nBunch = 4
        threshold = 2.0
        nJobsInBunchMax = 600
        nJobsInBunchMin = 500
        minTotalWalltime = 50*1000*1000
        nWaitingLimit = 4
        nWaitingBunchLimit = 2
        nParallel = 2
        nParallelCap = 5
        # make logger
        tmpLog = MsgWrapper(logger)

        workQueueID = workQueue.getID()
        # blanks in the queue name become underscores in the log header
        workQueueName = workQueue.queue_name.replace(' ', '_')
        msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                            workQueueName, resource_name)
        tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))

        # central configuration values
        config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
        configQueueLimit = config_map[NQUEUELIMIT]['value']
        configQueueCap = config_map[NQUEUECAP]['value']
        configRunningCap = config_map[NRUNNINGCAP]['value']

        configMsg = ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'.format(
            configQueueLimit, configQueueCap, configRunningCap)
        tmpLog.debug(msgHeader + configMsg)

        # unthrottled queues pass right away
        if not workQueue.throttled:
            tmpLog.info(msgHeader + " " + "PASS unthrottled since GS_throttled is False")
            return self.retUnThrottled

        # job statistics for this work queue / global share
        statKeys = ('nRunning_rt', 'nRunning_gs', 'nRunning_runningcap',
                    'nNotRun_rt', 'nNotRun_gs', 'nNotRun_queuelimit', 'nNotRun_queuecap',
                    'nDefine_rt', 'nDefine_gs', 'nDefine_queuelimit', 'nDefine_queuecap',
                    'nWaiting_rt', 'nWaiting_gs')
        jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
        (nRunning_rt, nRunning_gs, nRunning_runningcap,
         nNotRun_rt, nNotRun_gs, nNotRun_queuelimit, nNotRun_queuecap,
         nDefine_rt, nDefine_gs, nDefine_queuelimit, nDefine_queuecap,
         nWaiting_rt, nWaiting_gs) = [jobstats_map[statKey] for statKey in statKeys]

        # queues in non_rt_wqs are queried without the resource type
        isGlobalShareWQ = workQueue.queue_name in non_rt_wqs
        rtArgs = () if isGlobalShareWQ else (resource_name,)
        # highest priority of currently defined jobs
        _, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue, *rtArgs)
        # highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName, *rtArgs)

        highestPrioInPandaDB = highestPrioJobStat['highestPrio']
        nNotRunHighestPrio = highestPrioJobStat['nNotRun']
        if highestPrioWaiting is None:
            tmpLog.error("{0} {1}".format(msgHeader, 'failed to get the highest priority of waiting tasks'))
            return self.retTmpError

        # waiting tasks count as high priority when they outrank what is in the
        # DB, or tie with it while fewer than nJobsInBunchMin such jobs are not run
        highPrioQueued = (highestPrioWaiting > highestPrioInPandaDB
                          or (highestPrioWaiting == highestPrioInPandaDB
                              and nNotRunHighestPrio < nJobsInBunchMin))
        tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(
            msgHeader, highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued))

        # bunch size: free slots estimated from running jobs, clamped to
        # [nJobsInBunchMin, nJobsInBunchMax]; the lower bound avoids creating
        # too many _sub/_dis datasets
        if isGlobalShareWQ:
            refRunning, refNotRun = nRunning_gs, nNotRun_gs
        else:
            refRunning, refNotRun = nRunning_rt, nNotRun_rt
        slotEstimate = int(refRunning * threshold - refNotRun)
        if slotEstimate < nJobsInBunchMin:
            nJobsInBunch = nJobsInBunchMin
        elif slotEstimate > nJobsInBunchMax:
            nJobsInBunch = nJobsInBunchMax
        else:
            nJobsInBunch = slotEstimate

        # configured queue limit wins over the computed one
        nQueueLimit = configQueueLimit if configQueueLimit is not None else nJobsInBunch * nBunch

        # for these queues the bunch may grow into the room left below nQueueLimit
        if workQueue.queue_name in ('Heavy Ion', 'Reprocessing default'):
            queuedForLimit = nNotRun_queuelimit + nDefine_queuelimit
            if nQueueLimit > queuedForLimit:
                roomLeft = nQueueLimit - queuedForLimit
                if roomLeft > nJobsInBunch:
                    nJobsInBunch = min(roomLeft, nJobsInBunchMax)

        # number of jobs to be submitted
        if configQueueCap is None:
            maxJobs = nJobsInBunch / nParallel
        else:
            maxJobs = configQueueCap / nParallelCap
        self.setMaxNumJobs(maxJobs)

        # get total walltime
        totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)

        # log the current situation and limits
        tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
            msgHeader, nQueueLimit, configRunningCap, configQueueCap))
        nQueued_gs = nNotRun_gs + nDefine_gs
        tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(
            msgHeader, nQueued_gs, nDefine_gs, nRunning_gs))
        nQueued_rt = nNotRun_rt + nDefine_rt
        tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(
            msgHeader, nQueued_rt, nDefine_rt, nRunning_rt, totWalltime))

        # Walk through the limits. The first one that is exceeded sets
        # limitPriority; skipBody is filled in when the queue must be skipped.
        limitPriority = False
        skipBody = None
        if (not isGlobalShareWQ and nRunning_rt == 0
                and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit
                and (totWalltime is None or totWalltime > minTotalWalltime)) \
                or (isGlobalShareWQ and nRunning_gs == 0
                    and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit):
            # pilot is not running or DDM has a problem
            limitPriority = True
            if not highPrioQueued:
                skipBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                    nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)

        elif not isGlobalShareWQ and nRunning_rt != 0 \
                and float(nQueued_rt) / float(nRunning_rt) > threshold \
                and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
                and (totWalltime is None or totWalltime > minTotalWalltime):
            # enough jobs in Panda (resource type level)
            limitPriority = True
            if not highPrioQueued:
                skipBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                    nQueued_rt, nRunning_rt, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                    nQueueLimit, totWalltime, minTotalWalltime)

        elif isGlobalShareWQ and nRunning_gs != 0 \
                and float(nQueued_gs) / float(nRunning_gs) > threshold \
                and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
            # enough jobs in Panda (global share level)
            limitPriority = True
            if not highPrioQueued:
                skipBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                    nQueued_gs, nRunning_gs, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                    nQueueLimit)

        elif nDefine_queuelimit > nQueueLimit:
            # brokerage is stuck
            limitPriority = True
            if not highPrioQueued:
                skipBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)

        elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
            # too many waiting
            limitPriority = True
            if not highPrioQueued:
                skipBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                    nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit)

        elif configRunningCap and nRunning_runningcap > configRunningCap:
            # cap on running: skipped whether or not high priority tasks wait
            skipBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap, configRunningCap)

        elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
            # cap on queued
            limitPriority = True
            if not highPrioQueued:
                skipBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                    nNotRun_queuecap + nDefine_queuecap, configQueueCap)

        if skipBody is not None:
            skipMsg = "{0} {1}".format(msgHeader, skipBody)
            tmpLog.warning(skipMsg)
            tmpLog.sendMsg(skipMsg, self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr

        # get jobs from prodDB
        if limitPriority:
            # a limit was hit while high priority tasks are waiting
            limitPriorityValue = highestPrioWaiting
            self.setMinPriority(limitPriorityValue)
        else:
            limitPriorityValue = None
            # not enough jobs are queued
            if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                    or (nQueued_gs < nRunning_gs if isGlobalShareWQ else nQueued_rt < nRunning_rt):
                tmpLog.debug(msgHeader+" not enough jobs queued")
                self.notEnoughJobsQueued()
                self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit/20))

        passBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
        tmpLog.info(msgHeader + " " + passBody)
        return self.retUnThrottled