Example No. 1
 def doFinalProcedure(self,taskSpec,tmpLog):
     # check email address
     toAdd = self.getEmail(taskSpec.userName,taskSpec.vo,tmpLog)
     # read task parameters
     try:
         taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(taskSpec.jediTaskID)
         self.taskParamMap = RefinerUtils.decodeJSON(taskParam)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
     if toAdd == None or \
             (self.taskParamMap != None and self.taskParamMap.has_key('noEmail') and self.taskParamMap['noEmail'] == True):
         tmpLog.info('email notification is suppressed')
     else:
         # send email notification
         fromAdd = self.senderAddress()
         msgBody = self.composeMessage(taskSpec,fromAdd,toAdd)
         self.sendMail(taskSpec.jediTaskID,fromAdd,toAdd,msgBody,3,False,tmpLog)
     return self.SC_SUCCEEDED
Example No. 2
 def doFinalProcedure(self,taskSpec,tmpLog):
     # check email address
     toAdd = self.getEmail(taskSpec.userName,taskSpec.vo,tmpLog)
     # read task parameters
     try:
         taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(taskSpec.jediTaskID)
         self.taskParamMap = RefinerUtils.decodeJSON(taskParam)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
     if toAdd == None or \
             (self.taskParamMap != None and self.taskParamMap.has_key('noEmail') and self.taskParamMap['noEmail'] == True):
         tmpLog.debug('email notification is suppressed')
     else:
         # send email notification
         fromAdd = self.senderAddress()
         msgBody = self.composeMessage(taskSpec,fromAdd,toAdd)
         self.sendMail(taskSpec.jediTaskID,fromAdd,toAdd,msgBody,3,False,tmpLog)
     return self.SC_SUCCEEDED
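Both examples above reduce to a single predicate: send the mail only when a recipient address is known and the task parameters do not carry noEmail=True. Below is a minimal standalone sketch of that check, assuming the task parameters arrive as a JSON string; should_send_notification is an illustrative name, not part of the JEDI interface.

    import json

    def should_send_notification(to_address, task_param_json):
        """Return True when an email notification should go out.

        to_address      -- recipient address, or None when unknown
        task_param_json -- task parameters as a JSON string (may be None)
        """
        # no known address -> nothing to send
        if to_address is None:
            return False
        # a malformed or missing parameter blob should not block the mail
        try:
            params = json.loads(task_param_json) if task_param_json else {}
        except ValueError:
            params = {}
        # the task can opt out explicitly with noEmail=True
        return not params.get('noEmail', False)

    # usage
    print(should_send_notification('user@example.com', '{"noEmail": true}'))   # False
    print(should_send_notification('user@example.com', '{"taskName": "t1"}'))  # True
    print(should_send_notification(None, '{}'))                                # False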
Example No. 3
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID,dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    datasetsIdxConsistency = []

                    # get task
                    tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                    if not tmpStat or taskSpec == None:
                        self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                        continue

                    # make logger
                    try:
                        gshare = '_'.join(taskSpec.gshare.split(' '))
                    except:
                        gshare = 'Undefined'
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare))

                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = taskSpec.getNumFilesPerJob()
                    # the number of chunks used by scout 
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid,
                                                                                                                  ['output','log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                            # index consistency
                            if datasetSpec.indexConsistent():
                                datasetsIdxConsistency.append(datasetSpec.datasetID)
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()                    
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state':'closed'}
                                # set mutable when the dataset is open and the parent is running, or the task is configured to run until the dataset is closed
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state':'mutable'}
                                gotMetadata = True
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                tmpLog.error('{0} failed to get metadata to {1}:{2}'.format(self.__class__.__name__,
                                                                                            errtype.__name__,errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status    
                                    self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing 
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                                # get the number of events in metadata
                                if taskParamMap.has_key('getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock() or taskSpec.orderByLB():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                         getNumEvents=getNumEvents,
                                                                         skipDuplicate=skipDuplicate,
                                                                         longFormat=longFormat
                                                                         )
                                        tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                            for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                                tmpLog.debug('removed {0}'.format(tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
                                            if datasetSpec.getNumRecords() != None:
                                                nPFN = datasetSpec.getNumRecords()
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob']
                                                elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and taskSpec.getNumFilesPerJob() is not None:
                                                nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob()
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID,
                                                                                                           datasetSpec.masterID,
                                                                                                           ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt['nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy 
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(skipFilesUsedBy).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID,
                                                                                                                          {'datasetName':datasetSpec.datasetName},
                                                                                                                          ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']:
                                                            nPFN += tmpParentAtt['nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset,
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                         'scope':None,
                                                                         'filesize':0,
                                                                         'checksum':None,
                                                                         }
                                                      }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                             'scope':None,
                                                                             'filesize':0,
                                                                             'checksum':None,
                                                                             }
                                except:
                                    errtype,errvalue = sys.exc_info()[:2]
                                    tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__,
                                                                                                 errtype.__name__,errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status    
                                        self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents()
                                    # the number of events per file
                                    nEventsPerFile  = None
                                    nEventsPerJob   = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key('nEventsPerFile'):
                                            nEventsPerFile = taskParamMap['nEventsPerFile']
                                        elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap['nEvents']
                                        if taskParamMap.has_key('nEventsPerJob'):
                                            nEventsPerJob = taskParamMap['nEventsPerJob']
                                        elif taskParamMap.has_key('nEventsPerRange'):
                                            nEventsPerRange = taskParamMap['nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                        # max attempts 
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry 
                                            maxAttempt = 1
                                        elif taskParamMap.has_key('maxAttempt'):
                                            maxAttempt = taskParamMap['maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap['maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                    # nMaxEvents
                                    nMaxEvents = None 
                                    if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                            # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                                if taskParamMap.has_key('nEventsPerJob'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                                elif taskParamMap.has_key('nEventsPerRange'):
                                                    if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                        nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                        nMaxFiles = int(math.ceil(nMaxFiles))
                                    # use scout
                                    useScout = False    
                                    if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers    
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount
                                    ramCount = 0
                                    # skip short input
                                    if datasetSpec.isMaster() and not datasetSpec.isPseudo() \
                                            and nEventsPerFile is not None and nEventsPerJob is not None \
                                            and nEventsPerFile >= nEventsPerJob \
                                            and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True:
                                        skipShortInput = True
                                    else:
                                        skipShortInput = False
                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
                                    retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                              tmpMetadata['state'],
                                                                                                                              stateUpdateTime,
                                                                                                                              nEventsPerFile,
                                                                                                                              nEventsPerJob,
                                                                                                                              maxAttempt,
                                                                                                                              firstEventNumber,
                                                                                                                              nMaxFiles,
                                                                                                                              nMaxEvents,
                                                                                                                              useScout,
                                                                                                                              fileList,
                                                                                                                              useFilesWithNewAttemptNr,
                                                                                                                              nFilesPerJob,
                                                                                                                              nEventsPerRange,
                                                                                                                              nChunksForScout,
                                                                                                                              includePatt,
                                                                                                                              excludePatt,
                                                                                                                              xmlConfig,
                                                                                                                              noWaitParent,
                                                                                                                              taskSpec.parent_tid,
                                                                                                                              self.pid,
                                                                                                                              maxFailure,
                                                                                                                              useRealNumEvents,
                                                                                                                              respectLB,
                                                                                                                              tgtNumEventsPerJob,
                                                                                                                              skipFilesUsedBy,
                                                                                                                              ramCount,
                                                                                                                              taskSpec,
                                                                                                                              skipShortInput)
                                    if retDB == False:
                                        taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                         diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug('escape since task or dataset is locked')
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                               'missingFiles':missingFileList} 
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap['nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap['nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr,None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # index consistency
                    if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0:
                        self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency)
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken','tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task_status={0}'.format(taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid,
                                                                                                       useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg,self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
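The seq_number branch in the example above derives the number of pseudo records from the event-related task parameters, with several fallbacks. Below is a condensed sketch of that precedence, assuming the parameters are plain integers in a dict; estimate_seq_records is an illustrative name, and the sketch omits the master-dataset nFiles lookup and the skipFilesUsedBy adjustment that the excerpt applies in its default branch.

    def estimate_seq_records(task_params, num_records=None, orig_num_files=None,
                             n_files_per_job=None, default_records=10000):
        """Estimate how many seq_number records to generate, following the
        precedence used in the excerpt above."""
        # an explicit record count on the dataset wins
        if num_records is not None:
            return num_records
        # scale the original file count by events-per-file over events-per-job/range
        if orig_num_files is not None:
            n = orig_num_files
            if 'nEventsPerJob' in task_params and 'nEventsPerFile' in task_params \
                    and task_params['nEventsPerFile'] > task_params['nEventsPerJob']:
                n = n * task_params['nEventsPerFile'] // task_params['nEventsPerJob']
            elif 'nEventsPerFile' in task_params and 'nEventsPerRange' in task_params:
                n = n * task_params['nEventsPerFile'] // task_params['nEventsPerRange']
            return n
        # derive from the total number of events
        if 'nEvents' in task_params and 'nEventsPerJob' in task_params:
            return task_params['nEvents'] // task_params['nEventsPerJob']
        if 'nEvents' in task_params and 'nEventsPerFile' in task_params \
                and n_files_per_job is not None:
            return task_params['nEvents'] // task_params['nEventsPerFile'] // n_files_per_job
        # fall back to the default record count
        return default_records

    # usage: 1000 events, 10 per job -> 100 sequence records
    print(estimate_seq_records({'nEvents': 1000, 'nEventsPerJob': 10}))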
Example No. 4
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs == None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr != None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary 
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs 
                                 if 'soft finish' in commentStr:
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['reassign','finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)
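In the incexec branch above, the comment string (itself JSON) is merged into the stored task parameters: a None value deletes a key, any other value overwrites it, and a few keys are always dropped first. Below is a minimal sketch of that merge, assuming both sides are already decoded into dicts; merge_task_params is an illustrative name, not a TaskBuffer method.

    def merge_task_params(task_params, new_params, drop_keys=('nFiles', 'fixedSandbox')):
        """Merge incremental-execution overrides into existing task parameters.

        Keys listed in drop_keys are removed first; then a None value in
        new_params deletes the key, while any other value overwrites it.
        """
        merged = dict(task_params)
        # parameters that must be recomputed for the new execution
        for key in drop_keys:
            merged.pop(key, None)
        # apply the overrides
        for key, value in new_params.items():
            if value is None:
                merged.pop(key, None)
            else:
                merged[key] = value
        return merged

    # usage
    old = {'nFiles': 5, 'nEventsPerJob': 100, 'site': 'SITE_A'}
    new = {'site': None, 'nEventsPerJob': 200}
    print(merge_task_params(old, new))  # {'nEventsPerJob': 200}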
Example No. 5
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 tmpLog.info('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.error(errStr)
                     tmpStat = Interaction.SC_FAILED
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')                    
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if not parent_tid in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # no wait for parent
                         if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             errStr = 'pending until parent produces input'
                             tmpLog.info(errStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             continue
                         else:
                             errStr = 'failed to refine task'
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID})
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = 'tobroken'
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                          else:
                              # appending for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Exemplo n.º 6
0
    def runImpl(self):
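        # worker loop: pull chunks of (jediTaskID, dataset list) pairs from the shared
        # queue, resolve dataset and file metadata through DDM, feed the files into the
        # JEDI contents table, and update the task status at the end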
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskDsList = self.taskDsList.get(nTasks)
                # no more datasets
                if len(taskDsList) == 0:
                    self.logger.debug('%s terminating since no more items' %
                                      self.__class__.__name__)
                    return
                # loop over all tasks
                for jediTaskID, dsList in taskDsList:
                    allUpdated = True
                    taskBroken = False
                    taskOnHold = False
                    runningTask = False
                    missingMap = {}
                    # make logger
                    tmpLog = MsgWrapper(
                        self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                    # get task
                    tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                        jediTaskID, False, True, self.pid, 10)
                    if not tmpStat or taskSpec == None:
                        tmpLog.error(
                            'failed to get taskSpec for jediTaskID={0}'.format(
                                jediTaskID))
                        continue
                    try:
                        # get task parameters
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                            jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error(
                            'task param conversion from json failed with {0}:{1}'
                            .format(errtype.__name__, errvalue))
                        taskBroken = True
                    # renaming of parameters
                    if taskParamMap.has_key('nEventsPerInputFile'):
                        taskParamMap['nEventsPerFile'] = taskParamMap[
                            'nEventsPerInputFile']
                    # the number of files per job
                    nFilesPerJob = None
                    if taskParamMap.has_key('nFilesPerJob'):
                        nFilesPerJob = taskParamMap['nFilesPerJob']
                    # the number of chunks used by scout
                    nChunksForScout = 10
                    # load XML
                    if taskSpec.useLoadXML():
                        xmlConfig = taskParamMap['loadXML']
                    else:
                        xmlConfig = None
                    # skip files used by another task
                    if 'skipFilesUsedBy' in taskParamMap:
                        skipFilesUsedBy = taskParamMap['skipFilesUsedBy']
                    else:
                        skipFilesUsedBy = None
                    # check no wait
                    noWaitParent = False
                    parentOutDatasets = set()
                    if taskSpec.noWaitParent() and not taskSpec.parent_tid in [
                            None, taskSpec.jediTaskID
                    ]:
                        tmpStat = self.taskBufferIF.checkParentTask_JEDI(
                            taskSpec.parent_tid)
                        if tmpStat == 'running':
                            noWaitParent = True
                            # get output datasets from parent task
                            tmpParentStat, tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                                taskSpec.parent_tid, ['output', 'log'])
                            # collect dataset names
                            for tmpParentOutDataset in tmpParentOutDatasets:
                                parentOutDatasets.add(
                                    tmpParentOutDataset.datasetName)
                    # loop over all datasets
                    nFilesMaster = 0
                    checkedMaster = False
                    setFrozenTime = True
                    if not taskBroken:
                        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                        origNumFiles = None
                        if taskParamMap.has_key('nFiles'):
                            origNumFiles = taskParamMap['nFiles']
                        for datasetSpec in dsList:
                            tmpLog.debug('start loop for {0}(id={1})'.format(
                                datasetSpec.datasetName,
                                datasetSpec.datasetID))
                            # get dataset metadata
                            tmpLog.debug('get metadata')
                            gotMetadata = False
                            stateUpdateTime = datetime.datetime.utcnow()
                            try:
                                if not datasetSpec.isPseudo():
                                    tmpMetadata = ddmIF.getDatasetMetaData(
                                        datasetSpec.datasetName)
                                else:
                                    # dummy metadata for pseudo dataset
                                    tmpMetadata = {'state': 'closed'}
                                # treat the dataset as mutable when the parent is still running or the task runs until the dataset is closed, and the dataset is open or produced by the parent
                                if (noWaitParent or taskSpec.runUntilClosed()) and \
                                        (tmpMetadata['state'] == 'open' \
                                             or datasetSpec.datasetName in parentOutDatasets \
                                             or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets):
                                    # dummy metadata when parent is running
                                    tmpMetadata = {'state': 'mutable'}
                                gotMetadata = True
                            except:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error(
                                    '{0} failed to get metadata with {1}:{2}'.
                                    format(self.__class__.__name__,
                                           errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(
                                        datasetSpec, datasetStatus, tmpLog)
                                else:
                                    if not taskSpec.ignoreMissingInDS():
                                        # temporary error
                                        taskOnHold = True
                                    else:
                                        # ignore missing
                                        datasetStatus = 'failed'
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                taskSpec.setErrDiag(
                                    'failed to get metadata for {0}'.format(
                                        datasetSpec.datasetName))
                                if not taskSpec.ignoreMissingInDS():
                                    allUpdated = False
                            else:
                                # get file list specified in task parameters
                                fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                    taskParamMap, datasetSpec.datasetName)
                                # get the number of events in metadata
                                if taskParamMap.has_key(
                                        'getNumEventsInMetadata'):
                                    getNumEvents = True
                                else:
                                    getNumEvents = False
                                # get file list from DDM
                                tmpLog.debug('get files')
                                try:
                                    useInFilesWithNewAttemptNr = False
                                    skipDuplicate = not datasetSpec.useDuplicatedFiles(
                                    )
                                    if not datasetSpec.isPseudo():
                                        if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                                not datasetSpec.containerName in ['',None]:
                                            # read files from container if file list is specified in task parameters
                                            tmpDatasetName = datasetSpec.containerName
                                        else:
                                            tmpDatasetName = datasetSpec.datasetName
                                        # use long format for LB
                                        longFormat = False
                                        if taskSpec.respectLumiblock():
                                            longFormat = True
                                        tmpRet = ddmIF.getFilesInDataset(
                                            tmpDatasetName,
                                            getNumEvents=getNumEvents,
                                            skipDuplicate=skipDuplicate,
                                            longFormat=longFormat)
                                        tmpLog.debug(
                                            'got {0} files in {1}'.format(
                                                len(tmpRet), tmpDatasetName))
                                        # remove lost files
                                        tmpLostFiles = ddmIF.findLostFiles(
                                            tmpDatasetName, tmpRet)
                                        if tmpLostFiles != {}:
                                            tmpLog.debug(
                                                'found {0} lost files in {1}'.
                                                format(len(tmpLostFiles),
                                                       tmpDatasetName))
                                            for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems(
                                            ):
                                                tmpLog.debug(
                                                    'removed {0}'.format(
                                                        tmpLostLFN))
                                                del tmpRet[tmpListGUID]
                                    else:
                                        if datasetSpec.isSeqNumber():
                                            # make dummy files for seq_number
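                                            # the number of dummy records is taken, in order of precedence, from an
                                            # explicit record count, nFiles scaled by events per file/job (or range),
                                            # nEvents-based estimates, or a default of 10000 (or the master's nFiles
                                            # if larger, plus nFiles of the tasks listed in skipFilesUsedBy)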
                                            if datasetSpec.getNumRecords(
                                            ) != None:
                                                nPFN = datasetSpec.getNumRecords(
                                                )
                                            elif origNumFiles != None:
                                                nPFN = origNumFiles
                                                if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \
                                                        and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerJob']
                                                elif taskParamMap.has_key(
                                                        'nEventsPerFile'
                                                ) and taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    nPFN = nPFN * taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nEventsPerRange']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerJob']
                                            elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \
                                                    and 'nFilesPerJob' in taskParamMap:
                                                nPFN = taskParamMap[
                                                    'nEvents'] / taskParamMap[
                                                        'nEventsPerFile'] / taskParamMap[
                                                            'nFilesPerJob']
                                            else:
                                                # the default number of records for seq_number
                                                seqDefNumRecords = 10000
                                                # get nFiles of the master
                                                tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(
                                                    datasetSpec.jediTaskID,
                                                    datasetSpec.masterID,
                                                    ['nFiles'])
                                                # use nFiles of the master as the number of records if it is larger than the default
                                                if 'nFiles' in tmpMasterAtt and tmpMasterAtt[
                                                        'nFiles'] > seqDefNumRecords:
                                                    nPFN = tmpMasterAtt[
                                                        'nFiles']
                                                else:
                                                    nPFN = seqDefNumRecords
                                                # check usedBy
                                                if skipFilesUsedBy != None:
                                                    for tmpJediTaskID in str(
                                                            skipFilesUsedBy
                                                    ).split(','):
                                                        tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(
                                                            tmpJediTaskID, {
                                                                'datasetName':
                                                                datasetSpec.
                                                                datasetName
                                                            }, ['nFiles'])
                                                        if 'nFiles' in tmpParentAtt and tmpParentAtt[
                                                                'nFiles']:
                                                            nPFN += tmpParentAtt[
                                                                'nFiles']
                                            tmpRet = {}
                                            # get offset
                                            tmpOffset = datasetSpec.getOffset()
                                            tmpOffset += 1
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn': iPFN + tmpOffset,
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                        elif not taskSpec.useListPFN():
                                            # dummy file list for pseudo dataset
                                            tmpRet = {
                                                str(uuid.uuid4()): {
                                                    'lfn': 'pseudo_lfn',
                                                    'scope': None,
                                                    'filesize': 0,
                                                    'checksum': None,
                                                }
                                            }
                                        else:
                                            # make dummy file list for PFN list
                                            if taskParamMap.has_key('nFiles'):
                                                nPFN = taskParamMap['nFiles']
                                            else:
                                                nPFN = 1
                                            tmpRet = {}
                                            for iPFN in range(nPFN):
                                                tmpRet[str(uuid.uuid4())] = {
                                                    'lfn':
                                                    '{0:06d}:{1}'.format(
                                                        iPFN,
                                                        taskParamMap['pfnList']
                                                        [iPFN].split('/')[-1]),
                                                    'scope':
                                                    None,
                                                    'filesize':
                                                    0,
                                                    'checksum':
                                                    None,
                                                }
                                except:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    tmpLog.error(
                                        '{0} failed to get files with {1}:{2}'
                                        .format(self.__class__.__name__,
                                                errtype.__name__, errvalue))
                                    if errtype == Interaction.JEDIFatalError:
                                        # fatal error
                                        datasetStatus = 'broken'
                                        taskBroken = True
                                        # update dataset status
                                        self.updateDatasetStatus(
                                            datasetSpec, datasetStatus, tmpLog)
                                    else:
                                        # temporary error
                                        taskOnHold = True
                                    taskSpec.setErrDiag(
                                        'failed to get files for {0}'.format(
                                            datasetSpec.datasetName))
                                    allUpdated = False
                                else:
                                    # parameters for master input
                                    respectLB = False
                                    useRealNumEvents = False
                                    if datasetSpec.isMaster():
                                        # respect LB boundaries
                                        respectLB = taskSpec.respectLumiblock()
                                        # use real number of events
                                        useRealNumEvents = taskSpec.useRealNumEvents(
                                        )
                                    # the number of events per file
                                    nEventsPerFile = None
                                    nEventsPerJob = None
                                    nEventsPerRange = None
                                    tgtNumEventsPerJob = None
                                    if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \
                                            (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()):
                                        if taskParamMap.has_key(
                                                'nEventsPerFile'):
                                            nEventsPerFile = taskParamMap[
                                                'nEventsPerFile']
                                        elif datasetSpec.isMaster(
                                        ) and datasetSpec.isPseudo(
                                        ) and taskParamMap.has_key('nEvents'):
                                            # use nEvents as nEventsPerFile for pseudo input
                                            nEventsPerFile = taskParamMap[
                                                'nEvents']
                                        if taskParamMap.has_key(
                                                'nEventsPerJob'):
                                            nEventsPerJob = taskParamMap[
                                                'nEventsPerJob']
                                        elif taskParamMap.has_key(
                                                'nEventsPerRange'):
                                            nEventsPerRange = taskParamMap[
                                                'nEventsPerRange']
                                        if 'tgtNumEventsPerJob' in taskParamMap:
                                            tgtNumEventsPerJob = taskParamMap[
                                                'tgtNumEventsPerJob']
                                            # reset nEventsPerJob
                                            nEventsPerJob = None
                                    # max attempts
                                    maxAttempt = None
                                    maxFailure = None
                                    if datasetSpec.isMaster(
                                    ) or datasetSpec.toKeepTrack():
                                        # max attempts
                                        if taskSpec.disableAutoRetry():
                                            # disable auto retry
                                            maxAttempt = 1
                                        elif taskParamMap.has_key(
                                                'maxAttempt'):
                                            maxAttempt = taskParamMap[
                                                'maxAttempt']
                                        else:
                                            # use default value
                                            maxAttempt = 3
                                        # max failure
                                        if 'maxFailure' in taskParamMap:
                                            maxFailure = taskParamMap[
                                                'maxFailure']
                                    # first event number
                                    firstEventNumber = None
                                    if datasetSpec.isMaster():
                                        # first event number
                                        firstEventNumber = 1 + taskSpec.getFirstEventOffset(
                                        )
                                    # nMaxEvents
                                    nMaxEvents = None
                                    if datasetSpec.isMaster(
                                    ) and taskParamMap.has_key('nEvents'):
                                        nMaxEvents = taskParamMap['nEvents']
                                    # nMaxFiles
                                    nMaxFiles = None
                                    if taskParamMap.has_key('nFiles'):
                                        if datasetSpec.isMaster():
                                            nMaxFiles = taskParamMap['nFiles']
                                        else:
                                            # calculate for secondary
                                            nMaxFiles = datasetSpec.getNumMultByRatio(
                                                origNumFiles)
                                            # multiplied by the number of jobs per file for event-level splitting
                                            if nMaxFiles != None and taskParamMap.has_key(
                                                    'nEventsPerFile'):
                                                if taskParamMap.has_key(
                                                        'nEventsPerJob'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerJob']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerJob'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                                elif taskParamMap.has_key(
                                                        'nEventsPerRange'):
                                                    if taskParamMap[
                                                            'nEventsPerFile'] > taskParamMap[
                                                                'nEventsPerRange']:
                                                        nMaxFiles *= float(
                                                            taskParamMap[
                                                                'nEventsPerFile']
                                                        ) / float(taskParamMap[
                                                            'nEventsPerRange'])
                                                        nMaxFiles = int(
                                                            math.ceil(
                                                                nMaxFiles))
                                    # use scout
                                    useScout = False
                                    if datasetSpec.isMaster(
                                    ) and taskSpec.useScout() and (
                                            datasetSpec.status != 'toupdate'
                                            or not taskSpec.isPostScout()):
                                        useScout = True
                                    # use files with new attempt numbers
                                    useFilesWithNewAttemptNr = False
                                    if not datasetSpec.isPseudo(
                                    ) and fileList != [] and taskParamMap.has_key(
                                            'useInFilesWithNewAttemptNr'):
                                        useFilesWithNewAttemptNr = True
                                    # ramCount passed through to the contents table
                                    ramCount = 0

                                    # feed files to the contents table
                                    tmpLog.debug('update contents')
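                                    # insertFilesForDataset_JEDI returns a status flag (False=fatal, None=locked or
                                    # not applicable), the list of missing files, the number of unique files inserted,
                                    # and a diag map; the branches below map these onto broken/pending decisions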
                                    retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                        datasetSpec, tmpRet,
                                        tmpMetadata['state'], stateUpdateTime,
                                        nEventsPerFile, nEventsPerJob,
                                        maxAttempt, firstEventNumber,
                                        nMaxFiles, nMaxEvents, useScout,
                                        fileList, useFilesWithNewAttemptNr,
                                        nFilesPerJob, nEventsPerRange,
                                        nChunksForScout, includePatt,
                                        excludePatt, xmlConfig, noWaitParent,
                                        taskSpec.parent_tid, self.pid,
                                        maxFailure, useRealNumEvents,
                                        respectLB, tgtNumEventsPerJob,
                                        skipFilesUsedBy, ramCount)
                                    if retDB == False:
                                        taskSpec.setErrDiag(
                                            'failed to insert files for {0}. {1}'
                                            .format(datasetSpec.datasetName,
                                                    diagMap['errMsg']))
                                        allUpdated = False
                                        taskBroken = True
                                        break
                                    elif retDB == None:
                                        # the dataset is locked by another or status is not applicable
                                        allUpdated = False
                                        tmpLog.debug(
                                            'escape since task or dataset is locked'
                                        )
                                        break
                                    elif missingFileList != []:
                                        # files are missing
                                        tmpErrStr = '{0} files missing in {1}'.format(
                                            len(missingFileList),
                                            datasetSpec.datasetName)
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        allUpdated = False
                                        taskOnHold = True
                                        missingMap[datasetSpec.datasetName] = {
                                            'datasetSpec': datasetSpec,
                                            'missingFiles': missingFileList
                                        }
                                    else:
                                        # reduce the number of files to be read
                                        if taskParamMap.has_key('nFiles'):
                                            if datasetSpec.isMaster():
                                                taskParamMap[
                                                    'nFiles'] -= nFilesUnique
                                        # reduce the number of files for scout
                                        if useScout:
                                            nChunksForScout = diagMap[
                                                'nChunksForScout']
                                        # number of master input files
                                        if datasetSpec.isMaster():
                                            checkedMaster = True
                                            nFilesMaster += nFilesUnique
                                    # running task
                                    if diagMap['isRunningTask']:
                                        runningTask = True
                                    # no activated pending input for noWait
                                    if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \
                                            and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster():
                                        tmpErrStr = 'insufficient inputs are ready. '
                                        tmpErrStr += diagMap['errMsg']
                                        tmpLog.debug(tmpErrStr)
                                        taskSpec.setErrDiag(tmpErrStr)
                                        taskOnHold = True
                                        setFrozenTime = False
                                        break
                            tmpLog.debug('end loop')
                    # no master input
                    if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                        tmpErrStr = 'no master input files. input dataset is empty'
                        tmpLog.error(tmpErrStr)
                        taskSpec.setErrDiag(tmpErrStr, None)
                        if taskSpec.allowEmptyInput() or noWaitParent:
                            taskOnHold = True
                        else:
                            taskBroken = True
                    # update task status
                    if taskBroken:
                        # task is broken
                        taskSpec.status = 'tobroken'
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, taskSpec, pid=self.pid)
                    # change task status unless the task is running
                    if not runningTask:
                        if taskOnHold:
                            # go to pending state
                            if not taskSpec.status in ['broken', 'tobroken']:
                                taskSpec.setOnHold()
                            tmpMsg = 'set task.status={0}'.format(
                                taskSpec.status)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                            allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                taskSpec,
                                pid=self.pid,
                                setFrozenTime=setFrozenTime)
                        elif allUpdated:
                            # all OK
                            allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                                jediTaskID,
                                getTaskStatus=True,
                                pid=self.pid,
                                useWorldCloud=taskSpec.useWorldCloud())
                            tmpMsg = 'set task.status={0}'.format(
                                newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock not-running task with {0}'.format(
                            retUnlock))
                    else:
                        # just unlock
                        retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(
                            jediTaskID, self.pid)
                        tmpLog.debug('unlock task with {0}'.format(retUnlock))
                    tmpLog.debug('done')
            except:
                errtype, errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                    self.__class__.__name__, errtype.__name__, errvalue))
Exemplo n.º 7
0
 def runImpl(self):
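     # brokerage worker: read thresholds from the JEDI config (with hard-coded defaults),
     # then for each task filter the candidate nuclei by status, transfer backlog,
     # free space at the output endpoints, ability to run jobs, and input data locality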
     # cutoff for free disk space in GB (default 100 * 1024 GB, i.e. 100 TB)
     diskThreshold = self.taskBufferIF.getConfigValue(
         self.msgType,
         'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi',
         'atlas')
     if diskThreshold is None:
         diskThreshold = 100 * 1024
     # dataset type to ignore file availability check
     datasetTypeToSkipCheck = ['log']
     # thresholds for data availability check
     thrInputSize = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
     if thrInputSize is None:
         thrInputSize = 1
     thrInputSize *= 1024 * 1024 * 1024
     thrInputNum = self.taskBufferIF.getConfigValue(self.msgType,
                                                    'INPUT_NUM_THRESHOLD',
                                                    'jedi', 'atlas')
     if thrInputNum is None:
         thrInputNum = 100
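     # the fraction thresholds are configured as percentages and converted to fractions below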
     thrInputSizeFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
     if thrInputSizeFrac is None:
         thrInputSizeFrac = 10
     thrInputSizeFrac = float(thrInputSizeFrac) / 100
     thrInputNumFrac = self.taskBufferIF.getConfigValue(
         self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
     if thrInputNumFrac is None:
         thrInputNumFrac = 10
     thrInputNumFrac = float(thrInputNumFrac) / 100
     cutOffRW = 50
     negWeightTape = 0.001
     minIoIntensityWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minIoIntensityWithLD is None:
         minIoIntensityWithLD = 200
     minInputSizeWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if minInputSizeWithLD is None:
         minInputSizeWithLD = 10000
     maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(
         self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas')
     if maxTaskPrioWithLD is None:
         maxTaskPrioWithLD = 800
     # main
     lastJediTaskID = None
     siteMapper = self.taskBufferIF.getSiteMapper()
     while True:
         try:
             taskInputList = self.inputList.get(1)
             # no more inputs
             if len(taskInputList) == 0:
                 self.logger.debug(
                     '{0} terminating after processing {1} tasks since no more inputs '
                     .format(self.__class__.__name__, self.numTasks))
                 return
             # loop over all tasks
             for taskSpec, inputChunk in taskInputList:
                 lastJediTaskID = taskSpec.jediTaskID
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger,
                     '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                     monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                 tmpLog.debug('start')
                 tmpLog.info(
                     'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'
                     .format(thrInputSize, thrInputNum, thrInputSizeFrac,
                             thrInputNumFrac))
                 # read task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         taskSpec.jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     tmpLog.error('failed to read task params')
                     taskSpec.setErrDiag(
                         tmpLog.uploadLog(taskSpec.jediTaskID))
                     self.sendLogMessage(tmpLog)
                     continue
                 # RW
                 taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(
                     taskSpec.jediTaskID)
                 # get nuclei
                 nucleusList = siteMapper.nuclei
                 if taskSpec.nucleus in siteMapper.nuclei:
                     candidateNucleus = taskSpec.nucleus
                 elif taskSpec.nucleus in siteMapper.satellites:
                     nucleusList = siteMapper.satellites
                     candidateNucleus = taskSpec.nucleus
                 else:
                     tmpLog.info('got {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # check status
                     newNucleusList = {}
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleusSpec.state not in ['ACTIVE']:
                             tmpLog.info(
                                 '  skip nucleus={0} due to status={1} criteria=-status'
                                 .format(tmpNucleus, tmpNucleusSpec.state))
                         else:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed status check'.format(
                             len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # check status of transfer backlog
                     t1Weight = taskSpec.getT1Weight()
                     if t1Weight < 0:
                         tmpLog.info(
                             'skip transfer backlog check due to negative T1Weight'
                         )
                     else:
                         newNucleusList = {}
                         backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei(
                         )
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if tmpNucleus in backlogged_nuclei:
                                 tmpLog.info(
                                     '  skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'
                                     .format(tmpNucleus))
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         nucleusList = newNucleusList
                         tmpLog.info(
                             '{0} candidates passed transfer backlog check'.
                             format(len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # check endpoint
                     fractionFreeSpace = {}
                     newNucleusList = {}
                     tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                         taskSpec.jediTaskID, ['output', 'log'])
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         toSkip = False
                         for tmpDatasetSpec in tmpDatasetSpecList:
                             # ignore distributed datasets
                             if DataServiceUtils.getDistributedDestination(
                                     tmpDatasetSpec.storageToken
                             ) is not None:
                                 continue
                             # get endpoint with the pattern
                             tmpEP = tmpNucleusSpec.getAssociatedEndpoint(
                                 tmpDatasetSpec.storageToken)
                             if tmpEP is None:
                                 tmpLog.info(
                                     '  skip nucleus={0} since no endpoint with {1} criteria=-match'
                                     .format(tmpNucleus,
                                             tmpDatasetSpec.storageToken))
                                 toSkip = True
                                 break
                             # check state
                             """
                             if tmpEP['state'] not in ['ACTIVE']:
                                 tmpLog.info('  skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus,
                                                                                                                          tmpEP['ddm_endpoint_name'],
                                                                                                                          tmpEP['state']))
                                 toSkip = True
                                 break
                             """
                             # check space
                             tmpSpaceSize = tmpEP['space_free'] + tmpEP[
                                 'space_expired']
                             tmpSpaceToUse = 0
                             if tmpNucleus in self.fullRW:
                                 # 0.25GB per cpuTime/corePower/day
                                 tmpSpaceToUse = int(
                                     self.fullRW[tmpNucleus] / 10 / 24 /
                                     3600 * 0.25)
                             if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                 tmpLog.info(
                                     '  skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'
                                     .format(tmpNucleus, tmpSpaceSize,
                                             tmpSpaceToUse, diskThreshold,
                                             tmpEP['ddm_endpoint_name']))
                                 toSkip = True
                                 break
                             # keep fraction of free space
                             if tmpNucleus not in fractionFreeSpace:
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': 0,
                                     'free': 0
                                 }
                             try:
                                 tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 tmpOld = None
                             try:
                                 tmpNew = float(tmpSpaceSize -
                                                tmpSpaceToUse) / float(
                                                    tmpEP['space_total'])
                             except Exception:
                                 tmpNew = None
                             if tmpNew is not None and (tmpOld is None
                                                        or tmpNew < tmpOld):
                                 fractionFreeSpace[tmpNucleus] = {
                                     'total': tmpEP['space_total'],
                                     'free': tmpSpaceSize - tmpSpaceToUse
                                 }
                         if not toSkip:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                     nucleusList = newNucleusList
                     tmpLog.info(
                         '{0} candidates passed endpoint check (free-space threshold {1} TB)'.
                         format(len(nucleusList), diskThreshold / 1024))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # ability to execute jobs
                     newNucleusList = {}
                     # get all panda sites
                     tmpSiteList = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         tmpSiteList += tmpNucleusSpec.allPandaSites
                     tmpSiteList = list(set(tmpSiteList))
                     tmpLog.debug('===== start for job check')
                     jobBroker = AtlasProdJobBroker(self.ddmIF,
                                                    self.taskBufferIF)
                     tmpSt, tmpRet = jobBroker.doBrokerage(
                         taskSpec, taskSpec.cloud, inputChunk, None, True,
                         tmpSiteList, tmpLog)
                     tmpLog.debug('===== done for job check')
                     if tmpSt != Interaction.SC_SUCCEEDED:
                         tmpLog.error('no sites can run jobs')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     okNuclei = set()
                     for tmpSite in tmpRet:
                         siteSpec = siteMapper.getSite(tmpSite)
                         okNuclei.add(siteSpec.pandasite)
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus in okNuclei:
                             newNucleusList[tmpNucleus] = tmpNucleusSpec
                         else:
                             tmpLog.info(
                                 '  skip nucleus={0} due to missing ability to run jobs criteria=-job'
                                 .format(tmpNucleus))
                     nucleusList = newNucleusList
                     tmpLog.info('{0} candidates passed job check'.format(
                         len(nucleusList)))
                     if nucleusList == {}:
                         tmpLog.error('no candidates')
                         taskSpec.setErrDiag(
                             tmpLog.uploadLog(taskSpec.jediTaskID))
                         self.sendLogMessage(tmpLog)
                         continue
                     ######################################
                     # data locality
                     toSkip = False
                     availableData = {}
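                     # accumulate per-nucleus totals of input size/number and the locally available
                     # part over all real input datasets; nuclei below the fraction thresholds
                     # are dropped in the check that follows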
                     for datasetSpec in inputChunk.getDatasets():
                         # only for real datasets
                         if datasetSpec.isPseudo():
                             continue
                         # ignore DBR
                         if DataServiceUtils.isDBR(datasetSpec.datasetName):
                             continue
                         # skip locality check
                         if DataServiceUtils.getDatasetType(
                                 datasetSpec.datasetName
                         ) in datasetTypeToSkipCheck:
                             continue
                         # primary only
                         if taskParamMap.get(
                                 'taskBrokerOnMaster'
                         ) is True and not datasetSpec.isMaster():
                             continue
                         # use deep scan for primary dataset unless data carousel
                         if datasetSpec.isMaster(
                         ) and not taskSpec.inputPreStaging():
                             deepScan = True
                         else:
                             deepScan = False
                         # get nuclei where data is available
                         tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(
                             siteMapper, self.ddmIF,
                             datasetSpec.datasetName,
                             list(nucleusList.keys()), deepScan)
                         if tmpSt != Interaction.SC_SUCCEEDED:
                             tmpLog.error(
                                 'failed to get nuclei where data is available, since {0}'
                                 .format(tmpRet))
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             toSkip = True
                             break
                         # sum
                         for tmpNucleus, tmpVals in iteritems(tmpRet):
                             if tmpNucleus not in availableData:
                                 availableData[tmpNucleus] = tmpVals
                             else:
                                 availableData[tmpNucleus] = dict(
                                     (k, v + tmpVals[k])
                                     for (k, v) in iteritems(
                                         availableData[tmpNucleus]))
                     if toSkip:
                         continue
                     if availableData != {}:
                         newNucleusList = {}
                         # skip if no data
                         skipMsgList = []
                         for tmpNucleus, tmpNucleusSpec in iteritems(
                                 nucleusList):
                             if taskSpec.inputPreStaging(
                             ) and availableData[tmpNucleus][
                                     'ava_num_any'] > 0:
                                 # use incomplete replicas for data carousel since the completeness is guaranteed
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                             elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                     availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                 tmpMsg = '  skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_size_any'],
                                     availableData[tmpNucleus]['tot_size'],
                                     thrInputSizeFrac)
                                 skipMsgList.append(tmpMsg)
                             elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                     availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                 tmpMsg = '  skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                     tmpNucleus, availableData[tmpNucleus]
                                     ['ava_num_any'],
                                     availableData[tmpNucleus]['tot_num'],
                                     thrInputNumFrac)
                                 skipMsgList.append(tmpMsg)
                             else:
                                 newNucleusList[tmpNucleus] = tmpNucleusSpec
                         totInputSize = list(availableData.values(
                         ))[0]['tot_size'] / 1024 / 1024 / 1024
                         data_locality_check_str = (
                             '(ioIntensity ({0}) is None or less than {1} kBPerS '
                             'and input size ({2} GB) is less than {3}) '
                             'or task.currentPriority ({4}) is higher than or equal to {5}'
                         ).format(taskSpec.ioIntensity,
                                  minIoIntensityWithLD, int(totInputSize),
                                  minInputSizeWithLD,
                                  taskSpec.currentPriority,
                                  maxTaskPrioWithLD)
                         if len(newNucleusList) > 0:
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                         elif ((taskSpec.ioIntensity is None
                               or taskSpec.ioIntensity <= minIoIntensityWithLD)
                               and totInputSize <= minInputSizeWithLD) \
                               or taskSpec.currentPriority >= maxTaskPrioWithLD:
                             availableData = {}
                             tmpLog.info(
                                 '  disable data locality check since no nucleus has input data, {}'
                                 .format(data_locality_check_str))
                         else:
                             # no candidate + unavoidable data locality check
                             nucleusList = newNucleusList
                             for tmpMsg in skipMsgList:
                                 tmpLog.info(tmpMsg)
                             tmpLog.info(
                                 '  the following conditions required to disable data locality check: {}'
                                 .format(data_locality_check_str))
                         tmpLog.info(
                             '{0} candidates passed data check'.format(
                                 len(nucleusList)))
                         if nucleusList == {}:
                             tmpLog.error('no candidates')
                             taskSpec.setErrDiag(
                                 tmpLog.uploadLog(taskSpec.jediTaskID))
                             self.sendLogMessage(tmpLog)
                             continue
                     ######################################
                     # weight
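                      # weight = 1/RW (when RW exceeds cutOffRW), scaled by the fraction of input data
                      # already available at the nucleus, penalized if part of that data is only on tape,
                      # and scaled by the fraction of free output space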
                     self.prioRW.acquire()
                     nucleusRW = self.prioRW[taskSpec.currentPriority]
                     self.prioRW.release()
                     totalWeight = 0
                     nucleusweights = []
                     for tmpNucleus, tmpNucleusSpec in iteritems(
                             nucleusList):
                         if tmpNucleus not in nucleusRW:
                             nucleusRW[tmpNucleus] = 0
                         wStr = '1'
                         # with RW
                         if tmpNucleus in nucleusRW and nucleusRW[
                                 tmpNucleus] >= cutOffRW:
                             weight = 1 / float(nucleusRW[tmpNucleus])
                             wStr += '/( RW={0} )'.format(
                                 nucleusRW[tmpNucleus])
                         else:
                             weight = 1
                             wStr += '/(1 : RW={0}<{1})'.format(
                                 nucleusRW[tmpNucleus], cutOffRW)
                         # with data
                         if availableData != {}:
                             if availableData[tmpNucleus]['tot_size'] > 0:
                                 weight *= float(availableData[tmpNucleus]
                                                 ['ava_size_any'])
                                 weight /= float(
                                     availableData[tmpNucleus]['tot_size'])
                                 wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                     availableData[tmpNucleus]
                                     ['ava_size_any'])
                                 wStr += '/ ( total_input_size={0} )'.format(
                                     availableData[tmpNucleus]['tot_size'])
                                 # negative weight for tape
                                 if availableData[tmpNucleus][
                                         'ava_size_any'] > availableData[
                                             tmpNucleus]['ava_size_disk']:
                                     weight *= negWeightTape
                                     wStr += '*( weight_TAPE={0} )'.format(
                                         negWeightTape)
                         # fraction of free space
                         if tmpNucleus in fractionFreeSpace:
                             try:
                                 tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                     float(fractionFreeSpace[tmpNucleus]['total'])
                                 weight *= tmpFrac
                                 wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                     fractionFreeSpace[tmpNucleus]['free'],
                                     fractionFreeSpace[tmpNucleus]['total'])
                             except Exception:
                                 pass
                         tmpLog.info(
                             '  use nucleus={0} weight={1} {2} criteria=+use'
                             .format(tmpNucleus, weight, wStr))
                         totalWeight += weight
                         nucleusweights.append((tmpNucleus, weight))
                     tmpLog.info('final {0} candidates'.format(
                         len(nucleusList)))
                     ######################################
                     # final selection
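                      # roulette-wheel selection: draw a random point in [0, totalWeight] and pick the
                      # nucleus whose cumulative weight first covers it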
                     tgtWeight = random.uniform(0, totalWeight)
                     candidateNucleus = None
                     for tmpNucleus, weight in nucleusweights:
                         tgtWeight -= weight
                         if tgtWeight <= 0:
                             candidateNucleus = tmpNucleus
                             break
                     if candidateNucleus is None:
                         candidateNucleus = nucleusweights[-1][0]
                 ######################################
                 # update
                 nucleusSpec = nucleusList[candidateNucleus]
                 # get output/log datasets
                 tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                     taskSpec.jediTaskID, ['output', 'log'])
                 # get destinations
                 retMap = {
                     taskSpec.jediTaskID:
                     AtlasBrokerUtils.getDictToSetNucleus(
                         nucleusSpec, tmpDatasetSpecs)
                 }
                 tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                 tmpLog.info(
                     '  set nucleus={0} with {1} criteria=+set'.format(
                         candidateNucleus, tmpRet))
                 self.sendLogMessage(tmpLog)
                 if tmpRet:
                     tmpMsg = 'set task_status=ready'
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                 # update RW table
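                  # charge this task's RW to the chosen nucleus for all priorities at or below the
                  # current task priority, so that subsequent brokerage decisions see the new load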
                 self.prioRW.acquire()
                 for prio, rwMap in iteritems(self.prioRW):
                     if prio > taskSpec.currentPriority:
                         continue
                     if candidateNucleus in rwMap:
                         rwMap[candidateNucleus] += taskRW
                     else:
                         rwMap[candidateNucleus] = taskRW
                 self.prioRW.release()
         except Exception:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                 self.__class__.__name__, errtype.__name__, errvalue)
             errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
             errMsg += traceback.format_exc()
             logger.error(errMsg)
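# standalone test driver: sets up the task buffer and DDM interfaces and prepares
# the brokerage for the jediTaskID given on the command line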
tbIF = JediTaskBufferInterface()
tbIF.setupInterface()

siteMapper = tbIF.getSiteMapper()

ddmIF = DDMInterface()
ddmIF.setupInterface()

jediTaskID = int(sys.argv[1])

s, taskSpec = tbIF.getTaskWithID_JEDI(jediTaskID, False)
body = TaskBroker(None, tbIF, ddmIF, taskSpec.vo, taskSpec.prodSourceLabel)
body.initializeMods(tbIF, ddmIF)

taskParam = tbIF.getTaskParamsWithID_JEDI(jediTaskID)
taskParamMap = RefinerUtils.decodeJSON(taskParam)

vo = taskParamMap['vo']
prodSourceLabel = taskParamMap['prodSourceLabel']
taskType = taskParamMap['taskType']

workQueueMapper = tbIF.getWorkQueueMap()
workQueue = workQueueMapper.getQueueWithIDGshare(taskSpec.workQueue_ID,
                                                 taskSpec.gshare)

impl = body.getImpl(vo, prodSourceLabel)

tmpListItem = tbIF.getTasksToBeProcessed_JEDI(None,
                                              None,
                                              None,
                                              None,
Example No. 9
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # reset impl so that a failure before instantiation does not reuse the previous task's impl
                 impl = None
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     tmpStat = Interaction.SC_FAILED
                     continue
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                         # set parent tid
                         if not parent_tid in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # staging
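                 # tasks requesting staging are parked in 'staging' status and revisited
                 # once the input has been staged (taskStatus becomes 'staged')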
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if 'toStaging' in taskParamMap and taskStatus != 'staged':
                         errStr = 'wait until staging is done'
                         impl.taskSpec.status = 'staging'
                         impl.taskSpec.oldStatus = taskStatus
                         impl.taskSpec.setErrDiag(errStr)
                         # not to update some task attributes
                         impl.taskSpec.resetRefinedAttrs()
                         tmpLog.info(errStr)
                         self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID},
                                                           oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False)
                         continue
                 # check parent
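                 # a task with a parent stays pending until the parent completes, unless the
                 # noWaitParent split rule allows it to be refined while the parent is still running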
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if parent_tid not in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     # not to update some task attributes
                                     impl.taskSpec.resetRefinedAttrs()
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             # not to update some task attributes
                             impl.taskSpec.resetRefinedAttrs()
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
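                 # on failure the task is flagged 'tobroken'; on success it is either registered in
                 # one shot (new or staged tasks) or gets new datasets appended for incremental execution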
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus in ['registered', 'staged']:
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # disallow to reset some attributes
                             for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']:
                                 impl.taskSpec.resetChangedAttr(attName)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                             # appending for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Example No. 10
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.debug(
                     '{0} terminating since no more items'.format(
                         self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID, commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(
                     self.logger, ' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill', 'finish', 'reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                 jediTaskID)
                         elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(
                                 jediTaskID, True)
                         if pandaIDs is None:
                             tmpLog.error(
                                 'failed to get PandaIDs for jediTaskID={0}'
                                 .format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr is not None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[
                                                 1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(
                                             tmpItems[0], tmpItems[1])
                                         tmpLog.sendMsg(
                                             tmpMsg, self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate(
                                                 'oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info(
                                         'updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(
                                         jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info(
                                             'wait until datasets are updated to finish'
                                         )
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(
                                         jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap(
                                     )[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(
                                     tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg, self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(
                                     tmpTaskSpec,
                                     {'jediTaskID': jediTaskID},
                                     setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(
                                     str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs
                                 if commentStr and 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(
                                         jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(
                                         len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(
                                         queuedPandaIDs, commentStr, '52',
                                         True)
                                     tmpMsg = "waiting {0} jobs for soft finish".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(
                                         len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg, self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '52',
                                             True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '51',
                                             True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(
                                             pandaIDs, commentStr, '50',
                                             True)
                                     tmpLog.info('done with {0}'.format(
                                         str(tmpRet)))
                 elif commandStr in ['retry', 'incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     # change task params for incexec
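                     # the command comment carries a JSON map of parameter overrides; None values
                     # delete a parameter and 'fixedSandbox' rewrites the -a option, buildSpec and
                     # mergeSpec to reuse the original sandbox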
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                                 jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(
                                 taskParam)
                             # remove some params
                             for newKey in ['nFiles', 'fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except Exception:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(
                                 commentStr)
                             # change params
                             for newKey, newVal in iteritems(newParamMap):
                                 if newVal is None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap[
                                         'jobParameters']:
                                     if tmpParam[
                                             'type'] == 'constant' and re.search(
                                                 '^-a [^ ]+$',
                                                 tmpParam['value']
                                             ) is not None:
                                         tmpParam['value'] = '-a {0}'.format(
                                             taskParamMap['fixedSandbox'])
                                 # build
                                 if 'buildSpec' in taskParamMap:
                                     taskParamMap['buildSpec'][
                                         'archiveName'] = taskParamMap[
                                             'fixedSandbox']
                                 # merge
                                 if 'mergeSpec' in taskParamMap:
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(
                                 taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(
                                 jediTaskID, strTaskParams)
                             if tmpRet is not True:
                                 tmpLog.error(
                                     'failed to update task params')
                                 continue
                         except Exception as e:
                             tmpLog.error(
                                 'failed to change task params with {} {}'.
                                 format(str(e), traceback.format_exc()))
                             continue
                     # retry child tasks
                     if commentStr and 'sole ' in commentStr:
                         retryChildTasks = False
                     else:
                         retryChildTasks = True
                     # discard events
                     if commentStr and 'discard ' in commentStr:
                         discardEvents = True
                     else:
                         discardEvents = False
                     # release un-staged files
                     if commentStr and 'staged ' in commentStr:
                         releaseUnstaged = True
                     else:
                         releaseUnstaged = False
                     tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(
                         jediTaskID,
                         commandStr,
                         retryChildTasks=retryChildTasks,
                         discardEvents=discardEvents,
                         release_unstaged=releaseUnstaged)
                     if tmpRet is True:
                         tmpMsg = 'set task_status={0}'.format(
                             newTaskStatus)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except Exception as e:
             errStr = '{} failed in runImpl() with {} {} '.format(
                 self.__class__.__name__, str(e), traceback.format_exc())
             logger.error(errStr)
Example No. 11
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID,dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                 # get task
                 tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue))
                     taskBroken = True
                 # renaming of parameters
                 if taskParamMap.has_key('nEventsPerInputFile'):
                     taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key('nFilesPerJob'):
                     nFilesPerJob = taskParamMap['nFilesPerJob']
                 # the number of chunks used by scout 
                 nChunksForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     xmlConfig = taskParamMap['loadXML']
                 else:
                     xmlConfig = None
                 # check no wait
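                 # when the parent task is still running and noWaitParent is set, open parent
                 # datasets are treated as mutable so that their contents can be fed incrementally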
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == 'running':
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 checkedMaster = False
                 setFrozenTime = True
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo) 
                     origNumFiles = None
                     if taskParamMap.has_key('nFiles'):
                         origNumFiles = taskParamMap['nFiles']
                     for datasetSpec in dsList:
                         tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID))
                         # get dataset metadata
                         tmpLog.info('get metadata')
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()                    
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {'state':'closed'}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata['state'] == 'open':
                                 # dummy metadata when parent is running
                                 tmpMetadata = {'state':'mutable'}
                             gotMetadata = True
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('{0} failed to get metadata with {1}:{2}'.format(self.__class__.__name__,
                                                                                         errtype.__name__,errvalue))
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = 'broken'
                                 taskBroken = True
                                 # update dataset status    
                                 self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName)   
                             # get the number of events in metadata
                             if taskParamMap.has_key('getNumEventsInMetadata'):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info('get files')
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \
                                             not datasetSpec.containerName in ['',None]:
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                      getNumEvents=getNumEvents,
                                                                      skipDuplicate=skipDuplicate
                                                                      )
                                     tmpLog.info('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName))
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName))
                                         for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info('removed {0}'.format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn',
                                                                      'scope':None,
                                                                      'filesize':0,
                                                                      'checksum':None,
                                                                      }
                                                   }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key('nFiles'):
                                             nPFN = taskParamMap['nFiles']
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                                          'scope':None,
                                                                          'filesize':0,
                                                                          'checksum':None,
                                                                          }
                             except:
                                 errtype,errvalue = sys.exc_info()[:2]
                                 tmpLog.error('{0} failed to get files due to {1}:{2}'.format(self.__class__.__name__, errtype.__name__, errvalue))
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = 'broken'
                                     taskBroken = True
                                     # update dataset status    
                                     self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile  = None
                                 nEventsPerJob   = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key('nEventsPerFile')) or \
                                         (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents')):
                                     if taskParamMap.has_key('nEventsPerFile'):
                                         nEventsPerFile = taskParamMap['nEventsPerFile']
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap['nEvents']
                                     if taskParamMap.has_key('nEventsPerJob'):
                                         nEventsPerJob = taskParamMap['nEventsPerJob']
                                     elif taskParamMap.has_key('nEventsPerRange'):
                                         nEventsPerRange = taskParamMap['nEventsPerRange']
                                 # max attempts
                                 maxAttempt = None
                                 if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                     # max attempts 
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry 
                                         maxAttempt = 1
                                     elif taskParamMap.has_key('maxAttempt'):
                                         maxAttempt = taskParamMap['maxAttempt']
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                 # first event number
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None 
                                 if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'):
                                     nMaxEvents = taskParamMap['nEvents']
                                 # nMaxFiles
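                                 # for secondary datasets nFiles is scaled by the dataset ratio and,
                                 # for event-level splitting, by nEventsPerFile/nEventsPerJob (or per range)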
                                 nMaxFiles = None
                                 if taskParamMap.has_key('nFiles'):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap['nFiles']
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multiplied by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'):
                                             if taskParamMap.has_key('nEventsPerJob'):
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key('nEventsPerRange'):
                                                 if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                     nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange'])
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False    
                                 if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                     useScout = True
                                 # use files with new attempt numbers    
                                 useFilesWithNewAttemptNr = False
                                 if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
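                                 # bulk insert of the file records together with the splitting constraints
                                 # (events/files per job, max attempts, scout settings, etc.)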
                                 tmpLog.info('update contents')
                                 retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet,
                                                                                                                           tmpMetadata['state'],
                                                                                                                           stateUpdateTime,
                                                                                                                           nEventsPerFile,
                                                                                                                           nEventsPerJob,
                                                                                                                           maxAttempt,
                                                                                                                           firstEventNumber,
                                                                                                                           nMaxFiles,
                                                                                                                           nMaxEvents,
                                                                                                                           useScout,
                                                                                                                           fileList,
                                                                                                                           useFilesWithNewAttemptNr,
                                                                                                                           nFilesPerJob,
                                                                                                                           nEventsPerRange,
                                                                                                                           nChunksForScout,
                                                                                                                           includePatt,
                                                                                                                           excludePatt,
                                                                                                                           xmlConfig,
                                                                                                                           noWaitParent,
                                                                                                                           taskSpec.parent_tid,
                                                                                                                           self.pid)
                                 if retDB == False:
                                     taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName,
                                                                                                      diagMap['errMsg']))
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                     tmpLog.info('escape since task or dataset is locked')
                                     break
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName)
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec,
                                                                            'missingFiles':missingFileList} 
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key('nFiles'):
                                         if datasetSpec.isMaster():
                                             taskParamMap['nFiles'] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nChunksForScout = diagMap['nChunksForScout']
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         checkedMaster = True
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap['isRunningTask']:
                                     runningTask = True
                                 # no activated pending input for noWait
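                                 # go on hold while the parent is still producing and nothing is activated or
                                 # pending, unless scouting is used and its chunks are already filled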
                                 if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0):
                                     tmpErrStr = 'insufficient inputs are ready'
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                                     setFrozenTime = False
                                     break
                         tmpLog.info('end loop')
                 # no master input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                     tmpErrStr = 'no master input files. input dataset is empty'
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr,None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = 'tobroken'
                     tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo,taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                    self.ddmIF.getInterface(taskSpec.vo))
                             if not tmpStat:
                                 tmpErrStr = 'failed to initialize TaskGenerator'
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = 'tobroken'
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info('make parent tasks with {0} (if necessary)'.format(taskGenerator.getClassName(taskSpec.vo,
                                                                                                                           taskSpec.prodSourceLabel)))
                                 tmpStat = taskGenerator.doGenerate(taskSpec,taskParamMap,missingFilesMap=missingMap)
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = 'tobroken'
                                     tmpLog.error('failed to make parent tasks')
                         # go to pending state
                         if not taskSpec.status in ['broken','tobroken']:
                             taskSpec.setOnHold()
                         tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime)
                     elif allUpdated:
                         # all OK
                         allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,
                                                                                                    pid=self.pid)
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                 tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Exemplo n.º 12
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more datasets
             if len(taskList) == 0:
                 self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID))
                 tmpLog.debug('start')
                 tmpStat = Interaction.SC_SUCCEEDED
                 errStr = ''
                 # read task parameters
                 try:
                     taskParam = None
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                     tmpLog.debug(taskParam)
                     tmpLog.error(errStr)
                     tmpStat = Interaction.SC_FAILED
                     continue
                 # get impl
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('getting Impl')
                     try:
                         # get VO and sourceLabel
                         vo = taskParamMap['vo']
                         prodSourceLabel = taskParamMap['prodSourceLabel']
                         taskType = taskParamMap['taskType']
                         tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                         # get impl
                         impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                 self.taskBufferIF,self.ddmIF)
                         if impl == None:
                             # task refiner is undefined
                             errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # extract common parameters
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('extracting common')
                     try:
                         # initialize impl
                         impl.initializeRefiner(tmpLog)
                         impl.oldTaskStatus = taskStatus
                         # extract common parameters
                         impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                         # set parent tid
                         if not parent_tid in [None,jediTaskID]:
                             impl.taskSpec.parent_tid = parent_tid
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                                                                traceback.format_exc())
                         tmpLog.error(errStr)
                         tmpStat = Interaction.SC_FAILED
                 # check attribute length
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('checking attribute length')
                     if not impl.taskSpec.checkAttrLength():
                         tmpLog.error(impl.taskSpec.errorDialog)
                         tmpStat = Interaction.SC_FAILED
                 # check parent
                 noWaitParent = False
                 parentState = None
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     if not parent_tid in [None,jediTaskID]:
                         tmpLog.info('check parent task')
                         try:
                             tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                             parentState = tmpStat
                             if tmpStat == 'completed':
                                 # parent is done
                                 tmpStat = Interaction.SC_SUCCEEDED
                             elif tmpStat == 'running':
                                 if not impl.taskSpec.noWaitParent():
                                     # parent is running
                                     errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                     impl.taskSpec.status = taskStatus
                                     impl.taskSpec.setOnHold()
                                     impl.taskSpec.setErrDiag(errStr)
                                     tmpLog.info(errStr)
                                     self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                       oldStatus=[taskStatus],setFrozenTime=False)
                                     continue
                                 else:
                                     # not wait for parent
                                     tmpStat = Interaction.SC_SUCCEEDED
                                     noWaitParent = True
                             else:
                                 # parent is corrupted
                                 tmpStat = Interaction.SC_FAILED
                                 tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                 impl.taskSpec.setErrDiag(tmpErrStr)
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # refine
                 if tmpStat == Interaction.SC_SUCCEEDED:
                     tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                     try:
                         tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         # wait unknown input if noWaitParent or waitInput
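                         # go back to pending instead of failing when the dataset is unknown and the task runs
                         # with noWaitParent or waitInput, when the parent is still running, or on a temporary DDM error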
                         if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \
                                 and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                 or errtype == Interaction.JEDITemporaryError:
                             if impl.taskSpec.noWaitParent() or parentState == 'running':
                                 tmpErrStr = 'pending until parent produces input'
                                 setFrozenTime=False
                             elif errtype == Interaction.JEDITemporaryError:
                                 tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                 setFrozenTime=True
                             else:
                                 tmpErrStr = 'pending until input is staged'
                                 setFrozenTime=True
                             impl.taskSpec.status = taskStatus
                             impl.taskSpec.setOnHold()
                             impl.taskSpec.setErrDiag(tmpErrStr)
                             tmpLog.info(tmpErrStr)
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus],
                                                               insertUnknown=impl.unknownDatasetList,
                                                               setFrozenTime=setFrozenTime)
                             continue
                         else:
                             errStr  = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue)
                             tmpLog.error(errStr)
                             tmpStat = Interaction.SC_FAILED
                 # register
                 if tmpStat != Interaction.SC_SUCCEEDED:
                     tmpLog.error('failed to refine the task')
                     if impl == None or impl.taskSpec == None:
                         tmpTaskSpec = JediTaskSpec()
                         tmpTaskSpec.jediTaskID = jediTaskID
                     else:
                         tmpTaskSpec = impl.taskSpec
                     tmpTaskSpec.status = 'tobroken'
                     if errStr != '':
                         tmpTaskSpec.setErrDiag(errStr,True)
                     self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus])
                 else:
                     tmpLog.info('registering')                    
                     # fill JEDI tables
                     try:
                         # enable protection against task duplication
                         if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                 not impl.taskSpec.checkPreProcessed():
                             uniqueTaskName = True
                         else:
                             uniqueTaskName = False
                         strTaskParams = None
                         if impl.updatedTaskParams != None:
                             strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                         if taskStatus == 'registered':
                             # unset pre-process flag
                             if impl.taskSpec.checkPreProcessed():
                                 impl.taskSpec.setPostPreProcess()
                             # full registration
                             tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                  impl.inMasterDatasetSpec,
                                                                                                  impl.inSecDatasetSpecList,
                                                                                                  impl.outDatasetSpecList,
                                                                                                  impl.outputTemplateMap,
                                                                                                  impl.jobParamsTemplate,
                                                                                                  strTaskParams,
                                                                                                  impl.unmergeMasterDatasetSpec,
                                                                                                  impl.unmergeDatasetSpecMap,
                                                                                                  uniqueTaskName,
                                                                                                  taskStatus) 
                             if not tmpStat:
                                 tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                 tmpLog.error(tmpErrStr)
                                 impl.taskSpec.status = newTaskStatus
                                 impl.taskSpec.setErrDiag(tmpErrStr,True)
                                 self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                                   oldStatus=[taskStatus])
                             tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                             tmpLog.info(tmpMsg)
                             tmpLog.sendMsg(tmpMsg,self.msgType)
                         else:
                             # disable scouts if previous attempt didn't use it
                             if not impl.taskSpec.useScout(splitRule):
                                 impl.taskSpec.setUseScout(False)
                             # update task with new params
                             self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID},
                                                               oldStatus=[taskStatus])
                             # append datasets for incremental execution
                             tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                             impl.inSecDatasetSpecList)
                             if not tmpStat:
                                 tmpLog.error('failed to append datasets for incexec')
                     except:
                         errtype,errvalue = sys.exc_info()[:2]
                         tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                         tmpLog.error(tmpErrStr)
                     else:
                         tmpLog.info('done')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
Exemplo n.º 13
0
 def run(self):
     try:
         # get process lock
         locked = self.taskBufferIF.lockProcess_JEDI(
             vo=self.vo,
             prodSourceLabel=self.prodSourceLabel,
             cloud=None,
             workqueue_id=None,
             resource_name=None,
             component=self.component,
             pid=self.pid,
             timeLimit=10)
         if not locked:
             self.log.debug(
                 'component={0} skipped since locked by another'.format(
                     self.component))
             return
         # get parameters for conversion
         self.log.debug('component={0} start'.format(self.component))
         maxTasks = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_MAX_TASKS', 'jedi', self.vo)
         if maxTasks is None:
             maxTasks = 1
         nEventsToDisable = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_MIN_EVENTS_DISABLE', 'jedi', self.vo)
         if nEventsToDisable is None:
             nEventsToDisable = 100000
         nEventsToEnable = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_MIN_EVENTS_ENABLE', 'jedi', self.vo)
         if nEventsToEnable is None:
             nEventsToEnable = nEventsToDisable * 10
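         # total event budget for jumbo-enabled tasks; defaults to half of maxTasks * nEventsToEnable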
         maxEvents = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_MAX_EVENTS', 'jedi', self.vo)
         if maxEvents is None:
             maxEvents = maxTasks * nEventsToEnable // 2
         nJumboPerTask = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_PER_TASK', 'jedi', self.vo)
         if nJumboPerTask is None:
             nJumboPerTask = 1
         nJumboPerSite = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_PER_SITE', 'jedi', self.vo)
         if nJumboPerSite is None:
             nJumboPerSite = 1
         maxPrio = self.taskBufferIF.getConfigValue(self.component,
                                                    'JUMBO_MAX_CURR_PRIO',
                                                    'jedi', self.vo)
         if maxPrio is None:
             maxPrio = 500
         progressToBoost = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_PROG_TO_BOOST', 'jedi', self.vo)
         if progressToBoost is None:
             progressToBoost = 95
         maxFilesToBoost = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_MAX_FILES_TO_BOOST', 'jedi', self.vo)
         if maxFilesToBoost is None:
             maxFilesToBoost = 500
         prioToBoost = 900
         prioWhenDisabled = self.taskBufferIF.getConfigValue(
             self.component, 'JUMBO_PRIO_DISABLED', 'jedi', self.vo)
         if prioWhenDisabled is None:
             prioWhenDisabled = 500
         # get current info
         tasksWithJumbo = self.taskBufferIF.getTaskWithJumbo_JEDI(
             self.vo, self.prodSourceLabel)
         totEvents = 0
         doneEvents = 0
         nTasks = 0
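         # first pass over tasks that already use jumbo: disable, keep, boost, kick or reset them as needed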
         for jediTaskID, taskData in iteritems(tasksWithJumbo):
             # disable jumbo
             if taskData['useJumbo'] != JediTaskSpec.enum_useJumbo[
                     'disabled'] and taskData['site'] is None:
                 if taskData['nEvents'] - taskData[
                         'nEventsDone'] < nEventsToDisable:
                     # disable
                     self.log.info(
                         'component={0} disable jumbo in jediTaskID={1} due to n_events_to_process={2} < {3}'
                         .format(
                             self.component, jediTaskID,
                             taskData['nEvents'] - taskData['nEventsDone'],
                             nEventsToDisable))
                     self.taskBufferIF.enableJumboJobs(jediTaskID, 0, 0)
                 else:
                     # wait
                     nTasks += 1
                     totEvents += taskData['nEvents']
                     doneEvents += taskData['nEventsDone']
                     self.log.info(
                         'component={0} keep jumbo in jediTaskID={1} due to n_events_to_process={2} > {3}'
                         .format(
                             self.component, jediTaskID,
                             taskData['nEvents'] - taskData['nEventsDone'],
                             nEventsToDisable))
             # increase priority for jumbo disabled
             if taskData['useJumbo'] == JediTaskSpec.enum_useJumbo[
                     'disabled'] and taskData[
                         'currentPriority'] < prioWhenDisabled:
                 self.taskBufferIF.changeTaskPriorityPanda(
                     jediTaskID, prioWhenDisabled)
                 self.log.info(
                     'component={0} priority boost to {1} after disabling jumbo in jediTaskID={2}'
                     .format(self.component, prioWhenDisabled, jediTaskID))
             # increase priority when close to completion
             if taskData['nEvents'] > 0 and (taskData['nEvents'] - taskData['nEventsDone']) * 100 // taskData['nEvents'] < progressToBoost \
                     and taskData['currentPriority'] < prioToBoost and (taskData['nFiles'] - taskData['nFilesDone']) < maxFilesToBoost:
                 # boost
                 tmpStr = 'component={0} priority boost to {5} for jediTaskID={1} due to n_events_done={2} > {3}*{4}% '.format(
                     self.component, jediTaskID, taskData['nEventsDone'],
                     taskData['nEvents'], progressToBoost, prioToBoost)
                 tmpStr += 'n_files_remaining={0} < {1}'.format(
                     taskData['nFiles'] - taskData['nFilesDone'],
                     maxFilesToBoost)
                 self.log.info(tmpStr)
                 self.taskBufferIF.changeTaskPriorityPanda(
                     jediTaskID, prioToBoost)
             # kick pending
             if taskData['taskStatus'] in [
                     'pending', 'running'
             ] and taskData['useJumbo'] in [
                     JediTaskSpec.enum_useJumbo['pending'],
                     JediTaskSpec.enum_useJumbo['running']
             ]:
                 nActiveJumbo = 0
                 for computingSite, jobStatusMap in iteritems(
                         taskData['jumboJobs']):
                     for jobStatus, nJobs in iteritems(jobStatusMap):
                         if jobStatus in [
                                 'defined', 'assigned', 'activated', 'sent',
                                 'starting', 'running', 'transferring',
                                 'holding'
                         ]:
                             nActiveJumbo += nJobs
                 if nActiveJumbo == 0:
                     self.log.info(
                         'component={0} kick jumbo in {2} jediTaskID={1}'.
                         format(self.component, jediTaskID,
                                taskData['taskStatus']))
                     self.taskBufferIF.kickPendingTasksWithJumbo_JEDI(
                         jediTaskID)
             # reset input to re-generate co-jumbo
             if taskData['currentPriority'] >= prioToBoost:
                 nReset = self.taskBufferIF.resetInputToReGenCoJumbo_JEDI(
                     jediTaskID)
                 if nReset is not None and nReset > 0:
                     self.log.info(
                         'component={0} reset {1} inputs to regenerate co-jumbo for jediTaskID={2}'
                         .format(self.component, nReset, jediTaskID))
                 else:
                     self.log.debug(
                         'component={0} tried to reset inputs to regenerate co-jumbo with {1} for jediTaskID={2}'
                         .format(self.component, nReset, jediTaskID))
         self.log.info(
             'component={0} total_events={1} n_events_to_process={2} n_tasks={3} available for jumbo'
             .format(self.component, totEvents, totEvents - doneEvents,
                     nTasks))
         if True:
             # get list of releases and caches available at jumbo job enabled PQs
             jumboRels, jumboCaches = self.taskBufferIF.getRelCacheForJumbo_JEDI(
             )
             # look for tasks to enable jumbo
             if self.dryRun:
                 self.log.info(
                     'component={0} look for tasks to enable jumbo in dry run mode'
                     .format(self.component))
             else:
                 self.log.info(
                     'component={0} look for tasks to enable jumbo due to lack of tasks and events to meet max_tasks={1} max_events={2}'
                     .format(self.component, maxTasks, maxEvents))
             tasksToEnableJumbo = self.taskBufferIF.getTaskToEnableJumbo_JEDI(
                 self.vo, self.prodSourceLabel, maxPrio, nEventsToEnable)
             nGoodTasks = 0
             self.log.debug('component={0} got {1} tasks to check'.format(
                 self.component, len(tasksToEnableJumbo)))
             # sort by nevents
             nEventsMap = dict()
             for jediTaskID, taskData in iteritems(tasksToEnableJumbo):
                 nEventsMap[jediTaskID] = taskData['nEvents']
             sortedList = sorted(list(nEventsMap.items()),
                                 key=operator.itemgetter(1))
             sortedList.reverse()
             for jediTaskID, nEvents in sortedList:
                 taskData = tasksToEnableJumbo[jediTaskID]
                 # get task parameters
                 try:
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                         jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except Exception:
                     self.log.error(
                         'component={0} failed to get task params for jediTaskID={1}'
                         .format(self.component, jediTaskID))
                     continue
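                 # build a minimal task spec carrying only the split rule so its helper methods can be queried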
                 taskSpec = JediTaskSpec()
                 taskSpec.splitRule = taskData['splitRule']
                 # check if good for jumbo
                 if 'esConvertible' not in taskParamMap or taskParamMap[
                         'esConvertible'] is False:
                     self.log.info(
                         'component={0} skip to enable jumbo for jediTaskID={1} since not ES-convertible'
                         .format(self.component, jediTaskID))
                     continue
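                 # eligible when the task reads events by in-file position, uses a single file per job,
                 # or when nEventsPerJob fits within one input file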
                 if taskSpec.inFilePosEvtNum():
                     pass
                 elif taskSpec.getNumFilesPerJob() == 1:
                     pass
                 elif taskSpec.getNumEventsPerJob() is not None and 'nEventsPerInputFile' in taskParamMap \
                         and taskSpec.getNumEventsPerJob() <= taskParamMap['nEventsPerInputFile']:
                     pass
                 else:
                     self.log.info(
                         'component={0} skip to enable jumbo for jediTaskID={1} since not good for in-file positional event numbers'
                         .format(self.component, jediTaskID))
                     continue
                 # check software
                 transHome = taskData['transHome']
                 cmtConfig = taskData['architecture']
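                 # a trailing x.y.z version in transHome denotes a base release; otherwise treat it as a cache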
                 if re.search(r'^\d+\.\d+\.\d+$',
                              transHome.split('-')[-1]) is not None:
                     transHome = transHome.split('-')[-1]
                     swDict = jumboRels
                 else:
                     swDict = jumboCaches
                 key = (transHome, cmtConfig)
                 if key not in swDict:
                     self.log.info(
                         'component={0} skip to enable jumbo for jediTaskID={1} since {2}:{3} is unavailable at jumbo job enabled PQs'
                         .format(self.component, jediTaskID, transHome,
                                 cmtConfig))
                     continue
                 if not self.dryRun and nTasks < maxTasks and (
                         totEvents - doneEvents) < maxEvents:
                     self.log.info(
                         'component={0} enable jumbo in jediTaskID={1} with n_events_to_process={2}'
                         .format(
                             self.component, jediTaskID,
                             taskData['nEvents'] - taskData['nEventsDone']))
                     if taskData['eventService'] == 0:
                         tmpS, tmpO = self.taskBufferIF.enableEventService(
                             taskData['jediTaskID'])
                         if tmpS != 0:
                             self.log.error(
                                 'component={0} failed to enable ES in jediTaskID={1} with {2}'
                                 .format(self.component, jediTaskID, tmpO))
                             continue
                     self.taskBufferIF.enableJumboJobs(
                         taskData['jediTaskID'], nJumboPerTask,
                         nJumboPerSite)
                     nTasks += 1
                     totEvents += taskData['nEvents']
                     doneEvents += taskData['nEventsDone']
                 else:
                     nGoodTasks += 1
                     self.log.info(
                         'component={0} good to enable jumbo in jediTaskID={1} with n_events_to_process={2}'
                         .format(
                             self.component, jediTaskID,
                             taskData['nEvents'] - taskData['nEventsDone']))
             self.log.info(
                 'component={0} there are n_good_tasks={1} tasks good for jumbo'
                 .format(self.component, nGoodTasks))
         self.log.debug('component={0} done'.format(self.component))
     except Exception:
         # error
         errtype, errvalue = sys.exc_info()[:2]
         errStr = ": %s %s" % (errtype.__name__, errvalue)
         errStr = errStr.strip()
         errStr += traceback.format_exc()
         self.log.error(errStr)
Exemplo n.º 14
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskDsList = self.taskDsList.get(nTasks)
             # no more datasets
             if len(taskDsList) == 0:
                 self.logger.debug("%s terminating since no more items" % self.__class__.__name__)
                 return
             # loop over all tasks
             for jediTaskID, dsList in taskDsList:
                 allUpdated = True
                 taskBroken = False
                 taskOnHold = False
                 runningTask = False
                 missingMap = {}
                 # make logger
                 tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID))
                 # get task
                 tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10)
                 if not tmpStat or taskSpec == None:
                     tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID))
                     continue
                 try:
                     # get task parameters
                     taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                     taskParamMap = RefinerUtils.decodeJSON(taskParam)
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error(
                         "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue)
                     )
                     taskBroken = True
                     # fall back to an empty map so the parameter checks below do not raise NameError
                     taskParamMap = {}
                 # renaming of parameters
                 if taskParamMap.has_key("nEventsPerInputFile"):
                     taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"]
                 # the number of files per job
                 nFilesPerJob = None
                 if taskParamMap.has_key("nFilesPerJob"):
                     nFilesPerJob = taskParamMap["nFilesPerJob"]
                 # the number of files used by scout
                 nFilesForScout = 0
                 if nFilesPerJob != None:
                     nFilesForScout = 10 * nFilesPerJob
                 else:
                     nFilesForScout = 10
                 # load XML
                 if taskSpec.useLoadXML():
                     try:
                         loadXML = taskParamMap["loadXML"]
                         xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML)
                     except:
                         errtype, errvalue = sys.exc_info()[:2]
                         tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue))
                         taskBroken = True
                 else:
                     xmlConfig = None
                 # check no wait
                 noWaitParent = False
                 if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]:
                     tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                     if tmpStat == "running":
                         noWaitParent = True
                 # loop over all datasets
                 nFilesMaster = 0
                 if not taskBroken:
                     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                     origNumFiles = None
                     if taskParamMap.has_key("nFiles"):
                         origNumFiles = taskParamMap["nFiles"]
                     for datasetSpec in dsList:
                         tmpLog.info(
                             "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID)
                         )
                         # get dataset metadata
                         tmpLog.info("get metadata")
                         gotMetadata = False
                         stateUpdateTime = datetime.datetime.utcnow()
                         try:
                             if not datasetSpec.isPseudo():
                                 tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                             else:
                                 # dummy metadata for pseudo dataset
                                 tmpMetadata = {"state": "closed"}
                             # set mutable when parent is running and the dataset is open
                             if noWaitParent and tmpMetadata["state"] == "open":
                                 # dummy metadata when parent is running
                                 tmpMetadata = {"state": "mutable"}
                             gotMetadata = True
                         except:
                             errtype, errvalue = sys.exc_info()[:2]
                             tmpLog.error(
                                 "{0} failed to get metadata to {1}:{2}".format(
                                     self.__class__.__name__, errtype.__name__, errvalue
                                 )
                             )
                             if errtype == Interaction.JEDIFatalError:
                                 # fatal error
                                 datasetStatus = "broken"
                                 taskBroken = True
                                 # update dataset status
                                 self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                             else:
                                 # temporary error
                                 taskOnHold = True
                             taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName))
                             allUpdated = False
                         else:
                             # get file list specified in task parameters
                             fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                 taskParamMap, datasetSpec.datasetName
                             )
                             # get the number of events in metadata
                             if taskParamMap.has_key("getNumEventsInMetadata"):
                                 getNumEvents = True
                             else:
                                 getNumEvents = False
                             # get file list from DDM
                             tmpLog.info("get files")
                             try:
                                 useInFilesWithNewAttemptNr = False
                                 skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                 if not datasetSpec.isPseudo():
                                     if (
                                         fileList != []
                                         and taskParamMap.has_key("useInFilesInContainer")
                                         and not datasetSpec.containerName in ["", None]
                                     ):
                                         # read files from container if file list is specified in task parameters
                                         tmpDatasetName = datasetSpec.containerName
                                     else:
                                         tmpDatasetName = datasetSpec.datasetName
                                     tmpRet = ddmIF.getFilesInDataset(
                                         tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate
                                     )
                                     # remove lost files
                                     tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                     if tmpLostFiles != {}:
                                         tmpLog.info(
                                             "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName)
                                         )
                                         for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems():
                                             tmpLog.info("removed {0}".format(tmpLostLFN))
                                             del tmpRet[tmpListGUID]
                                 else:
                                     if not taskSpec.useListPFN():
                                         # dummy file list for pseudo dataset
                                         tmpRet = {
                                             str(uuid.uuid4()): {
                                                 "lfn": "pseudo_lfn",
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                                         }
                                     else:
                                         # make dummy file list for PFN list
                                         if taskParamMap.has_key("nFiles"):
                                             nPFN = taskParamMap["nFiles"]
                                         else:
                                             nPFN = 1
                                         tmpRet = {}
                                         for iPFN in range(nPFN):
                                             tmpRet[str(uuid.uuid4())] = {
                                                 "lfn": "{0:06d}:{1}".format(
                                                     iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1]
                                                 ),
                                                 "scope": None,
                                                 "filesize": 0,
                                                 "checksum": None,
                                             }
                             except:
                                 errtype, errvalue = sys.exc_info()[:2]
                                 tmpLog.error(
                                     "failed to get files due to {0}:{1}".format(
                                         self.__class__.__name__, errtype.__name__, errvalue
                                     )
                                 )
                                 if errtype == Interaction.JEDIFatalError:
                                     # fatal error
                                     datasetStatus = "broken"
                                     taskBroken = True
                                     # update dataset status
                                     self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                 else:
                                     # temporary error
                                     taskOnHold = True
                                 taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName))
                                 allUpdated = False
                             else:
                                 # the number of events per file
                                 nEventsPerFile = None
                                 nEventsPerJob = None
                                 nEventsPerRange = None
                                 if (datasetSpec.isMaster() and taskParamMap.has_key("nEventsPerFile")) or (
                                     datasetSpec.isPseudo() and taskParamMap.has_key("nEvents")
                                 ):
                                     if taskParamMap.has_key("nEventsPerFile"):
                                         nEventsPerFile = taskParamMap["nEventsPerFile"]
                                     elif datasetSpec.isPseudo() and taskParamMap.has_key("nEvents"):
                                         # use nEvents as nEventsPerFile for pseudo input
                                         nEventsPerFile = taskParamMap["nEvents"]
                                     if taskParamMap.has_key("nEventsPerJob"):
                                         nEventsPerJob = taskParamMap["nEventsPerJob"]
                                     elif taskParamMap.has_key("nEventsPerRange"):
                                         nEventsPerRange = taskParamMap["nEventsPerRange"]
                                 # max attempts and first event number
                                 maxAttempt = None
                                 firstEventNumber = None
                                 if datasetSpec.isMaster():
                                     # max attempts
                                     if taskSpec.disableAutoRetry():
                                         # disable auto retry
                                         maxAttempt = 1
                                     elif taskParamMap.has_key("maxAttempt"):
                                         maxAttempt = taskParamMap["maxAttempt"]
                                     else:
                                         # use default value
                                         maxAttempt = 3
                                     # first event number
                                     firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                 # nMaxEvents
                                 nMaxEvents = None
                                 if datasetSpec.isMaster() and taskParamMap.has_key("nEvents"):
                                     nMaxEvents = taskParamMap["nEvents"]
                                 # nMaxFiles
                                 nMaxFiles = None
                                 if taskParamMap.has_key("nFiles"):
                                     if datasetSpec.isMaster():
                                         nMaxFiles = taskParamMap["nFiles"]
                                     else:
                                         # calculate for secondary
                                         nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                         # multiplied by the number of jobs per file for event-level splitting
                                         if nMaxFiles != None and taskParamMap.has_key("nEventsPerFile"):
                                             if taskParamMap.has_key("nEventsPerJob"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerJob"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerJob"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                             elif taskParamMap.has_key("nEventsPerRange"):
                                                 if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]:
                                                     nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float(
                                                         taskParamMap["nEventsPerRange"]
                                                     )
                                                     nMaxFiles = int(math.ceil(nMaxFiles))
                                 # use scout
                                 useScout = False
                                 if datasetSpec.isMaster() and taskSpec.useScout():
                                     useScout = True
                                 # use files with new attempt numbers
                                 useFilesWithNewAttemptNr = False
                                 if (
                                     not datasetSpec.isPseudo()
                                     and fileList != []
                                     and taskParamMap.has_key("useInFilesWithNewAttemptNr")
                                 ):
                                     useFilesWithNewAttemptNr = True
                                 # feed files to the contents table
                                 tmpLog.info("update contents")
                                 retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(
                                     datasetSpec,
                                     tmpRet,
                                     tmpMetadata["state"],
                                     stateUpdateTime,
                                     nEventsPerFile,
                                     nEventsPerJob,
                                     maxAttempt,
                                     firstEventNumber,
                                     nMaxFiles,
                                     nMaxEvents,
                                     useScout,
                                     fileList,
                                     useFilesWithNewAttemptNr,
                                     nFilesPerJob,
                                     nEventsPerRange,
                                     nFilesForScout,
                                     includePatt,
                                     excludePatt,
                                     xmlConfig,
                                     noWaitParent,
                                     taskSpec.parent_tid,
                                 )
                                 if retDB == False:
                                     taskSpec.setErrDiag(
                                         "failed to insert files for {0}. {1}".format(
                                             datasetSpec.datasetName, diagMap["errMsg"]
                                         )
                                     )
                                     allUpdated = False
                                     taskBroken = True
                                     break
                                 elif retDB == None:
                                     # the dataset is locked by another or status is not applicable
                                     allUpdated = False
                                 elif missingFileList != []:
                                     # files are missing
                                     tmpErrStr = "{0} files missing in {1}".format(
                                         len(missingFileList), datasetSpec.datasetName
                                     )
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     allUpdated = False
                                     taskOnHold = True
                                     missingMap[datasetSpec.datasetName] = {
                                         "datasetSpec": datasetSpec,
                                         "missingFiles": missingFileList,
                                     }
                                 else:
                                     # reduce the number of files to be read
                                     if taskParamMap.has_key("nFiles"):
                                         if datasetSpec.isMaster():
                                             taskParamMap["nFiles"] -= nFilesUnique
                                     # reduce the number of files for scout
                                     if useScout:
                                         nFilesForScout = diagMap["nFilesForScout"]
                                     # number of master input files
                                     if datasetSpec.isMaster():
                                         nFilesMaster += nFilesUnique
                                 # running task
                                 if diagMap["isRunningTask"]:
                                     runningTask = True
                                 # no activated pending input for noWait
                                 if noWaitParent and diagMap["nActivatedPending"] == 0:
                                     tmpErrStr = "insufficient inputs are ready"
                                     tmpLog.info(tmpErrStr)
                                     taskSpec.setErrDiag(tmpErrStr)
                                     taskOnHold = True
                         tmpLog.info("end loop")
                 # no master input
                 if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0:
                     tmpErrStr = "no master input files. input dataset is empty"
                     tmpLog.error(tmpErrStr)
                     taskSpec.setErrDiag(tmpErrStr, None)
                     if taskSpec.allowEmptyInput() or noWaitParent:
                         taskOnHold = True
                     else:
                         taskBroken = True
                 # update task status
                 if taskBroken:
                     # task is broken
                     taskSpec.status = "tobroken"
                     tmpMsg = "set task.status={0}".format(taskSpec.status)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg, self.msgType)
                     allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                 # change task status unless the task is running
                 if not runningTask:
                     if taskOnHold:
                         if not noWaitParent:
                             # initialize task generator
                             taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                             tmpStat = taskGenerator.initializeMods(
                                 self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo)
                             )
                             if not tmpStat:
                                 tmpErrStr = "failed to initialize TaskGenerator"
                                 tmpLog.error(tmpErrStr)
                                 taskSpec.status = "tobroken"
                                 taskSpec.setErrDiag(tmpErrStr)
                             else:
                                 # make parent tasks if necessary
                                 tmpLog.info(
                                     "make parent tasks with {0} (if necessary)".format(
                                         taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)
                                     )
                                 )
                                 tmpStat = taskGenerator.doGenerate(
                                     taskSpec, taskParamMap, missingFilesMap=missingMap
                                 )
                                 if tmpStat == Interaction.SC_FATAL:
                                     # failed to make parent tasks
                                     taskSpec.status = "tobroken"
                                     tmpLog.error("failed to make parent tasks")
                         # go to pending state
                         if taskSpec.status not in ["broken", "tobroken"]:
                             taskSpec.setOnHold()
                         tmpMsg = "set task.status={0}".format(taskSpec.status)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                         allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec)
                     elif allUpdated:
                         # all OK
                         allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                             jediTaskID, getTaskStatus=True
                         )
                         tmpMsg = "set task.status={0}".format(newTaskStatus)
                         tmpLog.info(tmpMsg)
                         tmpLog.sendMsg(tmpMsg, self.msgType)
                 tmpLog.info("done")
         except:
             errtype, errvalue = sys.exc_info()[:2]
             logger.error(
                 "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue)
             )
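
The flag handling at the end of this feeder loop is dense, so here is a minimal, self-contained sketch of the decision order it implements. decide_task_status and its arguments are hypothetical names used only for illustration; in JEDI the actual status for the all-updated case is returned by updateTaskStatusByContFeeder_JEDI.

def decide_task_status(task_broken, task_on_hold, all_updated, running_task,
                       n_files_master, allow_empty_input, no_wait_parent):
    # an empty master input either parks or breaks the task
    if not task_on_hold and not task_broken and all_updated and n_files_master == 0:
        if allow_empty_input or no_wait_parent:
            task_on_hold = True
        else:
            task_broken = True
    # a broken task is flagged regardless of whether it is running
    if task_broken:
        return 'tobroken'
    # running tasks keep their current status
    if running_task:
        return None
    # on-hold tasks are parked via setOnHold()
    if task_on_hold:
        return 'pending'
    # all datasets updated: the DB decides the next status (e.g. 'ready')
    if all_updated:
        return 'ready'
    return None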
Exemplo n.º 15
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more tasks
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     # get active PandaIDs to be killed
                     pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                     if pandaIDs == None:
                         tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                         tmpStat = Interaction.SC_FAILED
                     # kill jobs or update task
                     if tmpStat == Interaction.SC_SUCCEEDED:
                         if pandaIDs == []:
                             # done since no active jobs
                             tmpLog.info('completed the command')
                             tmpTaskSpec = JediTaskSpec()
                             tmpTaskSpec.jediTaskID = jediTaskID
                             updateTaskStatus = True
                             if commandStr != 'reassign':
                                 # keep oldStatus for task reassignment since it is reset when actually reassigned
                                 tmpTaskSpec.forceUpdate('oldStatus')
                             else:
                                 # extract cloud or site
                                 tmpItems = commentStr.split(':')
                                 if tmpItems[0] == 'cloud':
                                     tmpTaskSpec.cloud = tmpItems[1]
                                 else:
                                     tmpTaskSpec.site = tmpItems[1]
                                 # back to oldStatus if necessary 
                                 if tmpItems[2] == 'y':
                                     tmpTaskSpec.status = oldStatus
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                     updateTaskStatus = False
                             if updateTaskStatus:
                                 tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                             tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                         else:
                             tmpLog.info('sending kill command')
                             tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                         tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry failed files
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                     if tmpRet == True:
                         tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
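
The incexec branch above merges the parameter overrides encoded in commentStr into the stored task parameters and then points every '-a <archive>' argument at the fixed sandbox. Below is a standalone sketch of that merge on plain dicts, assuming the same key conventions; merge_incexec_params is a hypothetical name, not a JEDI function.

import copy
import re

def merge_incexec_params(task_params, new_params):
    # Sketch: None deletes a key, any other value overwrites it, and a
    # 'fixedSandbox' entry is propagated into the '-a <archive>' arguments.
    params = copy.deepcopy(task_params)
    # these keys must not survive an incremental execution
    for key in ('nFiles', 'fixedSandbox'):
        params.pop(key, None)
    for key, value in new_params.items():
        if value is None:
            params.pop(key, None)      # explicit None removes the parameter
        else:
            params[key] = value        # otherwise overwrite
    sandbox = params.get('fixedSandbox')
    if sandbox:
        # noBuild: rewrite constant '-a <archive>' job parameters
        for param in params.get('jobParameters', []):
            if param.get('type') == 'constant' and re.search(r'^-a [^ ]+$', param.get('value', '')):
                param['value'] = '-a {0}'.format(sandbox)
        # build: replace the archive name
        if 'buildSpec' in params:
            params['buildSpec']['archiveName'] = sandbox
        # merge: rewrite '-a <archive>' in the merge job parameters
        if 'mergeSpec' in params:
            params['mergeSpec']['jobParameters'] = re.sub(
                r'-a [^ ]+', '-a {0}'.format(sandbox),
                params['mergeSpec']['jobParameters'])
    return params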
Exemplo n.º 16
0
 def runImpl(self):
     while True:
         try:
             # get a part of list
             nTasks = 10
             taskList = self.taskList.get(nTasks)
             # no more tasks
             if len(taskList) == 0:
                 self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                 return
             # loop over all tasks
             for jediTaskID,commandMap in taskList:
                 # make logger
                 tmpLog = MsgWrapper(self.logger,' < jediTaskID={0} >'.format(jediTaskID))
                 commandStr = commandMap['command']
                 commentStr = commandMap['comment']
                 oldStatus  = commandMap['oldStatus']
                 tmpLog.info('start for {0}'.format(commandStr))
                 tmpStat = Interaction.SC_SUCCEEDED
                 if commandStr in ['kill','finish','reassign']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # loop twice to see immediate result
                     for iLoop in range(2):
                         # get active PandaIDs to be killed
                         if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr:
                             pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                         elif commandStr == 'reassign' and commentStr != None and 'nokill reassign' in commentStr:
                             pandaIDs = []
                         else:
                             pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                         if pandaIDs == None:
                             tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                             tmpStat = Interaction.SC_FAILED
                         # kill jobs or update task
                         if tmpStat == Interaction.SC_SUCCEEDED:
                             if pandaIDs == []:
                                 # done since no active jobs
                                 tmpMsg = 'completed cleaning jobs'
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpTaskSpec = JediTaskSpec()
                                 tmpTaskSpec.jediTaskID = jediTaskID
                                 updateTaskStatus = True
                                 if commandStr != 'reassign':
                                     # reset oldStatus
                                     # keep oldStatus for task reassignment since it is reset when actually reassigned
                                     tmpTaskSpec.forceUpdate('oldStatus')
                                 else:
                                     # extract cloud or site
                                     if commentStr != None:
                                         tmpItems = commentStr.split(':')
                                         if tmpItems[0] == 'cloud':
                                             tmpTaskSpec.cloud = tmpItems[1]
                                         elif tmpItems[0] == 'nucleus':
                                             tmpTaskSpec.nucleus = tmpItems[1]
                                         else:
                                             tmpTaskSpec.site = tmpItems[1]
                                         tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1])
                                         tmpLog.sendMsg(tmpMsg,self.msgType)
                                         tmpLog.info(tmpMsg)
                                         # back to oldStatus if necessary 
                                         if tmpItems[2] == 'y':
                                             tmpTaskSpec.status = oldStatus
                                             tmpTaskSpec.forceUpdate('oldStatus')
                                             updateTaskStatus = False
                                 if commandStr == 'reassign':
                                     tmpTaskSpec.forceUpdate('errorDialog')
                                 if commandStr == 'finish':
                                     # update datasets
                                     tmpLog.info('updating datasets to finish')
                                     tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid)
                                     if not tmpStat:
                                         tmpLog.info('wait until datasets are updated to finish')
                                     # ignore failGoalUnreached when manually finished
                                     tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID)
                                     tmpTaskSpec.splitRule = taskSpec.splitRule
                                     tmpTaskSpec.unsetFailGoalUnreached()
                                 if updateTaskStatus:
                                     tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                 tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status)
                                 tmpLog.sendMsg(tmpMsg,self.msgType)
                                 tmpLog.info(tmpMsg)
                                 tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID},
                                                                            setOldModTime=True)
                                 tmpLog.info('done with {0}'.format(str(tmpRet)))
                                 break
                             else:
                                 # kill only in the first loop
                                 if iLoop > 0:
                                     break
                                 # wait or kill jobs 
                                 if 'soft finish' in commentStr:
                                     queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                                     tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs,commentStr,'52',True)
                                     tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpRet = True
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                                     break
                                 else:
                                     tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                     tmpLog.info(tmpMsg)
                                     tmpLog.sendMsg(tmpMsg,self.msgType)
                                     if commandStr in ['finish']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True)
                                     elif commandStr in ['reassign']:
                                         # force kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'51',True)
                                     else:
                                         # normal kill
                                         tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                                     tmpLog.info('done with {0}'.format(str(tmpRet)))
                 elif commandStr in ['retry','incexec']:
                     tmpMsg = 'executing {0}'.format(commandStr)
                     tmpLog.info(tmpMsg)
                     tmpLog.sendMsg(tmpMsg,self.msgType)
                     # change task params for incexec
                     if commandStr == 'incexec':
                         try:
                             # read task params
                             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                             taskParamMap = RefinerUtils.decodeJSON(taskParam)
                             # remove some params
                             for newKey in ['nFiles','fixedSandbox']:
                                 try:
                                     del taskParamMap[newKey]
                                 except:
                                     pass
                             # convert new params
                             newParamMap = RefinerUtils.decodeJSON(commentStr)
                             # change params
                             for newKey,newVal in newParamMap.iteritems():
                                 if newVal == None:
                                     # delete
                                     if newKey in taskParamMap:
                                         del taskParamMap[newKey]
                                 else:
                                     # change
                                     taskParamMap[newKey] = newVal
                             # overwrite sandbox
                             if 'fixedSandbox' in taskParamMap:
                                 # noBuild
                                 for tmpParam in taskParamMap['jobParameters']:
                                     if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None:
                                         tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                 # build
                                 if taskParamMap.has_key('buildSpec'):
                                     taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                 # merge
                                 if taskParamMap.has_key('mergeSpec'):
                                     taskParamMap['mergeSpec']['jobParameters'] = \
                                         re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters'])
                             # encode new param
                             strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                             tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                             if tmpRet != True:
                                 tmpLog.error('failed to update task params')
                                 continue
                         except:
                             errtype,errvalue = sys.exc_info()[:2]
                             tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                             continue
                     # retry child tasks
                     if 'sole ' in commentStr:
                         retryChildTasks = False
                     else:
                         retryChildTasks = True
                     # discard events
                     if 'discard ' in commentStr:
                         discardEvents = True
                     else:
                         discardEvents = False
                     tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr,
                                                                             retryChildTasks=retryChildTasks,
                                                                             discardEvents=discardEvents)
                     if tmpRet == True:
                         tmpMsg = 'set task_status={0}'.format(newTaskStatus)
                         tmpLog.sendMsg(tmpMsg,self.msgType)
                         tmpLog.info(tmpMsg)
                     tmpLog.info('done with {0}'.format(tmpRet))
                 else:
                     tmpLog.error('unknown command')
         except:
             errtype,errvalue = sys.exc_info()[:2]
             errStr  = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
             errStr += traceback.format_exc()
             logger.error(errStr)
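
The reassign branch encodes its target in commentStr as '<cloud|nucleus|site>:<name>:<y|n>', where the last field says whether the task should revert to its previous status. A small sketch of that parsing on a plain dict instead of JediTaskSpec; parse_reassign_comment is a hypothetical helper.

def parse_reassign_comment(comment, old_status):
    # Returns the fields to update and whether the command's 'done' status
    # should still be applied afterwards.
    spec = {}
    update_status = True
    if comment:
        scope, name, revert = comment.split(':')[:3]
        if scope == 'cloud':
            spec['cloud'] = name
        elif scope == 'nucleus':
            spec['nucleus'] = name
        else:
            spec['site'] = name
        if revert == 'y':
            # go back to the status the task had before the command
            spec['status'] = old_status
            update_status = False
    return spec, update_status

# e.g. parse_reassign_comment('nucleus:SOME_SITE:y', 'running')
#      -> ({'nucleus': 'SOME_SITE', 'status': 'running'}, False)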
Exemplo n.º 17
0
 def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
     # make logger
     tmpLog = MsgWrapper(logger,
                         '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
     tmpLog.debug('start')
     # return for failure
     retFatal = self.SC_FATAL, inputChunk
     retTmpError = self.SC_FAILED, inputChunk
     # set cloud
     try:
         if not taskParamMap:
             taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(
                 taskSpec.jediTaskID)
             taskParamMap = RefinerUtils.decodeJSON(taskParam)
         if not taskSpec.cloud and 'cloud' in taskParamMap:
             taskSpec.cloud = taskParamMap['cloud']
     except Exception:
         pass
     # get sites in the cloud
     site_preassigned = True
     if taskSpec.site not in ['', None]:
         tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
         if self.siteMapper.checkSite(taskSpec.site):
             scanSiteList = [taskSpec.site]
         else:
             scanSiteList = []
             for tmpSite in self.siteMapper.getCloud(
                     taskSpec.cloud)['sites']:
                 if re.search(taskSpec.site, tmpSite):
                     scanSiteList.append(tmpSite)
             if not scanSiteList:
                 tmpLog.error('unknown site={}'.format(taskSpec.site))
                 taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                 return retTmpError
     elif inputChunk.getPreassignedSite() is not None:
         scanSiteList = [inputChunk.getPreassignedSite()]
         tmpLog.debug('site={0} is pre-assigned in masterDS'.format(
             inputChunk.getPreassignedSite()))
     else:
         site_preassigned = False
         scanSiteList = self.siteMapper.getCloud(taskSpec.cloud)['sites']
         # remove NA
         if 'NA' in scanSiteList:
             scanSiteList.remove('NA')
         tmpLog.debug('cloud=%s has %s candidates' %
                      (taskSpec.cloud, len(scanSiteList)))
     tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
     ######################################
     # selection for status and PandaSite
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check site status
         if tmpSiteSpec.status != 'online' and not site_preassigned:
             tmpLog.debug('  skip %s due to status=%s' %
                          (tmpSiteName, tmpSiteSpec.status))
             continue
         # check PandaSite
         if 'PandaSite' in taskParamMap and taskParamMap['PandaSite']:
             if tmpSiteSpec.pandasite != taskParamMap['PandaSite']:
                 tmpLog.debug('  skip %s due to wrong PandaSite=%s <> %s' %
                              (tmpSiteName, tmpSiteSpec.pandasite,
                               taskParamMap['PandaSite']))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed site status check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for scratch disk
     minDiskCountS = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
                      + inputChunk.getMaxAtomSize())
     minDiskCountS = minDiskCountS // 1024 // 1024
     # size for direct IO sites
     if taskSpec.useLocalIO():
         minDiskCountR = minDiskCountS
     else:
         minDiskCountR = (taskSpec.getOutDiskSize()
                          + taskSpec.getWorkDiskSize())
         minDiskCountR = minDiskCountR // 1024 // 1024
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # check at the site
         if tmpSiteSpec.maxwdir:
             if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec,
                                                    inputChunk):
                 minDiskCount = minDiskCountR
             else:
                 minDiskCount = minDiskCountS
             if minDiskCount > tmpSiteSpec.maxwdir:
                 tmpLog.debug(
                     '  skip {0} due to small scratch disk={1} < {2}'.
                     format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                 continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed scratch disk check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for available space in SE
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
         # free space must be >= 200GB
         diskThreshold = 200
         tmpSpaceSize = tmpSiteSpec.space
         if tmpSiteSpec.space and tmpSpaceSize < diskThreshold:
             tmpLog.debug(
                 '  skip {0} due to disk shortage in SE = {1} < {2}GB'.
                 format(tmpSiteName, tmpSiteSpec.space, diskThreshold))
             continue
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed SE space check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # selection for walltime
     minWalltime = taskSpec.walltime
     if minWalltime not in [0, None]:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # check at the site
             if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                 tmpLog.debug(
                     '  skip {0} due to short site walltime={1}(site upper limit) < {2}'
                     .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                 continue
             if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                 tmpLog.debug(
                     '  skip {0} due to short job walltime={1}(site lower limit) > {2}'
                     .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
             len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for memory
     origMinRamCount = inputChunk.getMaxRamCount()
     if not site_preassigned and origMinRamCount:
         newScanSiteList = []
         for tmpSiteName in scanSiteList:
             tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
             # job memory requirement
             if taskSpec.ramPerCore():
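                 # per-core request: scale by the site's core count and add the task's base RAM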
                 minRamCount = origMinRamCount * (
                     tmpSiteSpec.coreCount if tmpSiteSpec.coreCount else 1)
                 minRamCount += (taskSpec.baseRamCount
                                 if taskSpec.baseRamCount else 0)
             else:
                 minRamCount = origMinRamCount
             # site max memory requirement
             site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0
             # check at the site
             if site_maxmemory and minRamCount and minRamCount > site_maxmemory:
                 tmpMsg = '  skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format(
                     tmpSiteName, site_maxmemory, minRamCount)
                 tmpLog.debug(tmpMsg)
                 continue
             # site min memory requirement
             site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0
             if site_minmemory and minRamCount and minRamCount < site_minmemory:
                 tmpMsg = '  skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format(
                     tmpSiteName, site_minmemory, minRamCount)
                 tmpLog.info(tmpMsg)
                 continue
             newScanSiteList.append(tmpSiteName)
         scanSiteList = newScanSiteList
         tmpLog.debug('{0} candidates passed memory check'.format(
             len(scanSiteList)))
         if scanSiteList == []:
             tmpLog.error('no candidates')
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # selection for nPilot
     nWNmap = self.taskBufferIF.getCurrentSiteData()
     newScanSiteList = []
     for tmpSiteName in scanSiteList:
         # check at the site
         nPilot = 0
         if tmpSiteName in nWNmap:
             nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
         if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
             tmpLog.debug('  skip %s due to no pilot' % tmpSiteName)
             #continue
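             # note: the 'continue' above is commented out, so pilot-less sites are
             # only logged here and still kept as candidates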
         newScanSiteList.append(tmpSiteName)
     scanSiteList = newScanSiteList
     tmpLog.debug('{0} candidates passed pilot activity check'.format(
         len(scanSiteList)))
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # sites already used by task
     tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(
         taskSpec.jediTaskID)
     if not tmpSt:
         tmpLog.error('failed to get sites which already used by task')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # get list of available files
     availableFileMap = {}
     for datasetSpec in inputChunk.getDatasets():
         try:
             # get list of site to be scanned
             tmpLog.debug(
                 'getting the list of available files for {0}'.format(
                     datasetSpec.datasetName))
             fileScanSiteList = []
             for tmpPseudoSiteName in scanSiteList:
                 tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName)
                 tmpSiteName = tmpSiteSpec.get_unified_name()
                 if tmpSiteName in fileScanSiteList:
                     continue
                 fileScanSiteList.append(tmpSiteName)
             # mapping between sites and input storage endpoints
             siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap(
                 fileScanSiteList, self.siteMapper,
                 taskSpec.prodSourceLabel, None)
             # disable file lookup for merge jobs
             if inputChunk.isMerging:
                 checkCompleteness = False
             else:
                 checkCompleteness = True
             if not datasetSpec.isMaster():
                 useCompleteOnly = True
             else:
                 useCompleteOnly = False
             # get available files per site/endpoint
             tmpAvFileMap = self.ddmIF.getAvailableFiles(
                 datasetSpec,
                 siteStorageEP,
                 self.siteMapper,
                 check_completeness=checkCompleteness,
                 file_scan_in_container=False,
                 complete_only=useCompleteOnly)
             if tmpAvFileMap is None:
                 raise Interaction.JEDITemporaryError(
                     'ddmIF.getAvailableFiles failed')
             availableFileMap[datasetSpec.datasetName] = tmpAvFileMap
         except Exception as e:
             tmpLog.error('failed to get available files with {}'.format(e))
             taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
             return retTmpError
     ######################################
     # calculate weight
     tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare(
         taskSpec.vo)
     if not tmpSt:
         tmpLog.error('failed to get job statistics with priority')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     ######################################
     # final procedure
     tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
     weightMap = {}
     candidateSpecList = []
     preSiteCandidateSpec = None
     for tmpSiteName in scanSiteList:
         # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
         nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName,
                                                'running', None, None)
         nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                 tmpSiteName, 'defined',
                                                 None, None)
         nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,
                                                  tmpSiteName, 'activated',
                                                  None, None)
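         # heuristic weight: many running jobs and few queued (assigned + activated)
         # jobs raise the weight; the extra (nAssigned + 1) divisor further
         # penalizes sites that already hold a backlog of assigned jobs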
         weight = float(nRunning + 1) / float(nActivated + nAssigned +
                                              1) / float(nAssigned + 1)
         # make candidate
         siteCandidateSpec = SiteCandidate(tmpSiteName)
         # set weight
         siteCandidateSpec.weight = weight
         # files
         for tmpDatasetName, availableFiles in six.iteritems(
                 availableFileMap):
             if tmpSiteName in availableFiles:
                 siteCandidateSpec.add_local_disk_files(
                     availableFiles[tmpSiteName]['localdisk'])
         # append
         if tmpSiteName in sitesUsedByTask:
             candidateSpecList.append(siteCandidateSpec)
         else:
             if weight not in weightMap:
                 weightMap[weight] = []
             weightMap[weight].append(siteCandidateSpec)
     # limit the number of sites
     maxNumSites = 5
     weightList = list(weightMap.keys())
     weightList.sort()
     weightList.reverse()
     for weightVal in weightList:
         if len(candidateSpecList) >= maxNumSites:
             break
         sitesWithWeight = weightMap[weightVal]
         random.shuffle(sitesWithWeight)
         candidateSpecList += sitesWithWeight[:(maxNumSites -
                                                len(candidateSpecList))]
     # collect site names
     scanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         scanSiteList.append(siteCandidateSpec.siteName)
     # append candidates
     newScanSiteList = []
     for siteCandidateSpec in candidateSpecList:
         # append
         inputChunk.addSiteCandidate(siteCandidateSpec)
         newScanSiteList.append(siteCandidateSpec.siteName)
         tmpLog.debug('  use {} with weight={} nFiles={}'.format(
             siteCandidateSpec.siteName, siteCandidateSpec.weight,
             len(siteCandidateSpec.localDiskFiles)))
     scanSiteList = newScanSiteList
     if scanSiteList == []:
         tmpLog.error('no candidates')
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retTmpError
     # return
     tmpLog.debug('done')
     return self.SC_SUCCEEDED, inputChunk
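
The tail of doBrokerage keeps every site already used by the task, groups the remaining sites by weight, and fills the candidate list from the highest weight down, shuffling within equal weights to break ties. A standalone sketch of that selection over a plain {site: weight} mapping; pick_candidates is a hypothetical name.

import random

def pick_candidates(site_weights, sites_used_by_task, max_num_sites=5):
    # sites already used by the task are always kept and count towards the cap
    candidates = [site for site in site_weights if site in sites_used_by_task]
    weight_map = {}
    for site, weight in site_weights.items():
        if site not in sites_used_by_task:
            weight_map.setdefault(weight, []).append(site)
    # take the remaining sites in descending weight order until the cap is reached
    for weight in sorted(weight_map, reverse=True):
        if len(candidates) >= max_num_sites:
            break
        tied = weight_map[weight]
        random.shuffle(tied)                    # break ties randomly
        candidates += tied[:max_num_sites - len(candidates)]
    return candidates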