Пример #1
0
 def __init__(self, taskBuffer, jobs, logger, params, defaultMap):
     """Sort the given jobs into normal and jumbo lists and configure the instance.

     :param taskBuffer: stored as self.taskBuffer
     :param jobs: iterable of jobs; each one is classified with
                  EventServiceUtils.isJumboJob
     :param logger: stored as self.logger
     :param params: mapping of attribute name -> value; every entry is set
                    on the instance and may overwrite attributes set earlier
     :param defaultMap: mapping of attribute name -> value; an entry is only
                        applied when the instance has no such attribute yet
     """
     # partition the jobs: jumbo jobs are kept apart from the normal ones
     self.jobs = []
     self.jumboJobs = []
     for job in jobs:
         bucket = self.jumboJobs if EventServiceUtils.isJumboJob(job) else self.jobs
         bucket.append(job)
     self.taskBuffer = taskBuffer
     self.logger = logger
     # named parameters become instance attributes
     for name in params:
         setattr(self, name, params[name])
     # defaults fill in whatever is still missing after the named parameters
     for name in defaultMap:
         fallback = defaultMap[name]
         if hasattr(self, name):
             continue
         setattr(self, name, fallback)
Пример #2
0
 def doPostProcess(self,taskSpec,tmpLog):
     """Tidy up the datasets of a task and run the common post-processing.

     After the pre-check, every dataset in taskSpec.datasetSpecList is handled
     according to its type: for 'output' datasets the files that DDM lists but
     that are not among the successful files are removed from the dataset,
     'output'/'log'/'trn_log' datasets are frozen and 'trn_output' datasets
     are deleted. Afterwards the task is paused when duplication is found,
     the event-service dataset is deleted if the task registers ES files,
     a notification is sent and doBasicPostProcess is called.

     :param taskSpec: specification of the task being post-processed
     :param tmpLog: logger receiving all messages
     :return: self.SC_SUCCEEDED on success or when doPreCheck returns a true
              value; self.SC_FAILED when removing files, freezing a dataset
              or sending the notification fails; self.SC_FATAL when
              doPreCheck or doBasicPostProcess raises
     """
     # pre-check: a true result means there is nothing left to do
     try:
         if self.doPreCheck(taskSpec,tmpLog):
             return self.SC_SUCCEEDED
     except Exception as exc:
         tmpLog.error('doPreCheck failed with {0}:{1}'.format(type(exc).__name__,exc))
         return self.SC_FATAL
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
     # loop over all datasets
     for datasetSpec in taskSpec.datasetSpecList:
         dsType = datasetSpec.type
         isOutput = dsType == 'output'
         # skip pseudo output datasets
         if isOutput and datasetSpec.isPseudo():
             continue
         # remove wrong files: anything DDM has that is not a successful file
         try:
             if isOutput:
                 okFiles = self.taskBufferIF.getSuccessfulFiles_JEDI(datasetSpec.jediTaskID,datasetSpec.datasetID)
                 if okFiles is None:
                     tmpLog.warning('failed to get successful files for {0}'.format(datasetSpec.datasetName))
                     return self.SC_FAILED
                 ddmFiles = ddmIF.getFilesInDataset(datasetSpec.datasetName,skipDuplicate=False,ignoreUnknown=True)
                 tmpLog.debug('datasetID={0}:Name={1} has {2} files in DB, {3} files in DDM'.format(
                     datasetSpec.datasetID,datasetSpec.datasetName,len(okFiles),len(ddmFiles)))
                 # collect the files to delete
                 toDelete = []
                 for tmpGUID,attMap in iteritems(ddmFiles):
                     lfn = attMap['lfn']
                     if lfn in okFiles:
                         continue
                     toDelete.append({'scope':attMap['scope'], 'name':lfn})
                     tmpLog.debug('delete {0} from {1}'.format(lfn,datasetSpec.datasetName))
                 if toDelete:
                     ddmIF.deleteFilesFromDataset(datasetSpec.datasetName,toDelete)
         except Exception as exc:
             tmpLog.warning('failed to remove wrong files with {0}:{1}'.format(type(exc).__name__,exc))
             return self.SC_FAILED
         # freeze output and log datasets
         try:
             if dsType in ('output','log','trn_log'):
                 tmpLog.info('freezing datasetID={0}:Name={1}'.format(datasetSpec.datasetID,datasetSpec.datasetName))
                 ddmIF.freezeDataset(datasetSpec.datasetName,ignoreUnknown=True)
         except Exception as exc:
             tmpLog.warning('failed to freeze datasets with {0}:{1}'.format(type(exc).__name__,exc))
             return self.SC_FAILED
         # delete transient datasets; a failure here is only logged
         try:
             if dsType == 'trn_output':
                 tmpLog.debug('deleting datasetID={0}:Name={1}'.format(datasetSpec.datasetID,datasetSpec.datasetName))
                 tmpLog.info(ddmIF.deleteDataset(datasetSpec.datasetName,False,ignoreUnknown=True))
         except Exception as exc:
             tmpLog.warning('failed to delete datasets with {0}:{1}'.format(type(exc).__name__,exc))
     # check duplication, only for tasks ending in finished/done outside the Test share
     if self.getFinalTaskStatus(taskSpec) in ('finished','done') and taskSpec.gshare != 'Test':
         nDup = self.taskBufferIF.checkDuplication_JEDI(taskSpec.jediTaskID)
         tmpLog.debug('checked duplication with {0}'.format(nDup))
         if nDup > 0:
             # pause the task and remember the status it would have had
             errStr = 'paused since {0} duplication found'.format(nDup)
             taskSpec.oldStatus = self.getFinalTaskStatus(taskSpec)
             taskSpec.status = 'paused'
             taskSpec.setErrDiag(errStr)
             tmpLog.debug(errStr)
     # delete ES datasets; a failure here is only logged
     if taskSpec.registerEsFiles():
         try:
             targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
             tmpLog.debug('deleting ES dataset name={0}'.format(targetName))
             tmpLog.debug(ddmIF.deleteDataset(targetName,False,ignoreUnknown=True))
         except Exception as exc:
             tmpLog.warning('failed to delete ES dataset with {0}:{1}'.format(type(exc).__name__,exc))
     # notify the external system
     try:
         AtlasPostProcessorUtils.send_notification(self.taskBufferIF, ddmIF, taskSpec, tmpLog)
     except Exception as exc:
         tmpLog.error('failed to talk to external system with {0}'.format(str(exc)))
         return self.SC_FAILED
     # common post-processing
     try:
         self.doBasicPostProcess(taskSpec,tmpLog)
     except Exception as exc:
         tmpLog.error('doBasicPostProcess failed with {0}:{1}'.format(type(exc).__name__,exc))
         return self.SC_FATAL
     return self.SC_SUCCEEDED
Пример #3
0
 def extractCommon(self,jediTaskID,taskParamMap,workQueueMapper,splitRule):
     """Build the JediTaskSpec of a task from its parameter map.

     The new spec is stored in self.taskSpec. self.cloudName / self.siteName
     are set when 'cloud' / 'site' are given. When 'useJobCloning' is given,
     missing 'nEventsPerWorker', 'nSitesPerJob' and 'nEsConsumers' entries
     are added to taskParamMap itself (the caller's mapping is modified).

     :param jediTaskID: identifier of the task; also used as reqID when the
                        parameter map has no 'reqID'
     :param taskParamMap: dict of task parameters; 'taskName', 'userName',
                          'vo', 'prodSourceLabel', 'taskPriority',
                          'architecture', 'transUses', 'transHome',
                          'transPath', 'processingType' and 'taskType' are
                          read unconditionally, everything else is optional
     :param workQueueMapper: object used to look up the work queue
     :param splitRule: initial split rule stored in the spec
     :raises RuntimeError: when no work queue can be found for the task
     :raises KeyError: when a mandatory key is missing (for a plain dict)
     """
     # make task spec
     taskSpec = JediTaskSpec()
     taskSpec.jediTaskID = jediTaskID
     taskSpec.taskName = taskParamMap['taskName']
     taskSpec.userName = taskParamMap['userName']
     taskSpec.vo = taskParamMap['vo']
     taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
     taskSpec.taskPriority = taskParamMap['taskPriority']
     taskSpec.currentPriority = taskSpec.taskPriority
     taskSpec.architecture = taskParamMap['architecture']
     taskSpec.transUses = taskParamMap['transUses']
     taskSpec.transHome = taskParamMap['transHome']
     taskSpec.transPath = taskParamMap['transPath']
     taskSpec.processingType = taskParamMap['processingType']
     taskSpec.taskType = taskParamMap['taskType']
     taskSpec.splitRule = splitRule
     taskSpec.startTime = datetime.datetime.utcnow()
     # optional attributes copied 1:1 from the parameter map, listed in the
     # order they are assigned; the second item is the value used when the
     # key is absent (noDefault = leave the attribute untouched)
     noDefault = object()
     optionalAttrs = (
         ('workingGroup',     noDefault),
         ('countryGroup',     noDefault),
         ('ticketID',         noDefault),
         ('ticketSystemType', noDefault),
         ('reqID',            jediTaskID),
         ('coreCount',        1),
         ('walltime',         0),
         ('walltimeUnit',     noDefault),
         ('outDiskCount',     0),
         ('outDiskUnit',      noDefault),
         ('workDiskCount',    0),
         ('workDiskUnit',     noDefault),
         ('ramCount',         0),
         ('ramUnit',          noDefault),
         ('baseRamCount',     0),
         # HS06 stuff
         ('cpuTimeUnit',      noDefault),
         ('cpuTime',          noDefault),
         # 90% of cpu efficiency by default
         ('cpuEfficiency',    90),
         # 10min of offset by default
         ('baseWalltime',     10*60),
         # for merge
         ('mergeRamCount',    noDefault),
         ('mergeCoreCount',   noDefault),
     )
     for attrName,fallback in optionalAttrs:
         if attrName in taskParamMap:
             setattr(taskSpec,attrName,taskParamMap[attrName])
         elif fallback is not noDefault:
             setattr(taskSpec,attrName,fallback)
     # scout
     if 'skipScout' not in taskParamMap and not taskSpec.isPostScout():
         taskSpec.setUseScout(True)
     # cloud
     if 'cloud' in taskParamMap:
         self.cloudName = taskParamMap['cloud']
         taskSpec.cloud = self.cloudName
     else:
         # set dummy to force update
         # NOTE(review): the double assignment is kept on purpose; presumably
         # the spec only records an attribute as changed when its value
         # differs - confirm against JediTaskSpec
         taskSpec.cloud = 'dummy'
         taskSpec.cloud = None
     # site
     if 'site' in taskParamMap:
         self.siteName = taskParamMap['site']
         taskSpec.site = self.siteName
     else:
         # set dummy to force update (see the note on cloud above)
         taskSpec.site = 'dummy'
         taskSpec.site = None
     # nucleus
     if 'nucleus' in taskParamMap:
         taskSpec.nucleus = taskParamMap['nucleus']
     # preset some parameters for job cloning
     if 'useJobCloning' in taskParamMap:
         # set implicit parameters; nEsConsumers follows nSitesPerJob
         taskParamMap.setdefault('nEventsPerWorker',1)
         taskParamMap.setdefault('nSitesPerJob',2)
         taskParamMap.setdefault('nEsConsumers',taskParamMap['nSitesPerJob'])
     # event service
     taskSpec.eventService = 1 if 'nEventsPerWorker' in taskParamMap else 0
     # ttcr: requested time to completion
     if 'ttcrTimestamp' in taskParamMap:
         try:
             # get rid of the +00:00 timezone string and parse the timestamp
             taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0], '%Y-%m-%d %H:%M:%S.%f')
         except (IndexError, ValueError):
             # unparsable timestamp: leave ttcRequested unset
             pass
     # goal
     if 'goal' in taskParamMap:
         try:
             goal = int(float(taskParamMap['goal'])*10)
         except (TypeError, ValueError, OverflowError):
             # unparsable goal: leave taskSpec.goal untouched
             pass
         else:
             # assign first and reset afterwards, exactly as before, so the
             # spec sees both assignments
             taskSpec.goal = goal
             if taskSpec.goal >= 1000:
                 taskSpec.goal = None
     # campaign
     if 'campaign' in taskParamMap:
         taskSpec.campaign = taskParamMap['campaign']
     # work queue
     workQueue = None
     if 'workQueueName' in taskParamMap:
         # work queue is specified
         workQueue = workQueueMapper.getQueueWithName(taskSpec.vo,taskSpec.prodSourceLabel,taskParamMap['workQueueName'])
     if workQueue is None:
         # get work queue based on task attributes
         workQueue,_ = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
                                                             taskSpec.prodSourceLabel,
                                                             processingType=taskSpec.processingType,
                                                             workingGroup=taskSpec.workingGroup,
                                                             coreCount=taskSpec.coreCount,
                                                             site=taskSpec.site)
     if workQueue is None:
         errStr  = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel)
         errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format(taskSpec.processingType,
                                                                               taskSpec.workingGroup,
                                                                               taskSpec.coreCount)
         raise RuntimeError(errStr)
     taskSpec.workQueue_ID = workQueue.queue_id
     self.taskSpec = taskSpec
     # set split rule
     ruleToken = JediTaskSpec.splitRuleToken
     if 'tgtNumEventsPerJob' in taskParamMap and 'nFilesPerJob' not in taskParamMap:
         # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
         self.setSplitRule(None,taskParamMap['tgtNumEventsPerJob'],ruleToken['nEventsPerJob'])
     # rules handed to setSplitRule together with the parameter map; the
     # parameter name and the token name are the same for all of them
     for ruleName in ('nFilesPerJob','nEventsPerJob','nGBPerJob','nMaxFilesPerJob',
                      'nEventsPerWorker','useLocalIO','disableAutoRetry','nEsConsumers',
                      'waitInput','addNthFieldToLFN','scoutSuccessRate','t1Weight',
                      'maxAttemptES','nSitesPerJob','nEventsPerMergeJob','nFilesPerMergeJob',
                      'nGBPerMergeJob','nMaxFilesPerMergeJob'):
         self.setSplitRule(taskParamMap,ruleName,ruleToken[ruleName])
     if 'loadXML' in taskParamMap:
         self.setSplitRule(None,3,ruleToken['loadXML'])
         self.setSplitRule(None,4,ruleToken['groupBoundaryID'])
     # flags: the mere presence of the key sets the rule to 1
     for flagName in ('pfnList','noWaitParent','respectLB','reuseSecOnDemand'):
         if flagName in taskParamMap:
             self.setSplitRule(None,1,ruleToken[flagName])
     if 'ddmBackEnd' in taskParamMap:
         self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
     for flagName in ('disableReassign','allowPartialFinish','useExhausted','useRealNumEvents'):
         if flagName in taskParamMap:
             self.setSplitRule(None,1,ruleToken[flagName])
     if 'ipConnectivity' in taskParamMap:
         self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
     if 'altStageOut' in taskParamMap:
         self.taskSpec.setAltStageOut(taskParamMap['altStageOut'])
     if 'allowInputLAN' in taskParamMap:
         self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN'])
     for flagName in ('runUntilClosed','stayOutputOnSite'):
         if flagName in taskParamMap:
             self.setSplitRule(None,1,ruleToken[flagName])
     if 'useJobCloning' in taskParamMap:
         scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning'])
         self.setSplitRule(None,scValue,ruleToken['useJobCloning'])
     # the "== True" comparisons are deliberate: they accept True and 1 but
     # not other truthy values, as the original code did
     if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True:
         self.setSplitRule(None,1,ruleToken['failGoalUnreached'])
     if 'switchEStoNormal' in taskParamMap:
         self.setSplitRule(None,1,ruleToken['switchEStoNormal'])
     if 'nEventsPerRange' in taskParamMap:
         self.setSplitRule(None,1,ruleToken['dynamicNumEvents'])
     if 'allowInputWAN' in taskParamMap and taskParamMap['allowInputWAN'] == True:
         self.setSplitRule(None,1,ruleToken['allowInputWAN'])
     if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True:
         self.setSplitRule(None,1,ruleToken['putLogToOS'])
Пример #4
0
    def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
        # make logger
        tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        # returns
        retFatal    = self.SC_FATAL
        retTmpError = self.SC_FAILED
        retOK       = self.SC_SUCCEEDED
        try:
            # get DDM I/F
            ddmIF = self.ddmIF.getInterface(taskSpec.vo)
            # register datasets
            if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                    # collect datasetID to register datasets/containers just in case
                    for tmpPandaJob in pandaJobs:
                        if not tmpPandaJob.produceUnMerge():
                            for tmpFileSpec in tmpPandaJob.Files:
                                if tmpFileSpec.type in ['output','log']:
                                    if tmpFileSpec.datasetID not in datasetToRegister:
                                        datasetToRegister.append(tmpFileSpec.datasetID)
                tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()

                # loop over all datasets
                avDatasetList = []
                cnDatasetMap  = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    if datasetSpec.isPseudo():
                        tmpLog.info('skip pseudo dataset')
                        continue
                    # DDM backend
                    ddmBackEnd = taskSpec.getDdmBackEnd()
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName is None:
                            continue
                        if targetName not in avDatasetList:
                            # set lifetime
                            if targetName.startswith('panda'):
                                if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                    lifetime = 365
                                else:
                                    lifetime = 14
                            else:
                                lifetime = None
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            if tmpList == []:
                                # get location
                                location = None
                                locForRule = None
                                if targetName == datasetSpec.datasetName:
                                    # dataset
                                    if datasetSpec.site in ['',None]:
                                        if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                            locForRule = datasetSpec.destination
                                        elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                            location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                        elif taskSpec.cloud is not None:
                                            # use T1 SE
                                            tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                            location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken,
                                                                                 taskSpec.prodSourceLabel,
                                                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                    else:
                                        tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken,
                                                                             taskSpec.prodSourceLabel,
                                                                             JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
                                if locForRule is None:
                                    locForRule = location
                                # set metadata
                                if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                                    metaData = {}
                                    metaData['task_id'] = taskSpec.jediTaskID
                                    if taskSpec.campaign not in [None,'']:
                                        metaData['campaign'] = taskSpec.campaign
                                    if datasetSpec.getTransient() is not None:
                                        metaData['transient'] = datasetSpec.getTransient()
                                else:
                                    metaData = None
                                # register dataset/container
                                tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                                         location,
                                                                                                                         ddmBackEnd,
                                                                                                                         lifetime,
                                                                                                                         str(metaData)))
                                tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                                   lifetime=lifetime,metaData=metaData)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    # register location
                                    tmpToRegister = False
                                    if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                                        if taskSpec.workingGroup:
                                            userName = taskSpec.workingGroup
                                        else:
                                            userName = taskSpec.userName
                                        grouping = None
                                        tmpToRegister = True
                                    elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        userName = None
                                        grouping = 'NONE'
                                        tmpToRegister = True
                                    if tmpToRegister:
                                        activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                        tmpLog.info('registering location={} lifetime={} days activity={} grouping={} '
                                                    'owner={}'.format(locForRule, lifetime, activity, grouping,
                                                                      userName))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                                lifetime=lifetime,backEnd=ddmBackEnd,
                                                                                activity=activity,grouping=grouping)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                                          targetName))
                                            return retFatal
                                        # double copy
                                        if userSetup and datasetSpec.type == 'output':
                                            if datasetSpec.destination != datasetSpec.site:
                                                tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                                datasetSpec.site))
                                            else:

                                                second_copy = True
                                                try:
                                                    if taskSpec.site:
                                                        panda_site = siteMapper.getSite(taskSpec.site)
                                                        if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall:
                                                            tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site))
                                                            second_copy = False
                                                except Exception:
                                                    second_copy = True

                                                if second_copy:
                                                    locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True'
                                                    tmpMsg  = 'registering double copy '
                                                    tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                                    activity,targetName)
                                                    tmpLog.info(tmpMsg)
                                                    tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                                            lifetime=lifetime,activity=activity,
                                                                                            grouping='NONE',weight='freespace',
                                                                                            ignore_availability=False)
                                                    if not tmpStat:
                                                        tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble,
                                                                                                                               targetName))
                                                        return retFatal
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if datasetSpec.containerName not in cnDatasetMap:
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                   backEnd=ddmBackEnd)
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # register ES datasets
            if taskSpec.registerEsFiles():
                targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
                location = None
                metaData = {}
                metaData['task_id'] = taskSpec.jediTaskID
                metaData['hidden']  = True
                tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                           location,
                                                                                           str(metaData)))
                tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                   resurrect=True)
                if not tmpStat:
                    tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                    return retFatal
                # register rule
                location = 'type=DATADISK'
                activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                grouping = 'NONE'
                tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                        activity,
                                                                                        grouping))
                tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                        grouping=grouping)
                if not tmpStat:
                    tmpLog.error('failed to register location {0} with {2} for {1}'.format(location,
                                                                                           targetName,
                                                                                           activity))
                    return retFatal
            # open datasets
            if taskSpec.prodSourceLabel in ['managed','test']:
                # get the list of output/log datasets
                outDatasetList = []
                for tmpPandaJob in pandaJobs:
                    for tmpFileSpec in tmpPandaJob.Files:
                        if tmpFileSpec.type in ['output','log']:
                            if tmpFileSpec.destinationDBlock not in outDatasetList:
                                outDatasetList.append(tmpFileSpec.destinationDBlock)
                # open datasets
                for outDataset in outDatasetList:
                    tmpLog.info('open {0}'.format(outDataset))
                    ddmIF.openDataset(outDataset)
                    # unset lifetime
                    ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
            # return
            tmpLog.info('done')
            return retOK
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
Пример #5
0
    def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper,
                      splitRule):
        """Build the task spec from the task parameters and store it on self.

        Creates a JediTaskSpec, copies the mandatory and optional attributes
        from ``taskParamMap``, encodes the split-rule options through
        ``self.setSplitRule``, and resolves the work queue and the global
        share. The result is stored in ``self.taskSpec``; ``self.cloudName``
        and ``self.siteName`` are set when 'cloud' / 'site' are given.

        ``taskParamMap`` is modified in place when 'useJobCloning' is given:
        missing 'nEventsPerWorker', 'nSitesPerJob' and 'nEsConsumers' entries
        are filled in with implicit values.

        :param jediTaskID: identifier of the task; also used as reqID when
            'reqID' is not given
        :param taskParamMap: dictionary of task parameters
        :param workQueueMapper: object providing getQueueWithName() and
            getQueueWithSelParams() used to resolve the work queue
        :param splitRule: initial value for taskSpec.splitRule
        :raises KeyError: if a mandatory task parameter is missing
        :raises RuntimeError: if no work queue matches the task
        """
        # make task spec with the mandatory parameters
        taskSpec = JediTaskSpec()
        taskSpec.jediTaskID = jediTaskID
        taskSpec.taskName = taskParamMap['taskName']
        taskSpec.userName = taskParamMap['userName']
        taskSpec.vo = taskParamMap['vo']
        taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
        taskSpec.taskPriority = taskParamMap['taskPriority']
        taskSpec.currentPriority = taskSpec.taskPriority
        taskSpec.architecture = taskParamMap['architecture']
        taskSpec.transUses = taskParamMap['transUses']
        taskSpec.transHome = taskParamMap['transHome']
        taskSpec.transPath = taskParamMap['transPath']
        taskSpec.processingType = taskParamMap['processingType']
        taskSpec.taskType = taskParamMap['taskType']
        taskSpec.splitRule = splitRule
        taskSpec.startTime = datetime.datetime.utcnow()
        # optional parameters copied only when given
        if 'workingGroup' in taskParamMap:
            taskSpec.workingGroup = taskParamMap['workingGroup']
        if 'countryGroup' in taskParamMap:
            taskSpec.countryGroup = taskParamMap['countryGroup']
        if 'ticketID' in taskParamMap:
            taskSpec.ticketID = taskParamMap['ticketID']
        if 'ticketSystemType' in taskParamMap:
            taskSpec.ticketSystemType = taskParamMap['ticketSystemType']
        # parameters with defaults
        taskSpec.reqID = taskParamMap.get('reqID', jediTaskID)
        taskSpec.coreCount = taskParamMap.get('coreCount', 1)
        taskSpec.walltime = taskParamMap.get('walltime', 0)
        if 'walltimeUnit' not in taskParamMap:
            # force to set NULL so that retried tasks get data from scouts again
            taskSpec.forceUpdate('walltimeUnit')
        taskSpec.outDiskCount = taskParamMap.get('outDiskCount', 0)
        if 'outDiskUnit' in taskParamMap:
            taskSpec.outDiskUnit = taskParamMap['outDiskUnit']
        taskSpec.workDiskCount = taskParamMap.get('workDiskCount', 0)
        if 'workDiskUnit' in taskParamMap:
            taskSpec.workDiskUnit = taskParamMap['workDiskUnit']
        taskSpec.ramCount = taskParamMap.get('ramCount', 0)
        if 'ramUnit' in taskParamMap:
            taskSpec.ramUnit = taskParamMap['ramUnit']
        taskSpec.baseRamCount = taskParamMap.get('baseRamCount', 0)
        # HS06 stuff
        if 'cpuTimeUnit' in taskParamMap:
            taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit']
        if 'cpuTime' in taskParamMap:
            taskSpec.cpuTime = taskParamMap['cpuTime']
        # 90% of cpu efficiency by default
        taskSpec.cpuEfficiency = taskParamMap.get('cpuEfficiency', 90)
        # 10min of offset by default
        taskSpec.baseWalltime = taskParamMap.get('baseWalltime', 10 * 60)
        # for merge
        if 'mergeRamCount' in taskParamMap:
            taskSpec.mergeRamCount = taskParamMap['mergeRamCount']
        if 'mergeCoreCount' in taskParamMap:
            taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount']
        # scout
        if 'skipScout' not in taskParamMap and not taskSpec.isPostScout():
            taskSpec.setUseScout(True)
        # cloud
        if 'cloud' in taskParamMap:
            self.cloudName = taskParamMap['cloud']
            taskSpec.cloud = self.cloudName
        else:
            # set dummy to force update
            # (presumably so the spec registers the attribute as changed
            #  before it is reset to None -- keep the two-step assignment)
            taskSpec.cloud = 'dummy'
            taskSpec.cloud = None
        # site
        if 'site' in taskParamMap:
            self.siteName = taskParamMap['site']
            taskSpec.site = self.siteName
        else:
            # set dummy to force update
            taskSpec.site = 'dummy'
            taskSpec.site = None
        # nucleus
        if 'nucleus' in taskParamMap:
            taskSpec.nucleus = taskParamMap['nucleus']
        # preset some parameters for job cloning
        if 'useJobCloning' in taskParamMap:
            # set implicit parameters (modifies the caller's map in place)
            if 'nEventsPerWorker' not in taskParamMap:
                taskParamMap['nEventsPerWorker'] = 1
            if 'nSitesPerJob' not in taskParamMap:
                taskParamMap['nSitesPerJob'] = 2
            if 'nEsConsumers' not in taskParamMap:
                taskParamMap['nEsConsumers'] = taskParamMap['nSitesPerJob']
        # event service flag: 2 for job cloning, 1 for event service, 0 otherwise
        if 'useJobCloning' in taskParamMap:
            taskSpec.eventService = 2
        elif 'nEventsPerWorker' in taskParamMap:
            taskSpec.eventService = 1
        else:
            taskSpec.eventService = 0
        # ttcr: requested time to completion
        if 'ttcrTimestamp' in taskParamMap:
            try:
                # get rid of the +00:00 timezone string and parse the timestamp
                taskSpec.ttcRequested = datetime.datetime.strptime(
                    taskParamMap['ttcrTimestamp'].split('+')[0],
                    '%Y-%m-%d %H:%M:%S.%f')
            except (IndexError, ValueError):
                # a malformed timestamp is ignored on purpose
                pass
        # goal: stored as the given number times ten; 1000 or more means no goal
        if 'goal' in taskParamMap:
            try:
                taskSpec.goal = int(float(taskParamMap['goal']) * 10)
                if taskSpec.goal >= 1000:
                    taskSpec.goal = None
            except Exception:
                # an unparsable goal is ignored on purpose; unlike a bare
                # except this lets KeyboardInterrupt/SystemExit propagate
                pass
        # campaign
        if 'campaign' in taskParamMap:
            taskSpec.campaign = taskParamMap['campaign']
        # request type
        if 'requestType' in taskParamMap:
            taskSpec.requestType = taskParamMap['requestType']
        # setSplitRule and the setters below work on self.taskSpec
        self.taskSpec = taskSpec
        # set split rule
        tokens = JediTaskSpec.splitRuleToken
        if 'tgtNumEventsPerJob' in taskParamMap:
            # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
            if 'nFilesPerJob' not in taskParamMap:
                self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'],
                                  tokens['nEventsPerJob'])
        # rules whose value is looked up in taskParamMap under the token name;
        # the order is kept as in the original sequence of calls
        for ruleName in ('nFilesPerJob', 'nEventsPerJob', 'nGBPerJob',
                         'nMaxFilesPerJob', 'nEventsPerWorker', 'useLocalIO',
                         'disableAutoRetry', 'nEsConsumers', 'waitInput',
                         'addNthFieldToLFN', 'scoutSuccessRate', 't1Weight',
                         'maxAttemptES', 'nSitesPerJob', 'nJumboJobs',
                         'nEventsPerMergeJob', 'nFilesPerMergeJob',
                         'nGBPerMergeJob', 'nMaxFilesPerMergeJob'):
            self.setSplitRule(taskParamMap, ruleName, tokens[ruleName])
        if 'loadXML' in taskParamMap:
            self.setSplitRule(None, 3, tokens['loadXML'])
            self.setSplitRule(None, 4, tokens['groupBoundaryID'])
        if 'pfnList' in taskParamMap:
            self.setSplitRule(None, 1, tokens['pfnList'])
        if 'noWaitParent' in taskParamMap and \
                taskParamMap['noWaitParent'] == True:
            self.setSplitRule(None, 1, tokens['noWaitParent'])
        if 'respectLB' in taskParamMap:
            self.setSplitRule(None, 1, tokens['respectLB'])
        if 'reuseSecOnDemand' in taskParamMap:
            self.setSplitRule(None, 1, tokens['reuseSecOnDemand'])
        if 'ddmBackEnd' in taskParamMap:
            self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
        if 'disableReassign' in taskParamMap:
            self.setSplitRule(None, 1, tokens['disableReassign'])
        if 'allowPartialFinish' in taskParamMap:
            self.setSplitRule(None, 1, tokens['allowPartialFinish'])
        if 'useExhausted' in taskParamMap:
            self.setSplitRule(None, 1, tokens['useExhausted'])
        if 'useRealNumEvents' in taskParamMap:
            self.setSplitRule(None, 1, tokens['useRealNumEvents'])
        if 'ipConnectivity' in taskParamMap:
            self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
        if 'altStageOut' in taskParamMap:
            self.taskSpec.setAltStageOut(taskParamMap['altStageOut'])
        if 'allowInputLAN' in taskParamMap:
            self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN'])
        if 'runUntilClosed' in taskParamMap:
            self.setSplitRule(None, 1, tokens['runUntilClosed'])
        if 'stayOutputOnSite' in taskParamMap:
            self.setSplitRule(None, 1, tokens['stayOutputOnSite'])
        if 'useJobCloning' in taskParamMap:
            scValue = EventServiceUtils.getJobCloningValue(
                taskParamMap['useJobCloning'])
            self.setSplitRule(None, scValue, tokens['useJobCloning'])
        if 'failWhenGoalUnreached' in taskParamMap and \
                taskParamMap['failWhenGoalUnreached'] == True:
            # the token name differs from the parameter name
            self.setSplitRule(None, 1, tokens['failGoalUnreached'])
        if 'switchEStoNormal' in taskParamMap:
            self.setSplitRule(None, 1, tokens['switchEStoNormal'])
        if 'nEventsPerRange' in taskParamMap:
            # the token name differs from the parameter name
            self.setSplitRule(None, 1, tokens['dynamicNumEvents'])
        if 'allowInputWAN' in taskParamMap and \
                taskParamMap['allowInputWAN'] == True:
            self.setSplitRule(None, 1, tokens['allowInputWAN'])
        if 'putLogToOS' in taskParamMap and taskParamMap['putLogToOS'] == True:
            self.setSplitRule(None, 1, tokens['putLogToOS'])
        if 'mergeEsOnOS' in taskParamMap and \
                taskParamMap['mergeEsOnOS'] == True:
            self.setSplitRule(None, 1, tokens['mergeEsOnOS'])
        if 'writeInputToFile' in taskParamMap and \
                taskParamMap['writeInputToFile'] == True:
            self.setSplitRule(None, 1, tokens['writeInputToFile'])
        if 'useFileAsSourceLFN' in taskParamMap and \
                taskParamMap['useFileAsSourceLFN'] == True:
            self.setSplitRule(None, 1, tokens['useFileAsSourceLFN'])
        if 'ignoreMissingInDS' in taskParamMap and \
                taskParamMap['ignoreMissingInDS'] == True:
            self.setSplitRule(None, 1, tokens['ignoreMissingInDS'])
        # work queue
        workQueue = None
        if 'workQueueName' in taskParamMap:
            # work queue is specified
            workQueue = workQueueMapper.getQueueWithName(
                taskSpec.vo, taskSpec.prodSourceLabel,
                taskParamMap['workQueueName'])
        if workQueue is None:
            # get work queue based on task attributes
            workQueue, _ = workQueueMapper.getQueueWithSelParams(
                taskSpec.vo,
                taskSpec.prodSourceLabel,
                processingType=taskSpec.processingType,
                workingGroup=taskSpec.workingGroup,
                coreCount=taskSpec.coreCount,
                site=taskSpec.site,
                eventService=taskSpec.eventService,
                splitRule=taskSpec.splitRule,
                campaign=taskSpec.campaign)
        if workQueue is None:
            errStr = 'workqueue is undefined for vo={0} label={1} '.format(
                taskSpec.vo, taskSpec.prodSourceLabel)
            errStr += 'processingType={0} workingGroup={1} coreCount={2} eventService={3} '.format(
                taskSpec.processingType, taskSpec.workingGroup,
                taskSpec.coreCount, taskSpec.eventService)
            errStr += 'splitRule={0} campaign={1}'.format(
                taskSpec.splitRule, taskSpec.campaign)
            # call form of raise works on both Python 2 and Python 3
            raise RuntimeError(errStr)
        self.taskSpec.workQueue_ID = workQueue.queue_id

        # Initialize the global share
        if 'gshare' in taskParamMap and self.taskBufferIF.is_valid_share(
                taskParamMap['gshare']):
            # share is specified and valid
            gshare = taskParamMap['gshare']
        else:
            # get share based on definition
            gshare = self.taskBufferIF.get_share_for_task(self.taskSpec)
            if gshare is None:
                gshare = 'No match'
        # store the share in both cases; previously an explicitly requested
        # valid share was computed but never assigned to the task spec
        self.taskSpec.gshare = gshare

        # return
        return
Пример #6
0
 def parseXML(self):
     """Parse the job output report and update the file specs of self.job.

     The report in ``self.data`` is parsed as XML first; if that raises, it
     is parsed as JSON. The collected LFN/GUID/size/checksum/SURL values are
     then matched against ``self.job.Files`` to set each file's status and
     attributes. Event counts and GUIDs are additionally read from
     ``self.job.metadata`` (XML and/or JSON, both best effort).

     Side effects: updates ``self.job.jobStatus``, ``self.job.ddmErrorCode``,
     ``self.job.ddmErrorDiag``, attributes of the file specs, and the
     'endpoint', 'surl', 'nevents', 'lbnr' and 'guid' entries of
     ``self.extraInfo``.

     :return: 0 on success (also when the job has no log/output files),
         2 when the report cannot be parsed, copying files for a variable
         number of outputs fails, or the report lists a file unknown to the
         job. The ``return 1`` branch is currently unreachable.
     """
     # get LFN and GUID
     # self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     log_out = [f for f in self.job.Files if f.type in ['log', 'output']]
     if not log_out:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     # parallel lists: index i of each list describes the same reported file
     lfns = []
     guids = []
     fsizes = []
     md5sums = []
     chksums = []
     surls = []
     fullLfnMap = {}
     nEventsMap = {}
     guidMap = dict()
     try:
         # root  = xml.dom.minidom.parse(self.xmlFile)
         root = xml.dom.minidom.parseString(self.data)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize = None
             md5sum = None
             adler32 = None
             surl = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 # dispatch on the metadata attribute name
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check: discard anything that is not 32 hex digits
                     if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # endpoints
             self.extraInfo['endpoint'][lfn] = []
             for epNode in file.getElementsByTagName('endpoint'):
                 self.extraInfo['endpoint'][lfn].append(
                     str(epNode.firstChild.data))
             # error check: a non-input file needs a size and at least one checksum
             if (lfn not in inputLFNs) and (fsize is None or
                                            (md5sum is None
                                             and adler32 is None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     # incomplete entries are silently skipped for ES merge jobs
                     continue
                 else:
                     raise RuntimeError('fsize/md5sum/adler32/surl=None')
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 is not None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN is not None:
                 fullLfnMap[lfn] = fullLFN
     except Exception:
         # any failure above (including the RuntimeError raised by the error
         # check) falls back to treating self.data as JSON
         # NOTE(review): entries already appended by the XML loop are not
         # cleared before the JSON attempt -- confirm this cannot matter
         # parse json
         try:
             import json
             # with open(self.xmlFile) as tmpF:
             jsonDict = json.loads(self.data)
             for lfn in jsonDict:
                 fileData = jsonDict[lfn]
                 lfn = str(lfn)
                 fsize = None
                 md5sum = None
                 adler32 = None
                 surl = None
                 fullLFN = None
                 guid = str(fileData['guid'])
                 if 'fsize' in fileData:
                     fsize = long(fileData['fsize'])
                 if 'md5sum' in fileData:
                     md5sum = str(fileData['md5sum'])
                     # check: discard anything that is not 32 hex digits
                     if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                         md5sum = None
                 if 'adler32' in fileData:
                     adler32 = str(fileData['adler32'])
                 if 'surl' in fileData:
                     surl = str(fileData['surl'])
                 if 'full_lfn' in fileData:
                     fullLFN = str(fileData['full_lfn'])
                 # endpoints
                 self.extraInfo['endpoint'][lfn] = []
                 if 'endpoint' in fileData:
                     self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                 # error check: same rule as in the XML branch
                 if (lfn not in inputLFNs) and (fsize is None or
                                                 (md5sum is None
                                                  and adler32 is None)):
                     if EventServiceUtils.isEventServiceMerge(self.job):
                         continue
                     else:
                         raise RuntimeError(
                             'fsize/md5sum/adler32/surl=None')
                 # append
                 lfns.append(lfn)
                 guids.append(guid)
                 fsizes.append(fsize)
                 md5sums.append(md5sum)
                 surls.append(surl)
                 if adler32 is not None:
                     # use adler32 if available
                     chksums.append("ad:%s" % adler32)
                 else:
                     chksums.append("md5:%s" % md5sum)
                 if fullLFN is not None:
                     fullLfnMap[lfn] = fullLFN
         except Exception:
             # check if file exists
             # if os.path.exists(self.xmlFile):
             # NOTE(review): the file-existence test was replaced by a
             # constant, so the else branch (return 1) is unreachable
             if True:
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type, value))
                 # set failed anyway
                 self.job.jobStatus = 'failed'
                 # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                 # only blame the report when no pilot/transformation error
                 # is already recorded and the worker did not finish first
                 if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                    (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \
                    (self.job.transExitCode  in [0,'0','NULL']):
                     self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                 return 2
             else:
                 # XML was deleted
                 return 1
     # parse metadata to get nEvents
     # best effort: a failure here is ignored and the JSON attempt follows
     nEventsFrom = None
     try:
         root = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             guidMap[lfn] = guid
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                 # look for the 'events' attribute
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
         nEventsFrom = "xml"
     except Exception:
         pass
     # parse json
     # always attempted, even if the XML parse above succeeded; on success
     # it overrides nEventsFrom
     try:
         import json
         jsonDict = json.loads(self.job.metadata)
         for jsonFileItem in jsonDict['files']['output']:
             for jsonSubFileItem in jsonFileItem['subFiles']:
                 lfn = str(jsonSubFileItem['name'])
                 # each field is optional; a missing or bad value is skipped
                 try:
                     nevents = long(jsonSubFileItem['nentries'])
                     nEventsMap[lfn] = nevents
                 except Exception:
                     pass
                 try:
                     guid = str(jsonSubFileItem['file_guid'])
                     guidMap[lfn] = guid
                 except Exception:
                     pass
         nEventsFrom = "json"
     except Exception:
         pass
     # use nEvents and GUIDs reported by the pilot if no job report
     if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \
             and self.job.prodSourceLabel in ['managed']:
         # every output file gets the job-level event count
         for file in self.job.Files:
             if file.type == 'output':
                 nEventsMap[file.lfn] = self.job.nEvents
         for lfn, guid in zip(lfns, guids):
             guidMap[lfn] = guid
         nEventsFrom = "pilot"
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     self.logger.debug('nEventsFrom=%s' % str(nEventsFrom))
     self.logger.debug('guidMap=%s' % str(guidMap))
     self.logger.debug('self.job.jobStatus=%s in parseXML' %
                       self.job.jobStatus)
     self.logger.debug(
         'isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob(
             self.job), EventServiceUtils.isJumboJob(self.job)))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # copy files for variable number of outputs
     tmpStat = self.copyFilesForVariableNumOutputs(lfns)
     if not tmpStat:
         self.logger.error(
             "failed to copy files for variable number of outputs")
         return 2
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             # an input file listed in the report is marked skipped or
             # failed depending on the job's prodSourceLabel
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user', 'panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in [
                         'managed', 'test'
                 ] + JobUtils.list_ptest_prod_sources:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             # (self.jobStatus is the status held by this object, as opposed
             #  to self.job.jobStatus on the job spec)
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if file.lfn not in lfns:
                 if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \
                         or EventServiceUtils.isJumboJob(self.job):
                     # unset file status for ES jobs
                     pass
                 elif file.isAllowedNoOutput():
                     # allowed not to be produced
                     file.status = 'nooutput'
                     self.logger.debug('set {0} to status={1}'.format(
                         file.lfn, file.status))
                 else:
                     file.status = 'failed'
                     self.job.jobStatus = 'failed'
                     self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                         file.lfn)
                     self.logger.error(self.job.ddmErrorDiag)
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID = guids[i]
                 file.fsize = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 # (from here on file.lfn is the full LFN, so the extraInfo
                 #  and nEventsMap lookups below are keyed by the full LFN)
                 if file.lfn in fullLfnMap:
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents
                 if file.lfn in nEventsMap:
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[
                         file.lfn]
             except Exception:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type, value))
             # set lumi block number
             if lumiBlockNr is not None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
     self.extraInfo['guid'] = guidMap
     # check consistency between XML and filesTable
     # fileList holds the LFNs as they were before any switch to full LFN
     for lfn in lfns:
         if lfn not in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             # one unknown file fails the job and all of its files
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                 lfn)
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
Пример #7
0
    def run(self):
        """Process the output report of one job and finalize the job.

        The report for ``self.jobID`` / ``self.attemptNr`` is read through
        ``self.taskBuffer`` and the job itself is looked up.  Nothing more is
        done when the job is missing, already in a final state, or has a
        different attempt number.  For the event-service registration status
        only the adder plugin's ``registerEventServiceFiles`` is called.
        Otherwise the method

        * checks input file status in JEDI and the job-cloning semaphore,
        * parses the XML via ``self.parseXML`` and executes the adder plugin,
        * applies retrial rules and marks output/log files when the job failed,
          or moves the job to merging/transferring/finished,
        * updates the job in the DB and runs ``Closer`` on the destination
          blocks (and on those of associated cloned jobs).

        On normal completion the job output report is deleted.  When a
        temporary plugin error is ignored, the DB update fails, or any
        exception is raised, the report is only unlocked.

        Returns:
            None
        """
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))

            # got lock, get the report
            report_dict = self.taskBuffer.getJobOutputReport(
                panda_id=self.jobID, attempt_nr=self.attemptNr)
            self.data = report_dict.get('data')

            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job is None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            # elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            #     errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            #     self.logger.error(errMsg)
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO,
                                                       self.job.cloud)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and self.job.taskBufferErrorCode not in \
                                                      [pandaserver.taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI is None:
                        raise RuntimeError(
                            'failed to check file status in JEDI')
                    if fileCheckInJEDI is False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] is True:
                            self.logger.debug("end")
                            # remove Catalog
                            self.taskBuffer.deleteJobOutputReport(
                                panda_id=self.jobID, attempt_nr=self.attemptNr)
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC is None:
                            raise RuntimeError(
                                'failed to check the cloned job')
                        # failed to lock semaphore
                        if checkJC['lock'] is False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if self.job.jobStatus not in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                # the plugin is skipped when parseXML returns 2 or more
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(
                            self.job.VO, self.job.cloud)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}"
                            .format(self.job.VO, traceback.format_exc()))
                        addResult = None
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult is not None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock job output report
                        self.taskBuffer.unlockJobOutputReport(
                            panda_id=self.jobID,
                            attempt_nr=self.attemptNr,
                            pid=self.pid,
                            lock_offset=self.lock_offset)
                        return
                    # failed
                    if addResult is None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    errors = []
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })

                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, errors,
                                self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult is not None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult is not None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult is not None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except Exception:
                            # best effort: a file size that cannot be added is left out of the total
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock job output report
                        self.taskBuffer.unlockJobOutputReport(
                            panda_id=self.jobID,
                            attempt_nr=self.attemptNr,
                            pid=self.pid,
                            lock_offset=self.lock_offset)
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            errors = [{
                                'source': source,
                                'error_code': error_code,
                                'error_diag': error_diag
                            }]
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, errors,
                                job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if file.destinationDBlock not in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin is not None and hasattr(
                                    adderPlugin, 'datasetMap'
                            ) and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            # cThr.start()
                            # cThr.join()
                            cThr.run()
                            del cThr
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID in assDBlockMap:
                                assDBlocks = assDBlockMap[assJobID]
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                # check the job that was just looked up; self.job
                                # cannot be None here, so testing it would hand a
                                # missing associated job to Closer
                                if assJob is None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    # cThr.start()
                                    # cThr.join()
                                    cThr.run()
                                    del cThr
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            # try:
            #     # remove Catalog
            #     os.remove(self.xmlFile)
            # except Exception:
            #     pass
            # remove Catalog
            self.taskBuffer.deleteJobOutputReport(panda_id=self.jobID,
                                                  attempt_nr=self.attemptNr)
            del self.data
            del report_dict
        except Exception as e:
            errStr = ": {} {}".format(str(e), traceback.format_exc())
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock job output report
            self.taskBuffer.unlockJobOutputReport(panda_id=self.jobID,
                                                  attempt_nr=self.attemptNr,
                                                  pid=self.pid,
                                                  lock_offset=self.lock_offset)
Пример #8
0
 def run(self):
     """Freeze the datasets in self.datasets whose files are all in a final status.

     For each (vuid, name, modDate) entry the file statuses registered for the
     destination block are read from filesTable4.  When every file is in
     ready/failed/skipped/merging/finished (and, for event service merge jobs,
     the Closer check of sub datasets in the jobset passes) the dataset is
     closed through rucioAPI where applicable and its row in the Datasets
     table is set to 'completed'; empty datasets are then erased.  Otherwise
     only the modification date of the dataset row is refreshed.

     self.proxyLock is held around every taskBuffer/Closer access and is
     always released, even when the guarded call raises.  self.lock is held
     for the whole run; the thread is removed from self.pool and the lock is
     released on exit no matter how the loop ends.

     Returns:
         None
     """
     def _callLocked(func, *args, **kwargs):
         # run one call under the proxy lock and release it even on failure,
         # so that an exception cannot leave the shared lock held
         self.proxyLock.acquire()
         try:
             return func(*args, **kwargs)
         finally:
             self.proxyLock.release()

     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             _logger.debug("Freezer start %s %s" % (modDate,name))
             retF,resF = _callLocked(taskBuffer.querySQLS,
                                     "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ",
                                     {':destinationDBlock':name})
             if retF < 0:
                 _logger.error("SQL error")
             else:
                 allFinished = True
                 onePandaID = None
                 for tmpPandaID,tmpFileStatus in resF:
                     onePandaID = tmpPandaID
                     if tmpFileStatus not in ['ready', 'failed', 'skipped', 'merging', 'finished']:
                         allFinished = False
                         break
                 # check sub datasets in the jobset for event service job
                 if allFinished:
                     tmpJobs = _callLocked(taskBuffer.getFullJobStatus, [onePandaID])
                     if len(tmpJobs) > 0 and tmpJobs[0] is not None:
                         if EventServiceUtils.isEventServiceMerge(tmpJobs[0]):
                             self.proxyLock.acquire()
                             try:
                                 cThr = Closer(taskBuffer, [], tmpJobs[0])
                                 allFinished = cThr.checkSubDatasetsInJobset()
                             finally:
                                 self.proxyLock.release()
                             _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished))
                 # no files in filesTable
                 if allFinished:
                     _logger.debug("freeze %s " % name)
                     dsExists = True
                     if name.startswith(('pandaddm_', 'user.', 'group.', 'hc_test.', 'panda.um.')):
                         dsExists = False
                     if name.startswith('panda.um.'):
                         retMer,resMer = _callLocked(taskBuffer.querySQLS,
                                                     "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                     {':destinationDBlock':name,
                                                      ':statusM':'merging',
                                                      ':statusF':'failed'})
                         if resMer is not None and len(resMer)>0:
                             mergeID = resMer[0][0]
                             # get merging jobs
                             mergingJobs = _callLocked(taskBuffer.peekJobs,
                                                       [mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                             mergeJob = mergingJobs[0]
                             if mergeJob is not None:
                                 tmpDestDBlocks = []
                                 # get destDBlock
                                 for tmpFile in mergeJob.Files:
                                     if tmpFile.type in ['output','log']:
                                         if tmpFile.destinationDBlock not in tmpDestDBlocks:
                                             tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                 # run
                                 _logger.debug("start JEDI closer for %s " % name)
                                 self.proxyLock.acquire()
                                 try:
                                     cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                     cThr.start()
                                     cThr.join()
                                 finally:
                                     self.proxyLock.release()
                                 _logger.debug("end JEDI closer for %s " % name)
                                 # the Closer run above handles this dataset; skip the status update below
                                 continue
                             else:
                                 _logger.debug("failed to get merging job for %s " % name)
                         else:
                             _logger.debug("failed to get merging file for %s " % name)
                         status,out = True,''
                     elif dsExists:
                         # check if dataset exists
                         status,out = rucioAPI.getMetaData(name)
                         if status == True:
                             if out is not None:
                                 try:
                                     rucioAPI.closeDataset(name)
                                     status = True
                                 except Exception:
                                     errtype,errvalue = sys.exc_info()[:2]
                                     out = 'failed to freeze : {0} {1}'.format(errtype,errvalue)
                                     status = False
                             else:
                                 # dataset not exist
                                 status,out = True,''
                                 dsExists = False
                     else:
                         status,out = True,''
                     if not status:
                         _logger.error('{0} failed to freeze with {1}'.format(name,out))
                     else:
                         varMap = {}
                         varMap[':vuid'] = vuid
                         varMap[':status'] = 'completed'
                         _callLocked(taskBuffer.querySQLS,
                                     "UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                     varMap)
                         if name.startswith(('pandaddm_', 'panda.um.')) or not dsExists:
                             continue
                         # set tobedeleted to dis
                         setTobeDeletedToDis(name)
                         # count # of files
                         status,out = rucioAPI.getNumberOfFiles(name)
                         if status is not True:
                             if status is False:
                                 _logger.error(out)
                         else:
                             _logger.debug(out)
                             try:
                                 nFile = int(out)
                                 _logger.debug(nFile)
                                 if nFile == 0:
                                     # erase dataset
                                     _logger.debug('erase %s' % name)
                                     status,out = rucioAPI.eraseDataset(name)
                                     _logger.debug('OK with %s' % name)
                             except Exception:
                                 # best effort: failing to count or erase must not stop the loop
                                 pass
                 else:
                     _logger.debug("wait %s " % name)
                     _callLocked(taskBuffer.querySQLS,
                                 "UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                 {':vuid':vuid})
             _logger.debug("end %s " % name)
     except Exception:
         errStr = traceback.format_exc()
         _logger.error(errStr)
     finally:
         # always leave the pool and give the slot back, even if logging the error failed
         self.pool.remove(self)
         self.lock.release()
Пример #9
0
 def updateJobs(self, jobList, tmpLog):
     """Sort jobs by state and push every group to the task buffer.

     Jobs are split into four groups: failed/cancelled, waiting, jobs
     without a dispatch block (to be activated) and all remaining jobs
     (moved to "assigned").  Jobs of the last two groups for which
     notDiscardEvents() and allOkEvents() are both true and which are not
     event service merge jobs are set to "finished" one by one instead of
     being activated/updated in bulk.

     Args:
         jobList: job specs to dispatch; jobStatus is modified in place.
         tmpLog: logger used for the per-group debug counters.
     """
     def eventsAllDone(jobSpec):
         # same short-circuit order as the inline conditions it replaces
         return jobSpec.notDiscardEvents() and jobSpec.allOkEvents() \
             and not EventServiceUtils.isEventServiceMerge(jobSpec)

     # sort out jobs
     toAssign = []
     toFail = []
     toActivate = []
     toKeep = []
     for jobSpec in jobList:
         if jobSpec.jobStatus in ['failed', 'cancelled']:
             # failed jobs
             toFail.append(jobSpec)
             continue
         if jobSpec.jobStatus == 'waiting':
             # waiting
             toKeep.append(jobSpec)
             continue
         if jobSpec.dispatchDBlock == 'NULL':
             # no input jobs
             toActivate.append(jobSpec)
             continue
         # normal jobs : change status
         jobSpec.jobStatus = "assigned"
         toAssign.append(jobSpec)

     # trigger merge generation if all events are done (no input jobs)
     pendingActivate = []
     nDone = 0
     for jobSpec in toActivate:
         if not eventsAllDone(jobSpec):
             pendingActivate.append(jobSpec)
             continue
         self.taskBuffer.activateJobs([jobSpec])
         # change status
         jobSpec.jobStatus = "finished"
         self.taskBuffer.updateJobs([jobSpec], False)
         nDone += 1
     tmpLog.debug('# of finished jobs in activated : {0}'.format(nDone))

     # the same for jobs which were just assigned
     pendingAssign = []
     nDone = 0
     for jobSpec in toAssign:
         if not eventsAllDone(jobSpec):
             pendingAssign.append(jobSpec)
             continue
         self.taskBuffer.updateJobs([jobSpec], True)
         # change status
         jobSpec.jobStatus = "finished"
         self.taskBuffer.updateJobs([jobSpec], True)
         nDone += 1
     tmpLog.debug('# of finished jobs in defined : {0}'.format(nDone))

     # update DB with whatever is left in each group
     tmpLog.debug('# of activated jobs : {0}'.format(len(pendingActivate)))
     self.taskBuffer.activateJobs(pendingActivate)
     tmpLog.debug('# of updated jobs : {0}'.format(len(pendingAssign)))
     self.taskBuffer.updateJobs(pendingAssign, True)
     tmpLog.debug('# of failed jobs : {0}'.format(len(toFail)))
     self.taskBuffer.updateJobs(toFail, True)
     tmpLog.debug('# of waiting jobs : {0}'.format(len(toKeep)))
     self.taskBuffer.keepJobs(toKeep)
Пример #10
0
 def doSetup(self,taskSpec,datasetToRegister,pandaJobs):
     """Register the output/log datasets and containers of a task in DDM.

     For every dataset ID in datasetToRegister the dataset (and its
     container, when it has one) is created in DDM unless DDM already
     lists it, replication rules are registered where needed, the dataset
     is added to its container and its status is set to 'registered' in
     the DB.  When taskSpec.registerEsFiles() is true an additional hidden
     event service dataset is registered together with a rule.  For
     'managed' and 'test' tasks the output/log destination blocks of
     pandaJobs are opened and their lifetime is unset.

     Args:
         taskSpec: task specification.
         datasetToRegister: list of dataset IDs.  For 'user' tasks it is
             extended IN PLACE with the dataset IDs of output/log files of
             jobs for which produceUnMerge() is false.
         pandaJobs: job specs whose Files are scanned.

     Returns:
         self.SC_SUCCEEDED when everything went through, self.SC_FATAL when
         a DB/DDM call reports failure or an exception is raised (in the
         latter case the error diag of taskSpec is set as well).
     """
     # make logger
     tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
     tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
     # returns
     retFatal = self.SC_FATAL
     retOK    = self.SC_SUCCEEDED
     try:
         # get DDM I/F
         ddmIF = self.ddmIF.getInterface(taskSpec.vo)
         # register datasets
         if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
             # prod vs anal
             userSetup = False
             if taskSpec.prodSourceLabel in ['user']:
                 userSetup = True
                 # collect datasetID to register datasets/containers just in case
                 for tmpPandaJob in pandaJobs:
                     if not tmpPandaJob.produceUnMerge():
                         for tmpFileSpec in tmpPandaJob.Files:
                             if tmpFileSpec.type in ['output','log']:
                                 if tmpFileSpec.datasetID not in datasetToRegister:
                                     datasetToRegister.append(tmpFileSpec.datasetID)
             tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
             # get site mapper
             siteMapper = self.taskBufferIF.getSiteMapper()
             # loop over all datasets
             avDatasetList = []
             cnDatasetMap  = {}
             for datasetID in datasetToRegister:
                 # get output and log datasets
                 tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                 tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                               datasetID)
                 if not tmpStat:
                     tmpLog.error('failed to get output and log datasets')
                     return retFatal
                 if datasetSpec.isPseudo():
                     tmpLog.info('skip pseudo dataset')
                     continue
                 # DDM backend
                 ddmBackEnd = taskSpec.getDdmBackEnd()
                 tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                 # check if dataset and container are available in DDM
                 for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                     # nothing to do without a name or when it was registered in this call
                     if targetName is None or targetName in avDatasetList:
                         continue
                     # set lifetime
                     if targetName.startswith('panda'):
                         if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                             lifetime = 365
                         else:
                             lifetime = 14
                     else:
                         lifetime = None
                     # check dataset/container in DDM
                     tmpList = ddmIF.listDatasets(targetName)
                     if tmpList != []:
                         tmpLog.info('{0} already registered'.format(targetName))
                         continue
                     # get location
                     location = None
                     locForRule = None
                     if targetName == datasetSpec.datasetName:
                         # dataset
                         if datasetSpec.site in ['',None]:
                             if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                 locForRule = datasetSpec.destination
                             elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                 location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                             elif taskSpec.cloud is not None:
                                 # use T1 SE
                                 tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                 location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken)
                         else:
                             # both the site and the token go to the log
                             tmpLog.info('site={0} token={1}'.format(datasetSpec.site,datasetSpec.storageToken))
                             location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                     if locForRule is None:
                         locForRule = location
                     # set metadata
                     if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName:
                         metaData = {}
                         metaData['task_id'] = taskSpec.jediTaskID
                         if taskSpec.campaign not in [None,'']:
                             metaData['campaign'] = taskSpec.campaign
                         if datasetSpec.getTransient() is not None:
                             metaData['transient'] = datasetSpec.getTransient()
                     else:
                         metaData = None
                     # register dataset/container
                     tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName,
                                                                                                              location,
                                                                                                              ddmBackEnd,
                                                                                                              lifetime,
                                                                                                              str(metaData)))
                     tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location,
                                                        lifetime=lifetime,metaData=metaData)
                     if not tmpStat:
                         tmpLog.error('failed to register {0}'.format(targetName))
                         return retFatal
                     # procedures for user
                     if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                         # register location
                         tmpToRegister = False
                         if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]:
                             userName = taskSpec.userName
                             grouping = None
                             tmpToRegister = True
                         elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                             userName = None
                             grouping = 'NONE'
                             tmpToRegister = True
                         if tmpToRegister:
                             activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                             tmpLog.info('registering location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime,
                                                                                                                      activity,grouping))
                             tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName,
                                                                     lifetime=lifetime,backEnd=ddmBackEnd,
                                                                     activity=activity,grouping=grouping)
                             if not tmpStat:
                                 tmpLog.error('failed to register location {0} for {1}'.format(locForRule,
                                                                                               targetName))
                                 return retFatal
                             # double copy
                             if userSetup and datasetSpec.type == 'output':
                                 if datasetSpec.destination != datasetSpec.site:
                                     tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination,
                                                                                                                     datasetSpec.site))
                                 else:
                                     locForDouble = '(type=SCRATCHDISK)\\notforextracopy=1'
                                     tmpMsg  = 'registering double copy '
                                     tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime,
                                                                                                                     activity,targetName)
                                     tmpLog.info(tmpMsg)
                                     tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName,
                                                                             lifetime=lifetime,activity=activity,
                                                                             grouping='NONE',weight='freespace',
                                                                             ignore_availability=False)
                                     if not tmpStat:
                                         tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble,
                                                                                                                targetName))
                                         return retFatal
                     # remember the name to avoid registering it again for another dataset
                     avDatasetList.append(targetName)
                 # check if dataset is in the container
                 if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                     # get list of constituent datasets in the container (cached per container)
                     if datasetSpec.containerName not in cnDatasetMap:
                         cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                     # add dataset
                     if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                         tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                         tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName],
                                                                backEnd=ddmBackEnd)
                         if not tmpStat:
                             tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                            datasetSpec.containerName))
                             return retFatal
                         cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                     else:
                         tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                 # update dataset
                 datasetSpec.status = 'registered'
                 self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                   'datasetID':datasetID})
         # register ES datasets
         if taskSpec.registerEsFiles():
             targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
             location = None
             metaData = {}
             metaData['task_id'] = taskSpec.jediTaskID
             metaData['hidden']  = True
             tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName,
                                                                                        location,
                                                                                        str(metaData)))
             tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData,
                                                resurrect=True)
             if not tmpStat:
                 tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                 return retFatal
             # register rule
             location = 'type=DATADISK'
             activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
             grouping = 'NONE'
             tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location,
                                                                                     activity,
                                                                                     grouping))
             tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity,
                                                     grouping=grouping)
             if not tmpStat:
                 tmpLog.error('failed to register location {0} with {2} for {1}'.format(location,
                                                                                        targetName,
                                                                                        activity))
                 return retFatal
         # open datasets
         if taskSpec.prodSourceLabel in ['managed','test']:
             # get the list of output/log datasets
             outDatasetList = []
             for tmpPandaJob in pandaJobs:
                 for tmpFileSpec in tmpPandaJob.Files:
                     if tmpFileSpec.type in ['output','log']:
                         if tmpFileSpec.destinationDBlock not in outDatasetList:
                             outDatasetList.append(tmpFileSpec.destinationDBlock)
             # open datasets
             for outDataset in outDatasetList:
                 tmpLog.info('open {0}'.format(outDataset))
                 ddmIF.openDataset(outDataset)
                 # unset lifetime
                 ddmIF.setDatasetMetadata(outDataset,'lifetime',None)
         # return
         tmpLog.info('done')
         return retOK
     except Exception:
         # KeyboardInterrupt/SystemExit are deliberately not trapped here
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
         taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
         return retFatal
Пример #11
0
    def run(self):
        """Watch one job and fail it once its heartbeat has timed out.

        The job with self.pandaID is looked up every self.sleepTime minutes
        (only once when self.single is set).  The method returns when the
        job is not found, when the job is in a status which is not watched,
        or after the job has been failed.  The job is failed when its
        modificationTime, or its endTime if that is set, is older than
        self.sleepTime minutes: the dispatcher error code/diag is filled,
        output and log files are marked as failed, the job is updated,
        retrial rules are applied and a Closer is run over the destination
        blocks of those files.

        Exceptions are logged through _logger and never propagate.
        """
        try:
            while True:
                _logger.debug('%s start' % self.pandaID)
                # query job
                job = self.taskBuffer.peekJobs([self.pandaID],
                                               fromDefined=False,
                                               fromArchived=False,
                                               fromWaiting=False)[0]
                # check job status; the None test must come before any
                # attribute access on the job
                if job is None:
                    _logger.debug('%s escape : not found' % self.pandaID)
                    return
                _logger.debug('%s in %s' % (self.pandaID, job.jobStatus))
                if job.jobStatus not in [
                        'running', 'sent', 'starting', 'holding', 'stagein',
                        'stageout'
                ]:
                    if job.jobStatus == 'transferring' and (
                            job.prodSourceLabel in ['user', 'panda']
                            or job.jobSubStatus not in [None, 'NULL', '']):
                        pass
                    else:
                        _logger.debug('%s escape : %s' %
                                      (self.pandaID, job.jobStatus))
                        return
                # time limit
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                    minutes=self.sleepTime)
                if job.modificationTime < timeLimit or (
                        job.endTime != 'NULL' and job.endTime < timeLimit):
                    _logger.debug(
                        '%s %s lastmod:%s endtime:%s' %
                        (job.PandaID, job.jobStatus, str(
                            job.modificationTime), str(job.endTime)))
                    destDBList = []
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                    job.modificationTime)
                            else:
                                if job.jobStatus == 'holding':
                                    job.jobDispatcherErrorCode = ErrorCode.EC_Holding
                                elif job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorCode = ErrorCode.EC_Transferring
                                else:
                                    job.jobDispatcherErrorCode = ErrorCode.EC_Timeout
                                job.jobDispatcherErrorDiag = 'timeout in {0} : last heartbeat at {1}'.format(
                                    job.jobStatus, str(job.endTime))
                            # get worker
                            workerSpecs = self.taskBuffer.getWorkersForJob(
                                job.PandaID)
                            if len(workerSpecs) > 0:
                                workerSpec = workerSpecs[0]
                                if workerSpec.status in [
                                        'finished', 'failed', 'cancelled',
                                        'missed'
                                ]:
                                    job.supErrorCode = SupErrors.error_codes[
                                        'WORKER_ALREADY_DONE']
                                    job.supErrorDiag = 'worker already {0} at {1} with {2}'.format(
                                        workerSpec.status,
                                        str(workerSpec.endTime),
                                        workerSpec.diagMessage)
                                    job.supErrorDiag = JobSpec.truncateStringAttr(
                                        'supErrorDiag', job.supErrorDiag)
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (
                            self.sleepTime / 60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for fileSpec in job.Files:
                        if fileSpec.type == 'output' or fileSpec.type == 'log':
                            fileSpec.status = 'failed'
                            if fileSpec.destinationDBlock not in destDBList:
                                destDBList.append(fileSpec.destinationDBlock)
                    # event service
                    if EventServiceUtils.isEventServiceJob(
                            job
                    ) and not EventServiceUtils.isJobCloningJob(job):
                        eventStat = self.taskBuffer.getEventStat(
                            job.jediTaskID, job.PandaID)
                        # set sub status when no successful events
                        if EventServiceUtils.ST_finished not in eventStat:
                            job.jobSubStatus = 'es_heartbeat'
                    # update job
                    self.taskBuffer.updateJobs([job], False)
                    # start closer
                    if job.jobStatus == 'failed':

                        source = 'jobDispatcherErrorCode'
                        error_code = job.jobDispatcherErrorCode
                        error_diag = job.jobDispatcherErrorDiag

                        try:
                            _logger.debug(
                                "Watcher will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job.PandaID, source,
                                error_code, error_diag, job.attemptNr)
                            _logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            _logger.debug(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        try:

                            _logger.debug("Watcher.run will peek the job")
                            job_tmp = self.taskBuffer.peekJobs(
                                [job.PandaID],
                                fromDefined=False,
                                fromArchived=True,
                                fromWaiting=False)[0]
                            if job_tmp.taskBufferErrorCode:
                                source = 'taskBufferErrorCode'
                                error_code = job_tmp.taskBufferErrorCode
                                error_diag = job_tmp.taskBufferErrorDiag
                                _logger.debug(
                                    "Watcher.run 2 will call apply_retrial_rules"
                                )
                                retryModule.apply_retrial_rules(
                                    self.taskBuffer, job_tmp.PandaID, source,
                                    error_code, error_diag, job_tmp.attemptNr)
                                _logger.debug("apply_retrial_rules 2 is back")
                        except IndexError:
                            pass
                        except Exception as e:
                            # same module-level logger as everywhere else in
                            # this method, so that reporting the failure
                            # cannot fail itself on a missing attribute
                            _logger.error(
                                "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                        cThr = Closer(self.taskBuffer, destDBList, job)
                        cThr.start()
                        cThr.join()
                    _logger.debug('%s end' % job.PandaID)
                    return
                # single action
                if self.single:
                    return
                # sleep
                time.sleep(60 * self.sleepTime)
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            _logger.error("run() : %s %s" % (errType, errValue))
            return
Пример #12
0
 def run(self):
     try:
         _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
         flagComplete    = True
         topUserDsList   = []
         usingMerger     = False        
         disableNotifier = False
         firstIndvDS     = True
         finalStatusDS   = []
         for destinationDBlock in self.destinationDBlocks:
             dsList = []
             _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
             # ignore tid datasets
             if re.search('_tid[\d_]+$',destinationDBlock):
                 _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))                
                 continue
             # ignore HC datasets
             if re.search('^hc_test\.',destinationDBlock) is not None or re.search('^user\.gangarbt\.',destinationDBlock) is not None:
                 if re.search('_sub\d+$',destinationDBlock) is None and re.search('\.lib$',destinationDBlock) is None:
                     _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))                
                     continue
             # query dataset
             if destinationDBlock in self.datasetMap:
                 dataset = self.datasetMap[destinationDBlock]
             else:
                 dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
             if dataset is None:
                 _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                 flagComplete = False
                 continue
             # skip tobedeleted/tobeclosed 
             if dataset.status in ['cleanup','tobeclosed','completed','deleted']:
                 _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                 continue
             dsList.append(dataset)
             # sort
             dsList.sort()
             # count number of completed files
             notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                            'status':'unknown'})
             if notFinish < 0:
                 _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                 flagComplete = False                
                 continue
             # check if completed
             _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
             if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                 # close non-DQ2 destinationDBlock immediately
                 finalStatus = 'closed'
             elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                 # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                 finalStatus = 'closed'
             elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                      and self.job.processingType != 'usermerge':
                 # merge output files
                 if firstIndvDS:
                     # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                     finalStatus = 'tobemerged'
                     firstIndvDS = False
                 else:
                     finalStatus = 'tobeclosed'
                 # set merging to top dataset
                 usingMerger = True
                 # disable Notifier
                 disableNotifier = True
             elif self.job.produceUnMerge():
                 finalStatus = 'doing'
             else:
                 # set status to 'tobeclosed' to trigger DQ2 closing
                 finalStatus = 'tobeclosed'
             if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job):
                 allInJobsetFinished = self.checkSubDatasetsInJobset()
             else:
                 allInJobsetFinished = True
             if notFinish == 0 and allInJobsetFinished: 
                 _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                 # set status
                 dataset.status = finalStatus
                 # update dataset in DB
                 retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                       criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 if len(retT) > 0 and retT[0]==1:
                     finalStatusDS += dsList
                     # close user datasets
                     if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                            and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                         # get top-level user dataset 
                         topUserDsName = re.sub('_sub\d+$','',dataset.name)
                         # update if it is the first attempt
                         if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                             topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                             if topUserDs is not None:
                                 # check status
                                 if topUserDs.status in ['completed','cleanup','tobeclosed','deleted',
                                                         'tobemerged','merging']:
                                     _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                 else:
                                     # set status
                                     if self.job.processingType.startswith('gangarobot') or \
                                            self.job.processingType.startswith('hammercloud'):
                                         # not trigger freezing for HC datasets so that files can be appended
                                         topUserDs.status = 'completed'
                                     elif not usingMerger:
                                         topUserDs.status = finalStatus
                                     else:
                                         topUserDs.status = 'merging'
                                     # append to avoid repetition
                                     topUserDsList.append(topUserDsName)
                                     # update DB
                                     retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                              criteriaMap={':crStatus':topUserDs.status})
                                     if len(retTopT) > 0 and retTopT[0]==1:
                                         _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                     else:
                                         _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                         # get parent dataset for merge job
                         if self.job.processingType == 'usermerge':
                             tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                             if tmpMatch is None:
                                 _logger.error('%s failed to extract parentDS' % self.pandaID)
                             else:
                                 unmergedDsName = tmpMatch.group(1)
                                 # update if it is the first attempt
                                 if not unmergedDsName in topUserDsList:
                                     unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                     if unmergedDs is None:
                                         _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                     else:
                                         # check status
                                         if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                             _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                         else:
                                             # set status
                                             unmergedDs.status = finalStatus
                                             # append to avoid repetition
                                             topUserDsList.append(unmergedDsName)
                                             # update DB
                                             retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                      criteriaMap={':crStatus':unmergedDs.status})
                                             if len(retTopT) > 0 and retTopT[0]==1:
                                                 _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                             else:
                                                 _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                     # start Activator
                     if re.search('_sub\d+$',dataset.name) is None:
                         if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                             # don't trigger Activator for merge jobs
                             pass
                         else:
                             if self.job.jobStatus == 'finished':
                                 aThr = Activator(self.taskBuffer,dataset)
                                 aThr.start()
                                 aThr.join()
                 else:
                     # unset flag since another thread already updated 
                     #flagComplete = False
                     pass
             else:
                 # update dataset in DB
                 self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 # unset flag
                 flagComplete = False
             # end
             _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
         # special actions for vo
         if flagComplete:
             closerPluginClass = panda_config.getPlugin('closer_plugins',self.job.VO)
             if closerPluginClass is None and self.job.VO == 'atlas':
                 # use ATLAS plugin for ATLAS
                 from pandaserver.dataservice.CloserAtlasPlugin import CloserAtlasPlugin
                 closerPluginClass = CloserAtlasPlugin
             if closerPluginClass is not None:
                 closerPlugin = closerPluginClass(self.job,finalStatusDS,_logger)
                 closerPlugin.execute()
         # change pending jobs to failed
         finalizedFlag = True
         if flagComplete and self.job.prodSourceLabel=='user':
             _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
             finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID)
             _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
         # update unmerged datasets in JEDI to trigger merging
         if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
             if finalizedFlag:
                 tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
                 _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID,tmpStat))
         # start notifier
         _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
         if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
            (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
            self.job.lockedby != 'jedi':
             # don't send email for merge jobs
             if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']:
                 useNotifier = True
                 summaryInfo = {}
                 # check all jobDefIDs in jobsetID
                 if not self.job.jobsetID in [0,None,'NULL']:
                     useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                             self.job.prodUserName)
                     _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                 if useNotifier:
                     _logger.debug('%s start Notifier' % self.pandaID)
                     nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                     nThr.run()
                     _logger.debug('%s end Notifier' % self.pandaID)                    
         _logger.debug('%s End' % self.pandaID)
     except Exception:
         errType,errValue = sys.exc_info()[:2]
         _logger.error("%s %s" % (errType,errValue))
Пример #13
0
 def appendJob(self, job, siteMapperCache=None):
     """Populate ``self.data`` with key/value pairs describing ``job``.

     Scalar job attributes are copied as they are.  The files in
     ``job.Files`` are walked once and folded into comma-separated strings:
     one group of strings for ``input`` files and another for
     ``output``/``log`` files.  Optional keys are added at the end depending
     on predicates of the job (event service, job cloning, HPO, ...).

     Side effects: when ``siteMapperCache`` is given, ``job.destinationSE``
     and the ``destinationSE`` of every file are overwritten with the value
     returned by ``siteMapper.resolveNucleus()``; failures in that step are
     ignored.

     Args:
         job: job spec object providing the attributes and predicate
             methods read below.
         siteMapperCache: optional cache object. ``getObj()`` is used to
             obtain the site mapper and ``releaseObj()`` is always called
             afterwards. When None, no site spec is available and DDM
             endpoints are resolved with ``siteSpec=None``.

     Returns:
         None. The result is stored in ``self.data``.
     """
     # event service merge
     isEventServiceMerge = bool(EventServiceUtils.isEventServiceMerge(job))
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # accumulators for per-file information
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn = ''
     strScopeOut = ''
     strScopeLog = ''
     logFile = ''
     logGUID = ''
     ddmEndPointIn = []
     ddmEndPointOut = []
     noOutput = []
     siteSpec = None
     inDsLfnMap = {}
     inLFNset = set()
     if siteMapperCache is not None:
         siteMapper = siteMapperCache.getObj()
         # release the cached object even when the site lookup fails
         # NOTE(review): presumably getObj()/releaseObj() guard a lock -- confirm
         try:
             siteSpec = siteMapper.getSite(job.computingSite)
             # resolve destSE (best effort: keep original values on failure)
             try:
                 job.destinationSE = siteMapper.resolveNucleus(
                     job.destinationSE)
                 for tmpFile in job.Files:
                     tmpFile.destinationSE = siteMapper.resolveNucleus(
                         tmpFile.destinationSE)
             except Exception:
                 pass
         finally:
             siteMapperCache.releaseObj()
     for fileSpec in job.Files:
         if fileSpec.type == 'input':
             # for jumbo jobs each input LFN is listed only once
             isDuplicate = EventServiceUtils.isJumboJob(job) and \
                 fileSpec.lfn in inLFNset
             if not isDuplicate:
                 inLFNset.add(fileSpec.lfn)
                 if strIFiles != '':
                     strIFiles += ','
                 strIFiles += fileSpec.lfn
                 if strDispatch != '':
                     strDispatch += ','
                 strDispatch += fileSpec.dispatchDBlock
                 if strDisToken != '':
                     strDisToken += ','
                 strDisToken += fileSpec.dispatchDBlockToken
                 strProdDBlock += '%s,' % fileSpec.prodDBlock
                 if not isEventServiceMerge:
                     strProdToken += '%s,' % fileSpec.prodDBlockToken
                 else:
                     # for ES merge the token is taken from the job metadata,
                     # keyed by LFN
                     strProdToken += '%s,' % job.metadata[1][fileSpec.lfn]
                 if strGUID != '':
                     strGUID += ','
                 strGUID += fileSpec.GUID
                 strRealDatasetIn += '%s,' % fileSpec.dataset
                 strFSize += '%s,' % fileSpec.fsize
                 # fall back to md5sum when no checksum is set
                 if fileSpec.checksum not in ['', 'NULL', None]:
                     strCheckSum += '%s,' % fileSpec.checksum
                 else:
                     strCheckSum += '%s,' % fileSpec.md5sum
                 strScopeIn += '%s,' % fileSpec.scope
                 ddmEndPointIn.append(
                     self.getDdmEndpoint(siteSpec,
                                         fileSpec.dispatchDBlockToken,
                                         'input', job.prodSourceLabel,
                                         job.job_label))
                 # remember LFNs per input dataset
                 inDsLfnMap.setdefault(fileSpec.dataset, []).append(
                     fileSpec.lfn)
         elif fileSpec.type in ('output', 'log'):
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += fileSpec.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += fileSpec.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += fileSpec.dataset
             strFileDestinationSE += '%s,' % fileSpec.destinationSE
             if fileSpec.type == 'log':
                 logFile = fileSpec.lfn
                 logGUID = fileSpec.GUID
                 strScopeLog = fileSpec.scope
             else:
                 strScopeOut += '%s,' % fileSpec.scope
             # only the first destination token is used
             firstDestToken = fileSpec.destinationDBlockToken.split(',')[0]
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += re.sub('^ddd:', 'dst:', firstDestToken)
             strDisTokenForOutput += '%s,' % fileSpec.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % fileSpec.prodDBlockToken
             ddmEndPointOut.append(
                 self.getDdmEndpoint(siteSpec, firstDestToken, 'output',
                                     job.prodSourceLabel, job.job_label))
             if fileSpec.isAllowedNoOutput():
                 noOutput.append(fileSpec.lfn)
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken[:-1]
     # real output datasets
     self.data['realDatasets'] = strRealDataset
     # real input datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'], ppSteps = job.extractMultiStepExec()
     if ppSteps is not None:
         self.data.update(ppSteps)
     if job.to_encode_job_params():
         self.data['jobPars'] = base64.b64encode(
             self.data['jobPars'].encode()).decode()
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn'] = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # DDM endpoints (join fails with TypeError when an entry is not a string)
     try:
         self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
     except TypeError:
         self.data['ddmEndPointIn'] = ''
     try:
         self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
     except TypeError:
         self.data['ddmEndPointOut'] = ''
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig is blanked when multi-step execution parameters are present
     if ppSteps is None:
         self.data['cmtConfig'] = job.cmtConfig
     else:
         self.data['cmtConfig'] = ''
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count (defaults to 1 when undefined)
     if job.coreCount in ['NULL', None]:
         self.data['coreCount'] = 1
     else:
         self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # nucleus
     self.data['nucleus'] = job.nucleus
     # walltime
     self.data['maxWalltime'] = job.maxWalltime
     # looping check
     if job.is_no_looping_check():
         self.data['loopingCheck'] = False
     # debug mode
     if job.specialHandling is not None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service or job cloning
     if EventServiceUtils.isJobCloningJob(job):
         self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
     elif EventServiceUtils.isEventServiceJob(
             job) or EventServiceUtils.isJumboJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if isEventServiceMerge:
         self.data['eventServiceMerge'] = 'True'
         # write to file for ES merge, formatted as
         # inputFor_<output>:<in1>,<in2>^inputFor_<output>:...
         # whatever was built before a failure is still published
         writeToFileStr = ''
         try:
             for outputName in job.metadata[0]:
                 inputList = job.metadata[0][outputName]
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except Exception:
             pass
         self.data['writeToFile'] = writeToFileStr
     elif job.writeInputToFile():
         try:
             # write input to file, formatted as
             # tmpin_<dataset>:<lfn1>,<lfn2>^tmpin_<dataset>:...
             writeToFileStr = ''
             for inDS in inDsLfnMap:
                 inputList = inDsLfnMap[inDS]
                 # drop the trailing slash and the scope prefix
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 writeToFileStr += 'tmpin_{0}:'.format(inDS)
                 writeToFileStr += ','.join(inputList)
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
             self.data['writeToFile'] = writeToFileStr
         except Exception:
             pass
     # replace placeholder with the list of input LFNs
     # NOTE(review): jobPars may already be base64-encoded at this point
     # (see to_encode_job_params above) -- confirm the replacement still
     # works in that case
     if EventServiceUtils.isJumboJob(job) or EventServiceUtils.isCoJumboJob(
             job):
         try:
             for inDS in inDsLfnMap:
                 inputList = inDsLfnMap[inDS]
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 srcStr = 'tmpin__cnt_{0}'.format(inDS)
                 dstStr = ','.join(inputList)
                 self.data['jobPars'] = self.data['jobPars'].replace(
                     srcStr, dstStr)
         except Exception:
             pass
     # no output
     if noOutput != []:
         self.data['allowNoOutput'] = ','.join(noOutput)
     # alternative stage-out
     if job.getAltStgOut() is not None:
         self.data['altStageOut'] = job.getAltStgOut()
     # log to OS
     if job.putLogToOS():
         self.data['putLogToOS'] = 'True'
     # suppress execute string conversion
     if job.noExecStrCnv():
         self.data['noExecStrCnv'] = 'True'
     # in-file positional event number
     if job.inFilePosEvtNum():
         self.data['inFilePosEvtNum'] = 'True'
     # use prefetcher
     if job.usePrefetcher():
         self.data['usePrefetcher'] = 'True'
     # image name
     if job.container_name not in ['NULL', None]:
         self.data['container_name'] = job.container_name
     # IO
     self.data['ioIntensity'] = job.get_task_attribute('ioIntensity')
     self.data['ioIntensityUnit'] = job.get_task_attribute(
         'ioIntensityUnit')
     # HPO
     if job.is_hpo_workflow():
         self.data['isHPO'] = 'True'
     # VP (only decidable when the site spec is known)
     if siteSpec is not None:
         scope_input, scope_output = DataServiceUtils.select_scope(
             siteSpec, job.prodSourceLabel, job.job_label)
         if siteSpec.use_vp(scope_input):
             self.data['useVP'] = 'True'
Пример #14
0
 def extractCommon(self, jediTaskID, taskParamMap, workQueueMapper,
                   splitRule):
     """Build a JediTaskSpec from task parameters and keep it in self.taskSpec.

     Mandatory parameters are copied directly, optional ones are copied when
     present (some fall back to a default), a work queue is looked up for
     the task, and finally the split rules are registered through
     ``self.setSplitRule``.

     Args:
         jediTaskID: ID assigned to the task; also used as ``reqID`` when
             the parameters do not define one.
         taskParamMap: mapping of task parameters. Must contain
             ``taskName``, ``userName``, ``vo``, ``prodSourceLabel``,
             ``taskPriority``, ``architecture``, ``transUses``,
             ``transHome``, ``transPath``, ``processingType`` and
             ``taskType``.
         workQueueMapper: object whose ``getQueueWithSelParams`` returns a
             ``(workQueue, message)`` pair for the task.
         splitRule: initial value of ``taskSpec.splitRule``.

     Returns:
         None. The spec is stored in ``self.taskSpec``; ``self.cloudName``
         and ``self.siteName`` are set when ``cloud`` / ``site`` are given.

     Raises:
         KeyError: a mandatory parameter is missing.
         RuntimeError: no work queue is defined for the task.
     """
     # make task spec
     taskSpec = JediTaskSpec()
     taskSpec.jediTaskID = jediTaskID
     taskSpec.taskName = taskParamMap['taskName']
     taskSpec.userName = taskParamMap['userName']
     taskSpec.vo = taskParamMap['vo']
     taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
     taskSpec.taskPriority = taskParamMap['taskPriority']
     taskSpec.currentPriority = taskSpec.taskPriority
     taskSpec.architecture = taskParamMap['architecture']
     taskSpec.transUses = taskParamMap['transUses']
     taskSpec.transHome = taskParamMap['transHome']
     taskSpec.transPath = taskParamMap['transPath']
     taskSpec.processingType = taskParamMap['processingType']
     taskSpec.taskType = taskParamMap['taskType']
     taskSpec.splitRule = splitRule
     taskSpec.startTime = datetime.datetime.utcnow()
     # optional attributes: (name, default); attributes marked with
     # noDefault are left untouched when the parameter is absent
     noDefault = object()
     optionalAttrs = (
         ('workingGroup', noDefault),
         ('countryGroup', noDefault),
         ('ticketID', noDefault),
         ('ticketSystemType', noDefault),
         ('reqID', jediTaskID),
         ('coreCount', 1),
         ('walltime', 0),
         ('walltimeUnit', noDefault),
         ('outDiskCount', 0),
         ('outDiskUnit', noDefault),
         ('workDiskCount', 0),
         ('workDiskUnit', noDefault),
         ('ramCount', 0),
         # HS06 stuff
         ('cpuTimeUnit', noDefault),
         ('cpuTime', noDefault),
         # 90% of cpu efficiency by default
         ('cpuEfficiency', 90),
         # 10min of offset by default
         ('baseWalltime', 10 * 60),
         # for merge
         ('mergeRamCount', noDefault),
         ('mergeCoreCount', noDefault),
     )
     for tmpKey, tmpDefault in optionalAttrs:
         if tmpKey in taskParamMap:
             setattr(taskSpec, tmpKey, taskParamMap[tmpKey])
         elif tmpDefault is not noDefault:
             setattr(taskSpec, tmpKey, tmpDefault)
     # scout
     if 'skipScout' not in taskParamMap and not taskSpec.isPostScout():
         taskSpec.setUseScout(True)
     # cloud
     if 'cloud' in taskParamMap:
         self.cloudName = taskParamMap['cloud']
         taskSpec.cloud = self.cloudName
     else:
         # set dummy to force update
         taskSpec.cloud = 'dummy'
         taskSpec.cloud = None
     # site
     if 'site' in taskParamMap:
         self.siteName = taskParamMap['site']
         taskSpec.site = self.siteName
     else:
         # set dummy to force update
         taskSpec.site = 'dummy'
         taskSpec.site = None
     # event service
     if 'nEventsPerWorker' in taskParamMap:
         taskSpec.eventService = 1
     else:
         taskSpec.eventService = 0
     # goal: stored multiplied by 10; values reaching 1000 mean no goal.
     # A malformed value is ignored on purpose
     if 'goal' in taskParamMap:
         try:
             taskSpec.goal = int(float(taskParamMap['goal']) * 10)
             if taskSpec.goal >= 1000:
                 taskSpec.goal = None
         except Exception:
             pass
     # campaign
     if 'campaign' in taskParamMap:
         taskSpec.campaign = taskParamMap['campaign']
     # work queue
     workQueue, tmpStr = workQueueMapper.getQueueWithSelParams(
         taskSpec.vo,
         taskSpec.prodSourceLabel,
         processingType=taskSpec.processingType,
         workingGroup=taskSpec.workingGroup,
         coreCount=taskSpec.coreCount,
         site=taskSpec.site)
     if workQueue is None:
         errStr = 'workqueue is undefined for vo={0} labal={1} '.format(
             taskSpec.vo, taskSpec.prodSourceLabel)
         errStr += 'processingType={0} workingGroup={1} coreCount={2} '.format(
             taskSpec.processingType, taskSpec.workingGroup,
             taskSpec.coreCount)
         raise RuntimeError(errStr)
     taskSpec.workQueue_ID = workQueue.queue_id
     self.taskSpec = taskSpec

     def setFlagRules(*flagNames):
         # set the split rule to 1 for every flag present in taskParamMap;
         # the token has the same name as the parameter
         for tmpFlag in flagNames:
             if tmpFlag in taskParamMap:
                 self.setSplitRule(None, 1,
                                   JediTaskSpec.splitRuleToken[tmpFlag])

     # set split rule
     # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
     if 'tgtNumEventsPerJob' in taskParamMap and \
             'nFilesPerJob' not in taskParamMap:
         self.setSplitRule(None, taskParamMap['tgtNumEventsPerJob'],
                           JediTaskSpec.splitRuleToken['nEventsPerJob'])
     # rules whose value is taken from the parameter of the same name
     for tmpRule in ('nFilesPerJob', 'nEventsPerJob', 'nGBPerJob',
                     'nMaxFilesPerJob', 'nEventsPerWorker', 'useLocalIO',
                     'disableAutoRetry', 'nEsConsumers', 'waitInput',
                     'addNthFieldToLFN', 'scoutSuccessRate', 't1Weight',
                     'maxAttemptES', 'nSitesPerJob', 'nEventsPerMergeJob',
                     'nFilesPerMergeJob', 'nGBPerMergeJob',
                     'nMaxFilesPerMergeJob'):
         self.setSplitRule(taskParamMap, tmpRule,
                           JediTaskSpec.splitRuleToken[tmpRule])
     if 'loadXML' in taskParamMap:
         self.setSplitRule(None, 3, JediTaskSpec.splitRuleToken['loadXML'])
         self.setSplitRule(None, 4,
                           JediTaskSpec.splitRuleToken['groupBoundaryID'])
     # the call order below is kept as in the original sequence since the
     # taskSpec setters are interleaved with the split rule updates
     setFlagRules('pfnList', 'noWaitParent', 'respectLB',
                  'reuseSecOnDemand')
     if 'ddmBackEnd' in taskParamMap:
         self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
     setFlagRules('disableReassign', 'allowPartialFinish', 'useExhausted',
                  'useRealNumEvents')
     if 'ipConnectivity' in taskParamMap:
         self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
     setFlagRules('runUntilClosed', 'stayOutputOnSite')
     if 'useJobCloning' in taskParamMap:
         scValue = EventServiceUtils.getJobCloningValue(
             taskParamMap['useJobCloning'])
         self.setSplitRule(None, scValue,
                           JediTaskSpec.splitRuleToken['useJobCloning'])
     # the token name differs from the parameter name for this flag
     if 'failWhenGoalUnreached' in taskParamMap:
         self.setSplitRule(None, 1,
                           JediTaskSpec.splitRuleToken['failGoalUnreached'])
     setFlagRules('switchEStoNormal')
     # return
     return
Пример #15
0
    def extractCommon(self,jediTaskID,taskParamMap,workQueueMapper,splitRule):
        """Build the task spec shared by all task types from the task parameters.

        A new JediTaskSpec is created, populated from ``taskParamMap`` and
        stored in ``self.taskSpec``.  Split rules are encoded through
        ``self.setSplitRule`` and finally the work queue, the global share
        and the resource type of the task are resolved.

        Parameters:
            jediTaskID: identifier stored in the spec; also used as ``reqID``
                when the parameters do not provide one.
            taskParamMap: dict of task parameters.  The keys ``taskName``,
                ``userName``, ``vo``, ``prodSourceLabel``, ``taskPriority``,
                ``architecture``, ``transUses``, ``transHome``, ``transPath``,
                ``processingType`` and ``taskType`` are indexed directly and
                must be present; all other keys are optional.
                NOTE: the dict is modified in place (``nEventsPerWorker``,
                ``nSitesPerJob``, ``nEsConsumers`` get implicit values for job
                cloning and ``minGranularity`` is copied to
                ``nEventsPerRange``).
            workQueueMapper: object providing ``getQueueByName`` and
                ``getQueueWithSelParams`` to look up the work queue.
            splitRule: initial value of the spec's ``splitRule``.

        Side effects:
            Sets ``self.taskSpec`` and, when given in the parameters,
            ``self.cloudName`` and ``self.siteName``.

        Raises:
            RuntimeError: if no work queue can be found for the task.
        """
        # make task spec
        taskSpec = JediTaskSpec()
        taskSpec.jediTaskID = jediTaskID
        taskSpec.taskName = taskParamMap['taskName']
        taskSpec.userName = taskParamMap['userName']
        taskSpec.vo = taskParamMap['vo']
        taskSpec.prodSourceLabel = taskParamMap['prodSourceLabel']
        taskSpec.taskPriority = taskParamMap['taskPriority']
        if 'currentPriority' in taskParamMap:
            taskSpec.currentPriority = taskParamMap['currentPriority']
        else:
            taskSpec.currentPriority = taskSpec.taskPriority
        taskSpec.architecture = taskParamMap['architecture']
        taskSpec.transUses = taskParamMap['transUses']
        taskSpec.transHome = taskParamMap['transHome']
        taskSpec.transPath = taskParamMap['transPath']
        taskSpec.processingType = taskParamMap['processingType']
        taskSpec.taskType = taskParamMap['taskType']
        taskSpec.splitRule = splitRule
        taskSpec.startTime = datetime.datetime.utcnow()
        if 'workingGroup' in taskParamMap:
            taskSpec.workingGroup = taskParamMap['workingGroup']
        if 'countryGroup' in taskParamMap:
            taskSpec.countryGroup = taskParamMap['countryGroup']
        if 'ticketID' in taskParamMap:
            taskSpec.ticketID = taskParamMap['ticketID']
        if 'ticketSystemType' in taskParamMap:
            taskSpec.ticketSystemType = taskParamMap['ticketSystemType']
        taskSpec.reqID = taskParamMap.get('reqID', jediTaskID)
        taskSpec.coreCount = taskParamMap.get('coreCount', 1)
        taskSpec.walltime = taskParamMap.get('walltime', 0)
        if 'walltimeUnit' not in taskParamMap:
            # force to set NULL so that retried tasks get data from scouts again
            taskSpec.forceUpdate('walltimeUnit')
        taskSpec.outDiskCount = taskParamMap.get('outDiskCount', 0)
        if 'outDiskUnit' in taskParamMap:
            taskSpec.outDiskUnit = taskParamMap['outDiskUnit']
        taskSpec.workDiskCount = taskParamMap.get('workDiskCount', 0)
        if 'workDiskUnit' in taskParamMap:
            taskSpec.workDiskUnit = taskParamMap['workDiskUnit']
        taskSpec.ramCount = taskParamMap.get('ramCount', 0)
        if 'ramUnit' in taskParamMap:
            taskSpec.ramUnit = taskParamMap['ramUnit']
        taskSpec.baseRamCount = taskParamMap.get('baseRamCount', 0)
        # IO
        if 'ioIntensity' in taskParamMap:
            taskSpec.ioIntensity = taskParamMap['ioIntensity']
        if 'ioIntensityUnit' in taskParamMap:
            taskSpec.ioIntensityUnit = taskParamMap['ioIntensityUnit']
        # HS06 stuff
        if 'cpuTimeUnit' in taskParamMap:
            taskSpec.cpuTimeUnit = taskParamMap['cpuTimeUnit']
        if 'cpuTime' in taskParamMap:
            taskSpec.cpuTime = taskParamMap['cpuTime']
        # 90% of cpu efficiency by default
        taskSpec.cpuEfficiency = taskParamMap.get('cpuEfficiency', 90)
        # 10min of offset by default
        taskSpec.baseWalltime = taskParamMap.get('baseWalltime', 10*60)
        # for merge
        if 'mergeRamCount' in taskParamMap:
            taskSpec.mergeRamCount = taskParamMap['mergeRamCount']
        if 'mergeCoreCount' in taskParamMap:
            taskSpec.mergeCoreCount = taskParamMap['mergeCoreCount']
        # scout
        if 'skipScout' not in taskParamMap and not taskSpec.isPostScout():
            taskSpec.setUseScout(True)
        # cloud
        if 'cloud' in taskParamMap:
            self.cloudName = taskParamMap['cloud']
            taskSpec.cloud = self.cloudName
        else:
            # set dummy to force update
            taskSpec.cloud = 'dummy'
            taskSpec.cloud = None
        # site
        if 'site' in taskParamMap:
            self.siteName = taskParamMap['site']
            taskSpec.site = self.siteName
        else:
            # set dummy to force update
            taskSpec.site = 'dummy'
            taskSpec.site = None
        # nucleus
        if 'nucleus' in taskParamMap:
            taskSpec.nucleus = taskParamMap['nucleus']
        # preset some parameters for job cloning
        if 'useJobCloning' in taskParamMap:
            # set implicit parameters (only when not given explicitly)
            taskParamMap.setdefault('nEventsPerWorker', 1)
            taskParamMap.setdefault('nSitesPerJob', 2)
            taskParamMap.setdefault('nEsConsumers', taskParamMap['nSitesPerJob'])
        # minimum granularity
        if 'minGranularity' in taskParamMap:
            taskParamMap['nEventsPerRange'] = taskParamMap['minGranularity']
        # event service flag
        if 'useJobCloning' in taskParamMap:
            taskSpec.eventService = 2
        elif 'nEventsPerWorker' in taskParamMap:
            taskSpec.eventService = 1
        else:
            taskSpec.eventService = 0
        # OS (the value is stored in the termCondition attribute)
        if 'osInfo' in taskParamMap:
            taskSpec.termCondition = taskParamMap['osInfo']
        # ttcr: requested time to completion
        if 'ttcrTimestamp' in taskParamMap:
            try:
                # get rid of the +00:00 timezone string and parse the timestamp
                taskSpec.ttcRequested = datetime.datetime.strptime(taskParamMap['ttcrTimestamp'].split('+')[0], '%Y-%m-%d %H:%M:%S.%f')
            except (IndexError, ValueError):
                pass
        # goal: stored as ten times the given number, values above 1000 are dropped
        if 'goal' in taskParamMap:
            try:
                taskSpec.goal = int(float(taskParamMap['goal'])*10)
                if taskSpec.goal > 1000:
                    taskSpec.goal = None
            except Exception:
                # best effort: an unparsable goal is ignored
                pass
        # campaign
        if 'campaign' in taskParamMap:
            taskSpec.campaign = taskParamMap['campaign']
        # request type
        if 'requestType' in taskParamMap:
            taskSpec.requestType = taskParamMap['requestType']
        self.taskSpec = taskSpec
        # set split rule
        if 'tgtNumEventsPerJob' in taskParamMap:
            # set nEventsPerJob not respect file boundaries when nFilesPerJob is not used
            if 'nFilesPerJob' not in taskParamMap:
                self.setSplitRule(None,taskParamMap['tgtNumEventsPerJob'],JediTaskSpec.splitRuleToken['nEventsPerJob'])
        # rules whose parameter name is identical to the token name; order is kept
        for ruleName in ('nFilesPerJob',
                         'nEventsPerJob',
                         'nGBPerJob',
                         'nMaxFilesPerJob',
                         'nEventsPerWorker',
                         'useLocalIO',
                         'disableAutoRetry',
                         'nEsConsumers',
                         'waitInput',
                         'addNthFieldToLFN',
                         'scoutSuccessRate',
                         't1Weight',
                         'maxAttemptES',
                         'maxAttemptEsJob',
                         'nSitesPerJob',
                         'nEventsPerMergeJob',
                         'nFilesPerMergeJob',
                         'nGBPerMergeJob',
                         'nMaxFilesPerMergeJob',
                         'maxWalltime',
                         'tgtMaxOutputForNG'):
            self.setSplitRule(taskParamMap,ruleName,JediTaskSpec.splitRuleToken[ruleName])
        if 'nJumboJobs' in taskParamMap:
            self.setSplitRule(taskParamMap,'nJumboJobs',JediTaskSpec.splitRuleToken['nJumboJobs'])
            taskSpec.useJumbo = JediTaskSpec.enum_useJumbo['waiting']
            if 'maxJumboPerSite' in taskParamMap:
                self.setSplitRule(taskParamMap,'maxJumboPerSite',JediTaskSpec.splitRuleToken['maxJumboPerSite'])
        if 'minCpuEfficiency' in taskParamMap:
            self.setSplitRule(taskParamMap,'minCpuEfficiency',JediTaskSpec.splitRuleToken['minCpuEfficiency'])
        if 'loadXML' in taskParamMap:
            self.setSplitRule(None,3,JediTaskSpec.splitRuleToken['loadXML'])
            self.setSplitRule(None,4,JediTaskSpec.splitRuleToken['groupBoundaryID'])
        if 'pfnList' in taskParamMap:
            self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['pfnList'])
        # NOTE: "== True" (not "is True") is kept for the flags below so that
        # the same values as before are accepted
        if 'noWaitParent' in taskParamMap and taskParamMap['noWaitParent'] == True:
            self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['noWaitParent'])
        # flags enabled by the mere presence of the parameter
        for flagName in ('respectLB','orderByLB','respectSplitRule','reuseSecOnDemand'):
            if flagName in taskParamMap:
                self.setSplitRule(None,1,JediTaskSpec.splitRuleToken[flagName])
        if 'ddmBackEnd' in taskParamMap:
            self.taskSpec.setDdmBackEnd(taskParamMap['ddmBackEnd'])
        for flagName in ('disableReassign','allowPartialFinish','useExhausted','useRealNumEvents'):
            if flagName in taskParamMap:
                self.setSplitRule(None,1,JediTaskSpec.splitRuleToken[flagName])
        if 'ipConnectivity' in taskParamMap:
            self.taskSpec.setIpConnectivity(taskParamMap['ipConnectivity'])
        if 'altStageOut' in taskParamMap:
            self.taskSpec.setAltStageOut(taskParamMap['altStageOut'])
        if 'allowInputLAN' in taskParamMap:
            self.taskSpec.setAllowInputLAN(taskParamMap['allowInputLAN'])
        for flagName in ('runUntilClosed','stayOutputOnSite'):
            if flagName in taskParamMap:
                self.setSplitRule(None,1,JediTaskSpec.splitRuleToken[flagName])
        if 'useJobCloning' in taskParamMap:
            scValue = EventServiceUtils.getJobCloningValue(taskParamMap['useJobCloning'])
            self.setSplitRule(None,scValue,JediTaskSpec.splitRuleToken['useJobCloning'])
        if 'failWhenGoalUnreached' in taskParamMap and taskParamMap['failWhenGoalUnreached'] == True:
            self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['failGoalUnreached'])
        if 'switchEStoNormal' in taskParamMap:
            self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['switchEStoNormal'])
        if 'nEventsPerRange' in taskParamMap:
            self.setSplitRule(taskParamMap,'nEventsPerRange',JediTaskSpec.splitRuleToken['dynamicNumEvents'])
        # flags enabled only when the parameter compares equal to True
        for flagName in ('allowInputWAN',
                         'putLogToOS',
                         'mergeEsOnOS',
                         'writeInputToFile',
                         'useFileAsSourceLFN',
                         'ignoreMissingInDS',
                         'noExecStrCnv',
                         'inFilePosEvtNum'):
            if flagName in taskParamMap and taskParamMap[flagName] == True:
                self.setSplitRule(None,1,JediTaskSpec.splitRuleToken[flagName])
        if self.taskSpec.useEventService() and not taskSpec.useJobCloning():
            if 'registerEsFiles' in taskParamMap and taskParamMap['registerEsFiles'] == True:
                self.setSplitRule(None,1,JediTaskSpec.splitRuleToken['registerEsFiles'])
        for flagName in ('disableAutoFinish','resurrectConsumers','usePrefetcher','notDiscardEvents'):
            if flagName in taskParamMap and taskParamMap[flagName] == True:
                self.setSplitRule(None,1,JediTaskSpec.splitRuleToken[flagName])
        # flags enabled only when the parameter is exactly True
        for flagName in ('decAttOnFailedES','useZipToPin','osMatching'):
            if flagName in taskParamMap and taskParamMap[flagName] is True:
                self.setSplitRule(None, 1, JediTaskSpec.splitRuleToken[flagName])
        # work queue
        workQueue = None
        if 'workQueueName' in taskParamMap:
            # work queue is specified
            workQueue = workQueueMapper.getQueueByName(taskSpec.vo, taskSpec.prodSourceLabel, taskParamMap['workQueueName'])
        if workQueue is None:
            # get work queue based on task attributes
            workQueue,tmpStr = workQueueMapper.getQueueWithSelParams(taskSpec.vo,
                                                                     taskSpec.prodSourceLabel,
                                                                     prodSourceLabel=taskSpec.prodSourceLabel,
                                                                     processingType=taskSpec.processingType,
                                                                     workingGroup=taskSpec.workingGroup,
                                                                     coreCount=taskSpec.coreCount,
                                                                     site=taskSpec.site,
                                                                     eventService=taskSpec.eventService,
                                                                     splitRule=taskSpec.splitRule,
                                                                     campaign=taskSpec.campaign)
        if workQueue is None:
            errStr  = 'workqueue is undefined for vo={0} label={1} '.format(taskSpec.vo,taskSpec.prodSourceLabel)
            errStr += 'processingType={0} workingGroup={1} coreCount={2} eventService={3} '.format(taskSpec.processingType,
                                                                                                   taskSpec.workingGroup,
                                                                                                   taskSpec.coreCount,
                                                                                                   taskSpec.eventService)
            errStr += 'splitRule={0} campaign={1}'.format(taskSpec.splitRule,taskSpec.campaign)
            raise RuntimeError(errStr)
        self.taskSpec.workQueue_ID = workQueue.queue_id

        # Initialize the global share
        if 'gshare' in taskParamMap and self.taskBufferIF.is_valid_share(taskParamMap['gshare']):
            # share is specified and valid
            gshare = taskParamMap['gshare']
        else:
            # get share based on definition
            gshare = self.taskBufferIF.get_share_for_task(self.taskSpec)
            if gshare is None:
                gshare = 'Undefined' # Should not happen. Undefined is set when no share is found
        # store the share in both cases; previously an explicitly requested
        # share was validated but never written to the task spec
        self.taskSpec.gshare = gshare

        # Initialize the resource type
        try:
            self.taskSpec.resource_type = self.taskBufferIF.get_resource_type_task(self.taskSpec)
        except Exception:
            # best effort: fall back to a placeholder when the lookup fails
            self.taskSpec.resource_type = 'Undefined'
Пример #16
0
 def doPostProcess(self,taskSpec,tmpLog):
     """Clean up and finalize the datasets of a task, then run basic post-processing.

     Steps, in order:
       1. ``self.doPreCheck`` -- when it returns a true value nothing else
          is done.
       2. For every dataset of the task (pseudo output datasets are skipped):
          files of output datasets that are in DDM but not among the
          successful files known to the task buffer are removed from the
          dataset; output/log/trn_log datasets are frozen; trn_output
          datasets are deleted (a failure there is only logged).
       3. When the final task status is finished or done, a duplication
          check is made and the task is paused if duplicates are found.
       4. The event service dataset is deleted when
          ``taskSpec.registerEsFiles()`` is true (failure only logged).
       5. ``self.doBasicPostProcess``.

     Parameters:
         taskSpec: task to post-process; provides ``vo``, ``jediTaskID``
             and ``datasetSpecList``.
         tmpLog: logger with ``debug``/``info``/``warning``/``error``.

     Returns:
         ``self.SC_SUCCEEDED`` on success or when the pre-check short-circuits,
         ``self.SC_FAILED`` when file removal or freezing fails,
         ``self.SC_FATAL`` when the pre-check or the basic post-processing
         raises.
     """
     # pre-check
     try:
         tmpStat = self.doPreCheck(taskSpec,tmpLog)
         if tmpStat:
             return self.SC_SUCCEEDED
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doPreCheck failed with {0}:{1}'.format(errtype.__name__,errvalue))
         return self.SC_FATAL
     # get DDM I/F
     ddmIF = self.ddmIF.getInterface(taskSpec.vo)
     # loop over all datasets
     for datasetSpec in taskSpec.datasetSpecList:
         # skip pseudo output datasets
         if datasetSpec.type in ['output'] and datasetSpec.isPseudo():
             continue
         try:
             # remove wrong files
             if datasetSpec.type in ['output']:
                 # get successful files
                 okFiles = self.taskBufferIF.getSuccessfulFiles_JEDI(datasetSpec.jediTaskID,datasetSpec.datasetID)
                 if okFiles is None:
                     tmpLog.warning('failed to get successful files for {0}'.format(datasetSpec.datasetName))
                     return self.SC_FAILED
                 # get files in dataset
                 ddmFiles = ddmIF.getFilesInDataset(datasetSpec.datasetName,skipDuplicate=False,ignoreUnknown=True)
                 tmpLog.debug('datasetID={0}:Name={1} has {2} files in DB, {3} files in DDM'.format(datasetSpec.datasetID,
                                                                                                   datasetSpec.datasetName,
                                                                                                   len(okFiles),len(ddmFiles)))
                 # check all files; only the attribute maps are needed, not the keys
                 toDelete = []
                 for attMap in ddmFiles.values():
                     if attMap['lfn'] not in okFiles:
                         did = {'scope':attMap['scope'], 'name':attMap['lfn']}
                         toDelete.append(did)
                         tmpLog.debug('delete {0} from {1}'.format(attMap['lfn'],datasetSpec.datasetName))
                 # delete
                 if toDelete:
                     ddmIF.deleteFilesFromDataset(datasetSpec.datasetName,toDelete)
         except Exception:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.warning('failed to remove wrong files with {0}:{1}'.format(errtype.__name__,errvalue))
             return self.SC_FAILED
         try:
             # freeze output and log datasets
             if datasetSpec.type in ['output','log','trn_log']:
                 tmpLog.info('freezing datasetID={0}:Name={1}'.format(datasetSpec.datasetID,datasetSpec.datasetName))
                 ddmIF.freezeDataset(datasetSpec.datasetName,ignoreUnknown=True)
         except Exception:
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.warning('failed to freeze datasets with {0}:{1}'.format(errtype.__name__,errvalue))
             return self.SC_FAILED
         try:
             # delete transient datasets
             if datasetSpec.type in ['trn_output']:
                 tmpLog.debug('deleting datasetID={0}:Name={1}'.format(datasetSpec.datasetID,datasetSpec.datasetName))
                 retStr = ddmIF.deleteDataset(datasetSpec.datasetName,False,ignoreUnknown=True)
                 tmpLog.info(retStr)
         except Exception:
             # best effort: a failed deletion does not fail the post-processing
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.warning('failed to delete datasets with {0}:{1}'.format(errtype.__name__,errvalue))
     # check duplication
     if self.getFinalTaskStatus(taskSpec) in ['finished','done']:
         nDup = self.taskBufferIF.checkDuplication_JEDI(taskSpec.jediTaskID)
         tmpLog.debug('checked duplication with {0}'.format(nDup))
         # guard against None so that the comparison cannot raise on Python 3
         # NOTE(review): presumably None means the check itself failed -- confirm
         if nDup is not None and nDup > 0:
             errStr = 'paused since {0} duplication found'.format(nDup)
             taskSpec.oldStatus = self.getFinalTaskStatus(taskSpec)
             taskSpec.status = 'paused'
             taskSpec.setErrDiag(errStr)
             tmpLog.debug(errStr)
     # delete ES datasets
     if taskSpec.registerEsFiles():
         try:
             targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID)
             tmpLog.debug('deleting ES dataset name={0}'.format(targetName))
             retStr = ddmIF.deleteDataset(targetName,False,ignoreUnknown=True)
             tmpLog.debug(retStr)
         except Exception:
             # best effort: a failed deletion does not fail the post-processing
             errtype,errvalue = sys.exc_info()[:2]
             tmpLog.warning('failed to delete ES dataset with {0}:{1}'.format(errtype.__name__,errvalue))
     try:
         self.doBasicPostProcess(taskSpec,tmpLog)
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('doBasicPostProcess failed with {0}:{1}'.format(errtype.__name__,errvalue))
         return self.SC_FATAL
     return self.SC_SUCCEEDED