def doCleanDataLocality(self):
    tmpLog = MsgWrapper(logger, ' #ATM #KV doCleanDataLocality')
    tmpLog.debug('start')
    try:
        # lock
        got_lock = self.taskBufferIF.lockProcess_JEDI(
            vo=self.vo, prodSourceLabel='default',
            cloud=None, workqueue_id=None, resource_name=None,
            component='AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality',
            pid=self.pid, timeLimit=1440)
        if not got_lock:
            tmpLog.debug('locked by another process. Skipped')
            return
        tmpLog.debug('got lock')
        # lifetime of records
        record_lifetime_hours = 24
        # run
        now_timestamp = datetime.datetime.utcnow()
        before_timestamp = now_timestamp - datetime.timedelta(hours=record_lifetime_hours)
        n_rows = self.taskBufferIF.deleteOutdatedDatasetLocality_JEDI(before_timestamp)
        tmpLog.info('cleaned up {0} records'.format(n_rows))
        # done
        tmpLog.debug('done')
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1} {2}'.format(errtype, errvalue, traceback.format_exc()))
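# Illustrative sketch (hypothetical stand-ins, stdlib only; not part of the
# original module): the guard pattern used above, where a watchdog runs a
# periodic action only if it can take a named process lock, so concurrent
# instances skip the cycle instead of duplicating work.
import threading

_sketch_locks = {}

def _sketch_lock_process(component):
    # returns True for the first caller, False while the lock is held
    lock = _sketch_locks.setdefault(component, threading.Lock())
    return lock.acquire(blocking=False)

if _sketch_lock_process('AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality'):
    pass  # do the cleanup; the real DB lock expires after timeLimit minutes
else:
    pass  # locked by another process -> skip this cycle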
def registerDatasetSubscription(self, datasetName, location, activity=None, ignoreUnknown=False):
    methodName = 'registerDatasetSubscription'
    methodName = '{0} datasetName={1} location={2}'.format(methodName, datasetName, location)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    isOK = True
    try:
        # get DQ2 API
        dq2 = DQ2()
        # call
        dq2.registerDatasetSubscription(datasetName, location, activity=activity)
    except DQSubscriptionExistsException:
        # an existing subscription is not an error
        pass
    except DQUnknownDatasetException:
        if ignoreUnknown:
            pass
        else:
            isOK = False
    except Exception:
        isOK = False
    if not isOK:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def freezeDataset(self, datasetName, ignoreUnknown=False):
    methodName = 'freezeDataset'
    methodName = '{0} datasetName={1}'.format(methodName, datasetName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    isOK = True
    try:
        # get DQ2 API
        dq2 = DQ2()
        # freeze
        dq2.freezeDataset(datasetName)
    except DQFrozenDatasetException:
        # already frozen, which is the goal state
        pass
    except DQUnknownDatasetException:
        if ignoreUnknown:
            pass
        else:
            isOK = False
    except Exception:
        isOK = False
    if isOK:
        tmpLog.info('done')
        return self.SC_SUCCEEDED, True
    else:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
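# Illustrative sketch (hypothetical exception names, stdlib only; not part of
# the original module): the tolerant-wrapper pattern used by the DQ2 helpers
# above, where "already in the desired state" and, optionally, "unknown
# dataset" are treated as success so the calls stay idempotent.
class AlreadyFrozenError(Exception):
    pass

class UnknownDatasetError(Exception):
    pass

def _sketch_freeze(freeze_func, dataset, ignore_unknown=False):
    try:
        freeze_func(dataset)
    except AlreadyFrozenError:
        pass                      # idempotent: frozen is the goal state
    except UnknownDatasetError:
        if not ignore_unknown:
            raise
    return True

assert _sketch_freeze(lambda ds: None, 'mc.ds1')  # plain success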
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # prepare tasks to be finished
                    tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(vo, prodSourceLabel))
                    tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(vo, prodSourceLabel,
                                                                             jedi_config.postprocessor.nTasks,
                                                                             pid=self.pid)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to prepare tasks')
                    # get tasks to be finished
                    tmpLog.info('getting tasks to be finished')
                    tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(vo, prodSourceLabel, self.pid,
                                                                          jedi_config.postprocessor.nTasks)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get tasks to be finished')
                    else:
                        tmpLog.info('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # make workers
                        nWorker = jedi_config.postprocessor.nWorkers
                        for iWorker in range(nWorker):
                            thr = PostProcessorThread(taskList, threadPool,
                                                      self.taskBufferIF,
                                                      self.ddmIF,
                                                      self)
                            thr.start()
                        # join
                        threadPool.join()
            tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,
                                                                     errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = 60
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # vo/prodSourceLabel specific action
                    impl = self.getImpl(vo, prodSourceLabel, subType=self.subStr)
                    if impl is not None:
                        plugin_name = impl.__class__.__name__
                        tmpLog.info('pre-action for vo={} label={} cls={}'.format(vo, prodSourceLabel, plugin_name))
                        impl.pre_action(tmpLog, vo, prodSourceLabel, self.pid)
                        tmpLog.info('do action for vo={} label={} cls={}'.format(vo, prodSourceLabel, plugin_name))
                        tmpStat = impl.doAction()
                        if tmpStat != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to run special action for vo={} label={} cls={}'.format(
                                vo, prodSourceLabel, plugin_name))
                        else:
                            tmpLog.info('done for vo={} label={} cls={}'.format(vo, prodSourceLabel, plugin_name))
            tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,
                                                                     errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep(max_val=loopCycle)
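# Illustrative sketch (stdlib only; not part of the original module): the
# fixed-cycle sleep used by both main loops above. Note that timedelta.seconds
# wraps at one day; total_seconds() is the safer choice if a cycle could ever
# run that long.
import datetime
import time

def _sketch_sleep_to_cycle(start_time, loop_cycle=60):
    elapsed = (datetime.datetime.utcnow() - start_time).total_seconds()
    sleep_period = loop_cycle - elapsed
    if sleep_period > 0:
        time.sleep(sleep_period)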
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for taskSpec in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
                tmpLog.info('start')
                tmpStat = Interaction.SC_SUCCEEDED
                # get impl
                impl = self.implFactory.instantiateImpl(taskSpec.vo, taskSpec.prodSourceLabel, None,
                                                        self.taskBufferIF, self.ddmIF)
                if impl is None:
                    # post-processor is undefined
                    tmpLog.error('post-processor is undefined for vo={0} sourceLabel={1}'.format(
                        taskSpec.vo, taskSpec.prodSourceLabel))
                    tmpStat = Interaction.SC_FATAL
                # execute
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('post-process with {0}'.format(impl.__class__.__name__))
                    try:
                        impl.doPostProcess(taskSpec, tmpLog)
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error('doPostProcess failed with {0}:{1}'.format(errtype.__name__, errvalue))
                        tmpStat = Interaction.SC_FATAL
                # done
                if tmpStat == Interaction.SC_FATAL:
                    # task is broken
                    tmpErrStr = 'post-process failed'
                    tmpLog.error(tmpErrStr)
                    taskSpec.status = 'broken'
                    taskSpec.setErrDiag(tmpErrStr)
                    taskSpec.lockedBy = None
                    self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID})
                elif tmpStat == Interaction.SC_FAILED:
                    # temporary failure: put the task on hold and retry later
                    tmpErrStr = 'post processing failed'
                    taskSpec.setOnHold()
                    taskSpec.setErrDiag(tmpErrStr, True)
                    taskSpec.lockedBy = None
                    self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID})
                    tmpLog.info('set task_status={0} since {1}'.format(taskSpec.status, taskSpec.errorDialog))
                    continue
                # final procedure
                try:
                    impl.doFinalProcedure(taskSpec, tmpLog)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('doFinalProcedure failed with {0}:{1}'.format(errtype.__name__, errvalue))
                # done
                tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                       errtype.__name__, errvalue))
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 100
            taskList = self.taskList.get(nTasks)
            totalTasks, idxTasks = self.taskList.stat()
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # make logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskIDs={2}'.format(idxTasks, totalTasks, taskList))
            tmpStat = Interaction.SC_SUCCEEDED
            # get TaskSpecs
            tmpListToAssign = []
            for tmpTaskItem in taskList:
                tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(None, None, None, None, None,
                                                                           simTasks=[tmpTaskItem],
                                                                           readMinFiles=True)
                if tmpListItem is None:
                    # failed
                    tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(tmpTaskItem))
                    tmpStat = Interaction.SC_FAILED
                    break
                tmpListToAssign += tmpListItem
            # get impl
            if tmpStat == Interaction.SC_SUCCEEDED:
                tmpLog.info('getting Impl')
                try:
                    impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                    if impl is None:
                        # task broker is undefined
                        tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(
                            self.vo, self.prodSourceLabel))
                        tmpStat = Interaction.SC_FAILED
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    tmpStat = Interaction.SC_FAILED
            # brokerage
            if tmpStat == Interaction.SC_SUCCEEDED:
                tmpLog.info('brokerage with {0} for {1} tasks'.format(impl.__class__.__name__, len(tmpListToAssign)))
                try:
                    tmpStat = impl.doBrokerage(tmpListToAssign, self.vo,
                                               self.prodSourceLabel, self.workQueue)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('doBrokerage failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    tmpStat = Interaction.SC_FAILED
            # register
            if tmpStat != Interaction.SC_SUCCEEDED:
                tmpLog.error('failed')
            else:
                tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                       errtype.__name__, errvalue))
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 100
            taskList = self.taskList.get(nTasks)
            totalTasks, idxTasks = self.taskList.stat()
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # make logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskCheckerThread {0}/{1} for jediTaskIDs={2}'.format(idxTasks, totalTasks, taskList))
            tmpStat = Interaction.SC_SUCCEEDED
            # get TaskSpecs
            taskSpecList = []
            for jediTaskID in taskList:
                tmpRet, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False)
                if tmpRet and taskSpec is not None:
                    taskSpecList.append(taskSpec)
                else:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
            if taskSpecList != []:
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                        if impl is None:
                            # task brokerage is undefined
                            tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(
                                self.vo, self.prodSourceLabel))
                            tmpStat = Interaction.SC_FAILED
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__, errvalue))
                        tmpStat = Interaction.SC_FAILED
                # check
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('brokerage with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat, taskCloudMap = impl.doCheck(taskSpecList)
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpLog.error('doCheck failed with {0}:{1}'.format(errtype.__name__, errvalue))
                        tmpStat = Interaction.SC_FAILED
                # update
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to check assignment')
                else:
                    tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
                    tmpLog.info('done with {0} for {1}'.format(tmpRet, str(taskCloudMap)))
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                       errtype.__name__, errvalue))
def finger(self, userName):
    methodName = 'finger'
    methodName = '{0} userName={1}'.format(methodName, userName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        userName = parse_dn(userName)
        # exec
        tmpRet = infoClient().finger(userName)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0}:{1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, tmpRet
def setDatasetOwner(self, datasetName, userName):
    methodName = 'setDatasetOwner'
    methodName = '{0} datasetName={1} userName={2}'.format(methodName, datasetName, userName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        userName = parse_dn(userName)
        # get DQ2 API
        dq2 = DQ2()
        # set
        dq2.setMetaDataAttribute(datasetName, 'owner', userName)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def registerDatasetLocation(self, datasetName, location, lifetime=None, owner=None):
    methodName = 'registerDatasetLocation'
    methodName = '{0} datasetName={1} location={2}'.format(methodName, datasetName, location)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        owner = parse_dn(owner)
        # get DQ2 API
        dq2 = DQ2()
        # set
        dq2.registerDatasetLocation(datasetName, location, lifetime=lifetime)
        dq2.setReplicaMetaDataAttribute(datasetName, location, 'owner', owner)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def setDatasetMetadata(self, datasetName, metadataName, metadataValue):
    methodName = 'setDatasetMetadata'
    methodName = '{0} datasetName={1} metadataName={2} metadataValue={3}'.format(methodName, datasetName,
                                                                                 metadataName, metadataValue)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        # get DQ2 API
        dq2 = DQ2()
        # set
        dq2.setMetaDataAttribute(datasetName, metadataName, metadataValue)
    except DQUnknownDatasetException:
        # ignore unknown datasets
        pass
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED, True
def expandContainer(self, containerName):
    methodName = 'expandContainer'
    methodName = '{0} contName={1}'.format(methodName, containerName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    try:
        dsList = []
        # get real names
        tmpS, tmpRealNameList = self.listDatasets(containerName)
        if tmpS != self.SC_SUCCEEDED:
            tmpLog.error('failed to get real names')
            return tmpS, tmpRealNameList
        # loop over all names
        for tmpRealName in tmpRealNameList:
            # container
            if tmpRealName.endswith('/'):
                # get contents
                tmpS, tmpO = self.listDatasetsInContainer(tmpRealName)
                if tmpS != self.SC_SUCCEEDED:
                    tmpLog.error('failed to get datasets in {0}'.format(tmpRealName))
                    return tmpS, tmpO
            else:
                tmpO = [tmpRealName]
            # collect dataset names
            for tmpStr in tmpO:
                if tmpStr not in dsList:
                    dsList.append(tmpStr)
        dsList.sort()
        # return
        tmpLog.info('got {0}'.format(str(dsList)))
        return self.SC_SUCCEEDED, dsList
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error('failed with {0}'.format(errMsg))
        return errCode, '{0} : {1}'.format(methodName, errMsg)
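# Illustrative sketch (hypothetical lister, stdlib only; not part of the
# original module): the expansion rule used above, where names ending in '/'
# are containers whose constituents are listed, plain dataset names pass
# through, and duplicates are dropped before sorting.
def _sketch_expand(names, list_container):
    ds_list = []
    for name in names:
        members = list_container(name) if name.endswith('/') else [name]
        for member in members:
            if member not in ds_list:
                ds_list.append(member)
    return sorted(ds_list)

# e.g. a two-dataset container plus one plain dataset
assert _sketch_expand(['cont/', 'ds3'],
                      lambda c: ['ds1', 'ds2']) == ['ds1', 'ds2', 'ds3']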
def deleteDataset(self, datasetName, emptyOnly, ignoreUnknown=False):
    methodName = 'deleteDataset'
    methodName = '{0} datasetName={1}'.format(methodName, datasetName)
    tmpLog = MsgWrapper(logger, methodName)
    tmpLog.info('start')
    isOK = True
    retStr = ''
    nFiles = -1
    try:
        # get DQ2 API
        dq2 = DQ2()
        # get the number of files
        if emptyOnly:
            nFiles = dq2.getNumberOfFiles(datasetName)
        # erase
        if not emptyOnly or nFiles == 0:
            dq2.eraseDataset(datasetName)
            retStr = 'deleted {0}'.format(datasetName)
        else:
            retStr = 'keep {0} where {1} files are available'.format(datasetName, nFiles)
    except DQUnknownDatasetException:
        if ignoreUnknown:
            pass
        else:
            isOK = False
    except Exception:
        isOK = False
    if isOK:
        tmpLog.info('done')
        return self.SC_SUCCEEDED, retStr
    else:
        errtype, errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__, errvalue)
        tmpLog.error(errMsg)
        return errCode, '{0} : {1}'.format(methodName, errMsg)
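# Illustrative sketch (stdlib only; not part of the original module): the
# emptyOnly guard above deletes a dataset unconditionally, or only when its
# file count is zero.
def _sketch_should_delete(empty_only, n_files):
    return (not empty_only) or n_files == 0

assert _sketch_should_delete(False, 42)    # forced deletion
assert _sketch_should_delete(True, 0)      # empty -> delete
assert not _sketch_should_delete(True, 3)  # keep non-empty dataset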
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, splitRule, taskStatus, parent_tid in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, '< jediTaskID={0} >'.format(jediTaskID))
                tmpLog.debug('start')
                tmpStat = Interaction.SC_SUCCEEDED
                errStr = ''
                # read task parameters
                try:
                    taskParam = None
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__, errvalue)
                    tmpLog.debug(taskParam)
                    tmpLog.error(errStr)
                    tmpStat = Interaction.SC_FAILED
                    continue
                # get impl
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('getting Impl')
                    try:
                        # get VO and sourceLabel
                        vo = taskParamMap['vo']
                        prodSourceLabel = taskParamMap['prodSourceLabel']
                        taskType = taskParamMap['taskType']
                        tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo, prodSourceLabel, taskType))
                        # get impl
                        impl = self.implFactory.instantiateImpl(vo, prodSourceLabel, taskType,
                                                                self.taskBufferIF, self.ddmIF)
                        if impl is None:
                            # task refiner is undefined
                            errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo, prodSourceLabel)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # extract common parameters
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('extracting common')
                    try:
                        # initialize impl
                        impl.initializeRefiner(tmpLog)
                        impl.oldTaskStatus = taskStatus
                        # extract common parameters
                        impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule)
                        # set parent tid
                        if parent_tid not in [None, jediTaskID]:
                            impl.taskSpec.parent_tid = parent_tid
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(
                            errtype.__name__, errvalue, traceback.format_exc())
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                # check attribute length
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('checking attribute length')
                    if not impl.taskSpec.checkAttrLength():
                        tmpLog.error(impl.taskSpec.errorDialog)
                        tmpStat = Interaction.SC_FAILED
                # check parent
                noWaitParent = False
                parentState = None
                if tmpStat == Interaction.SC_SUCCEEDED:
                    if parent_tid not in [None, jediTaskID]:
                        tmpLog.info('check parent task')
                        try:
                            tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                            parentState = tmpStat
                            if tmpStat == 'completed':
                                # parent is done
                                tmpStat = Interaction.SC_SUCCEEDED
                            elif tmpStat == 'running':
                                if not impl.taskSpec.noWaitParent():
                                    # parent is running
                                    errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                    impl.taskSpec.status = taskStatus
                                    impl.taskSpec.setOnHold()
                                    impl.taskSpec.setErrDiag(errStr)
                                    tmpLog.info(errStr)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec,
                                                                      {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                      oldStatus=[taskStatus], setFrozenTime=False)
                                    continue
                                else:
                                    # do not wait for parent
                                    tmpStat = Interaction.SC_SUCCEEDED
                                    noWaitParent = True
                            else:
                                # parent is corrupted
                                tmpStat = Interaction.SC_FAILED
                                tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                impl.taskSpec.setErrDiag(tmpErrStr)
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # refine
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                    try:
                        tmpStat = impl.doRefine(jediTaskID, taskParamMap)
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        # wait for unknown input if noWaitParent or waitInput
                        if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput())
                                and errtype == JediException.UnknownDatasetError) or parentState == 'running' \
                                or errtype == Interaction.JEDITemporaryError:
                            if impl.taskSpec.noWaitParent() or parentState == 'running':
                                tmpErrStr = 'pending until parent produces input'
                                setFrozenTime = False
                            elif errtype == Interaction.JEDITemporaryError:
                                tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue)
                                setFrozenTime = True
                            else:
                                tmpErrStr = 'pending until input is staged'
                                setFrozenTime = True
                            impl.taskSpec.status = taskStatus
                            impl.taskSpec.setOnHold()
                            impl.taskSpec.setErrDiag(tmpErrStr)
                            tmpLog.info(tmpErrStr)
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec,
                                                              {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus],
                                                              insertUnknown=impl.unknownDatasetList,
                                                              setFrozenTime=setFrozenTime)
                            continue
                        else:
                            errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__, errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                # register
                if tmpStat != Interaction.SC_SUCCEEDED:
                    tmpLog.error('failed to refine the task')
                    if impl is None or impl.taskSpec is None:
                        tmpTaskSpec = JediTaskSpec()
                        tmpTaskSpec.jediTaskID = jediTaskID
                    else:
                        tmpTaskSpec = impl.taskSpec
                    tmpTaskSpec.status = 'tobroken'
                    if errStr != '':
                        tmpTaskSpec.setErrDiag(errStr, True)
                    self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': tmpTaskSpec.jediTaskID},
                                                      oldStatus=[taskStatus])
                else:
                    tmpLog.info('registering')
                    # fill JEDI tables
                    try:
                        # enable protection against task duplication
                        if 'uniqueTaskName' in taskParamMap and taskParamMap['uniqueTaskName'] and \
                                not impl.taskSpec.checkPreProcessed():
                            uniqueTaskName = True
                        else:
                            uniqueTaskName = False
                        strTaskParams = None
                        if impl.updatedTaskParams is not None:
                            strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                        if taskStatus == 'registered':
                            # unset pre-process flag
                            if impl.taskSpec.checkPreProcessed():
                                impl.taskSpec.setPostPreProcess()
                            # full registration
                            tmpStat, newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(
                                jediTaskID, impl.taskSpec,
                                impl.inMasterDatasetSpec,
                                impl.inSecDatasetSpecList,
                                impl.outDatasetSpecList,
                                impl.outputTemplateMap,
                                impl.jobParamsTemplate,
                                strTaskParams,
                                impl.unmergeMasterDatasetSpec,
                                impl.unmergeDatasetSpecMap,
                                uniqueTaskName,
                                taskStatus)
                            if not tmpStat:
                                tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                tmpLog.error(tmpErrStr)
                                impl.taskSpec.status = newTaskStatus
                                impl.taskSpec.setErrDiag(tmpErrStr, True)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec,
                                                                  {'jediTaskID': impl.taskSpec.jediTaskID},
                                                                  oldStatus=[taskStatus])
                            tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                            tmpLog.info(tmpMsg)
                            tmpLog.sendMsg(tmpMsg, self.msgType)
                        else:
                            # disable scouts if the previous attempt didn't use them
                            if not impl.taskSpec.useScout(splitRule):
                                impl.taskSpec.setUseScout(False)
                            # update task with new params
                            self.taskBufferIF.updateTask_JEDI(impl.taskSpec,
                                                              {'jediTaskID': impl.taskSpec.jediTaskID},
                                                              oldStatus=[taskStatus])
                            # append datasets for incremental execution
                            tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID, impl.inMasterDatasetSpec,
                                                                            impl.inSecDatasetSpecList)
                            if not tmpStat:
                                tmpLog.error('failed to append datasets for incexec')
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__, errvalue)
                        tmpLog.error(tmpErrStr)
                    else:
                        tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                       errtype.__name__, errvalue))
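# Illustrative sketch (hypothetical flags, stdlib only; not part of the
# original module): the pending-reason logic in the refine step above, which
# distinguishes "parent still producing input", "temporary DDM problem" and
# "input being staged", and only freezes the pending timer for the latter two.
def _sketch_pending_reason(no_wait_parent, parent_running, ddm_problem):
    if no_wait_parent or parent_running:
        return 'pending until parent produces input', False  # no frozen time
    if ddm_problem:
        return 'pending due to DDM problem', True
    return 'pending until input is staged', True

assert _sketch_pending_reason(True, False, False)[1] is False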
def doSetup(self, taskSpec, datasetToRegister, pandaJobs):
    # make logger
    tmpLog = MsgWrapper(logger, "<jediTaskID={0}>".format(taskSpec.jediTaskID))
    tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel, taskSpec.taskType))
    # return codes
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    retOK = self.SC_SUCCEEDED
    try:
        # get DDM I/F
        ddmIF = self.ddmIF.getInterface(taskSpec.vo)
        # register datasets
        if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']:
            # prod vs anal
            userSetup = False
            if taskSpec.prodSourceLabel in ['user']:
                userSetup = True
                # collect datasetIDs to register datasets/containers just in case
                for tmpPandaJob in pandaJobs:
                    if not tmpPandaJob.produceUnMerge():
                        for tmpFileSpec in tmpPandaJob.Files:
                            if tmpFileSpec.type in ['output', 'log']:
                                if tmpFileSpec.datasetID not in datasetToRegister:
                                    datasetToRegister.append(tmpFileSpec.datasetID)
            tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
            # get site mapper
            siteMapper = self.taskBufferIF.getSiteMapper()
            # loop over all datasets
            avDatasetList = []
            cnDatasetMap = {}
            for datasetID in datasetToRegister:
                # get output and log datasets
                tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID)
                if not tmpStat:
                    tmpLog.error('failed to get output and log datasets')
                    return retFatal
                # DDM backend
                ddmBackEnd = taskSpec.getDdmBackEnd()
                tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                # check if dataset and container are available in DDM
                for targetName in [datasetSpec.datasetName, datasetSpec.containerName]:
                    if targetName is None:
                        continue
                    if targetName not in avDatasetList:
                        # set lifetime
                        if targetName.startswith('panda'):
                            if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed':
                                lifetime = 365
                            else:
                                lifetime = 14
                        else:
                            lifetime = None
                        # check dataset/container in DDM
                        tmpList = ddmIF.listDatasets(targetName)
                        if tmpList == []:
                            # get location
                            location = None
                            locForRule = None
                            if targetName == datasetSpec.datasetName:
                                # dataset
                                if datasetSpec.site in ['', None]:
                                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                        locForRule = datasetSpec.destination
                                    elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None:
                                        location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken)
                                    elif taskSpec.cloud is not None:
                                        # use T1 SE
                                        tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source']
                                        location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken)
                                else:
                                    tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken))
                                    location = siteMapper.getDdmEndpoint(datasetSpec.site, datasetSpec.storageToken)
                            if locForRule is None:
                                locForRule = location
                            # set metadata
                            if taskSpec.prodSourceLabel in ['managed', 'test'] and targetName == datasetSpec.datasetName:
                                metaData = {}
                                metaData['task_id'] = taskSpec.jediTaskID
                                if taskSpec.campaign not in [None, '']:
                                    metaData['campaign'] = taskSpec.campaign
                                if datasetSpec.getTransient() is not None:
                                    metaData['transient'] = datasetSpec.getTransient()
                            else:
                                metaData = None
                            # register dataset/container
                            tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(
                                targetName, location, ddmBackEnd, lifetime, str(metaData)))
                            tmpStat = ddmIF.registerNewDataset(targetName, backEnd=ddmBackEnd, location=location,
                                                               lifetime=lifetime, metaData=metaData)
                            if not tmpStat:
                                tmpLog.error('failed to register {0}'.format(targetName))
                                return retFatal
                            # procedures for user datasets
                            if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                # register location
                                tmpToRegister = False
                                if userSetup and targetName == datasetSpec.datasetName and \
                                        datasetSpec.site not in ['', None]:
                                    userName = taskSpec.userName
                                    grouping = None
                                    tmpToRegister = True
                                elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                                    userName = None
                                    grouping = 'NONE'
                                    tmpToRegister = True
                                if tmpToRegister:
                                    activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                                    tmpLog.info('registering location={0} lifetime={1}days activity={2} grouping={3}'.format(
                                        locForRule, lifetime, activity, grouping))
                                    tmpStat = ddmIF.registerDatasetLocation(targetName, locForRule, owner=userName,
                                                                            lifetime=lifetime, backEnd=ddmBackEnd,
                                                                            activity=activity, grouping=grouping)
                                    if not tmpStat:
                                        tmpLog.error('failed to register location {0} with {2} for {1}'.format(
                                            locForRule, targetName, ddmBackEnd))
                                        return retFatal
                            avDatasetList.append(targetName)
                        else:
                            tmpLog.info('{0} already registered'.format(targetName))
                # check if dataset is in the container
                if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName:
                    # get list of constituent datasets in the container
                    if datasetSpec.containerName not in cnDatasetMap:
                        cnDatasetMap[datasetSpec.containerName] = \
                            ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                    # add dataset
                    if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]:
                        tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName))
                        tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName, [datasetSpec.datasetName],
                                                               backEnd=ddmBackEnd)
                        if not tmpStat:
                            tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                           datasetSpec.containerName))
                            return retFatal
                        cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                    else:
                        tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName, datasetSpec.containerName))
                # update dataset
                datasetSpec.status = 'registered'
                self.taskBufferIF.updateDataset_JEDI(datasetSpec, {'jediTaskID': taskSpec.jediTaskID,
                                                                   'datasetID': datasetID})
                # register ES datasets
                if False:  # FIXME taskSpec.useEventService() and not taskSpec.useJobCloning() and datasetSpec.type == 'output':
                    targetName = datasetSpec.datasetName + EventServiceUtils.esSuffixDDM
                    location = None
                    metaData = {}
                    metaData['task_id'] = taskSpec.jediTaskID
                    metaData['hidden'] = True
                    tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(
                        targetName, location, str(metaData)))
                    tmpStat = ddmIF.registerNewDataset(targetName, location=location, metaData=metaData)
                    if not tmpStat:
                        tmpLog.error('failed to register ES dataset {0}'.format(targetName))
                        return retFatal
                    # register rule
                    location = 'type=ES'
                    activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel)
                    grouping = 'NONE'
                    tmpLog.info('registering location={0} activity={1} grouping={2}'.format(
                        location, activity, grouping))
                    tmpStat = ddmIF.registerDatasetLocation(targetName, location, activity=activity,
                                                            grouping=grouping)
                    if not tmpStat:
                        tmpLog.error('failed to register location {0} with {2} for {1}'.format(
                            location, targetName, activity))
                        return retFatal
        # open datasets
        if taskSpec.prodSourceLabel in ['managed', 'test']:
            # get the list of output/log datasets
            outDatasetList = []
            for tmpPandaJob in pandaJobs:
                for tmpFileSpec in tmpPandaJob.Files:
                    if tmpFileSpec.type in ['output', 'log']:
                        if tmpFileSpec.destinationDBlock not in outDatasetList:
                            outDatasetList.append(tmpFileSpec.destinationDBlock)
            # open datasets
            for outDataset in outDatasetList:
                tmpLog.info('open {0}'.format(outDataset))
                ddmIF.openDataset(outDataset)
                # unset lifetime
                ddmIF.setDatasetMetadata(outDataset, 'lifetime', None)
        # return
        tmpLog.info('done')
        return retOK
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__, errvalue))
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retFatal
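# Illustrative sketch (stdlib only; not part of the original module): the
# lifetime rule used in doSetup above, where only 'panda*' datasets get a
# finite lifetime (one year for managed transform logs, two weeks otherwise)
# and everything else is kept indefinitely.
def _sketch_lifetime_days(name, ds_type, prod_source_label):
    if not name.startswith('panda'):
        return None
    if ds_type == 'trn_log' and prod_source_label == 'managed':
        return 365
    return 14

assert _sketch_lifetime_days('panda.x.log', 'trn_log', 'managed') == 365
assert _sketch_lifetime_days('mc16.something', 'output', 'managed') is None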
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskDsList = self.taskDsList.get(nTasks)
            # no more datasets
            if len(taskDsList) == 0:
                self.logger.debug('%s terminating since no more items' % self.__class__.__name__)
                return
            # loop over all tasks
            for jediTaskID, dsList in taskDsList:
                allUpdated = True
                taskBroken = False
                taskOnHold = False
                runningTask = False
                missingMap = {}
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(jediTaskID))
                # get task
                tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, self.pid, 10)
                if not tmpStat or taskSpec is None:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
                    continue
                try:
                    # get task parameters
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('task param conversion from json failed with {0}:{1}'.format(
                        errtype.__name__, errvalue))
                    taskBroken = True
                # renaming of parameters
                if 'nEventsPerInputFile' in taskParamMap:
                    taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile']
                # the number of files per job
                nFilesPerJob = None
                if 'nFilesPerJob' in taskParamMap:
                    nFilesPerJob = taskParamMap['nFilesPerJob']
                # the number of chunks used by scout
                nChunksForScout = 10
                # load XML
                if taskSpec.useLoadXML():
                    xmlConfig = taskParamMap['loadXML']
                else:
                    xmlConfig = None
                # check no wait
                noWaitParent = False
                if taskSpec.noWaitParent() and taskSpec.parent_tid not in [None, taskSpec.jediTaskID]:
                    tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid)
                    if tmpStat == 'running':
                        noWaitParent = True
                # loop over all datasets
                nFilesMaster = 0
                checkedMaster = False
                setFrozenTime = True
                if not taskBroken:
                    ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                    origNumFiles = None
                    if 'nFiles' in taskParamMap:
                        origNumFiles = taskParamMap['nFiles']
                    for datasetSpec in dsList:
                        tmpLog.info('start loop for {0}(id={1})'.format(datasetSpec.datasetName,
                                                                        datasetSpec.datasetID))
                        # get dataset metadata
                        tmpLog.info('get metadata')
                        gotMetadata = False
                        stateUpdateTime = datetime.datetime.utcnow()
                        try:
                            if not datasetSpec.isPseudo():
                                tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName)
                            else:
                                # dummy metadata for pseudo dataset
                                tmpMetadata = {'state': 'closed'}
                            # set mutable when parent is running and the dataset is open
                            if noWaitParent and tmpMetadata['state'] == 'open':
                                # dummy metadata when parent is running
                                tmpMetadata = {'state': 'mutable'}
                            gotMetadata = True
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            tmpLog.error('{0} failed to get metadata with {1}:{2}'.format(
                                self.__class__.__name__, errtype.__name__, errvalue))
                            if errtype == Interaction.JEDIFatalError:
                                # fatal error
                                datasetStatus = 'broken'
                                taskBroken = True
                                # update dataset status
                                self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                            else:
                                # temporary error
                                taskOnHold = True
                            taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName))
                            allUpdated = False
                        else:
                            # get file list specified in task parameters
                            fileList, includePatt, excludePatt = RefinerUtils.extractFileList(
                                taskParamMap, datasetSpec.datasetName)
                            # get the number of events in metadata
                            if 'getNumEventsInMetadata' in taskParamMap:
                                getNumEvents = True
                            else:
                                getNumEvents = False
                            # get file list from DDM
                            tmpLog.info('get files')
                            try:
                                useInFilesWithNewAttemptNr = False
                                skipDuplicate = not datasetSpec.useDuplicatedFiles()
                                if not datasetSpec.isPseudo():
                                    if fileList != [] and 'useInFilesInContainer' in taskParamMap and \
                                            datasetSpec.containerName not in ['', None]:
                                        # read files from container if file list is specified in task parameters
                                        tmpDatasetName = datasetSpec.containerName
                                    else:
                                        tmpDatasetName = datasetSpec.datasetName
                                    tmpRet = ddmIF.getFilesInDataset(tmpDatasetName,
                                                                     getNumEvents=getNumEvents,
                                                                     skipDuplicate=skipDuplicate)
                                    tmpLog.info('got {0} files in {1}'.format(len(tmpRet), tmpDatasetName))
                                    # remove lost files
                                    tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet)
                                    if tmpLostFiles != {}:
                                        tmpLog.info('found {0} lost files in {1}'.format(len(tmpLostFiles),
                                                                                         tmpDatasetName))
                                        for tmpListGUID, tmpLostLFN in tmpLostFiles.items():
                                            tmpLog.info('removed {0}'.format(tmpLostLFN))
                                            del tmpRet[tmpListGUID]
                                else:
                                    if not taskSpec.useListPFN():
                                        # dummy file list for pseudo dataset
                                        tmpRet = {str(uuid.uuid4()): {'lfn': 'pseudo_lfn',
                                                                      'scope': None,
                                                                      'filesize': 0,
                                                                      'checksum': None,
                                                                      }
                                                  }
                                    else:
                                        # make dummy file list for PFN list
                                        if 'nFiles' in taskParamMap:
                                            nPFN = taskParamMap['nFiles']
                                        else:
                                            nPFN = 1
                                        tmpRet = {}
                                        for iPFN in range(nPFN):
                                            tmpRet[str(uuid.uuid4())] = {
                                                'lfn': '{0:06d}:{1}'.format(
                                                    iPFN, taskParamMap['pfnList'][iPFN].split('/')[-1]),
                                                'scope': None,
                                                'filesize': 0,
                                                'checksum': None,
                                            }
                            except Exception:
                                errtype, errvalue = sys.exc_info()[:2]
                                tmpLog.error('{0} failed to get files with {1}:{2}'.format(
                                    self.__class__.__name__, errtype.__name__, errvalue))
                                if errtype == Interaction.JEDIFatalError:
                                    # fatal error
                                    datasetStatus = 'broken'
                                    taskBroken = True
                                    # update dataset status
                                    self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog)
                                else:
                                    # temporary error
                                    taskOnHold = True
                                taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName))
                                allUpdated = False
                            else:
                                # the number of events per file
                                nEventsPerFile = None
                                nEventsPerJob = None
                                nEventsPerRange = None
                                if (datasetSpec.isMaster() and 'nEventsPerFile' in taskParamMap) or \
                                        (datasetSpec.isPseudo() and 'nEvents' in taskParamMap):
                                    if 'nEventsPerFile' in taskParamMap:
                                        nEventsPerFile = taskParamMap['nEventsPerFile']
                                    elif datasetSpec.isPseudo() and 'nEvents' in taskParamMap:
                                        # use nEvents as nEventsPerFile for pseudo input
                                        nEventsPerFile = taskParamMap['nEvents']
                                    if 'nEventsPerJob' in taskParamMap:
                                        nEventsPerJob = taskParamMap['nEventsPerJob']
                                    elif 'nEventsPerRange' in taskParamMap:
                                        nEventsPerRange = taskParamMap['nEventsPerRange']
                                # max attempts
                                maxAttempt = None
                                if datasetSpec.isMaster() or datasetSpec.toKeepTrack():
                                    if taskSpec.disableAutoRetry():
                                        # disable auto retry
                                        maxAttempt = 1
                                    elif 'maxAttempt' in taskParamMap:
                                        maxAttempt = taskParamMap['maxAttempt']
                                    else:
                                        # use default value
                                        maxAttempt = 3
                                # first event number
                                firstEventNumber = None
                                if datasetSpec.isMaster():
                                    firstEventNumber = 1 + taskSpec.getFirstEventOffset()
                                # nMaxEvents
                                nMaxEvents = None
                                if datasetSpec.isMaster() and 'nEvents' in taskParamMap:
                                    nMaxEvents = taskParamMap['nEvents']
                                # nMaxFiles
                                nMaxFiles = None
                                if 'nFiles' in taskParamMap:
                                    if datasetSpec.isMaster():
                                        nMaxFiles = taskParamMap['nFiles']
                                    else:
                                        # calculate for secondary
                                        nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles)
                                        # multiplied by the number of jobs per file for event-level splitting
                                        if nMaxFiles is not None and 'nEventsPerFile' in taskParamMap:
                                            if 'nEventsPerJob' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / \
                                                        float(taskParamMap['nEventsPerJob'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                            elif 'nEventsPerRange' in taskParamMap:
                                                if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']:
                                                    nMaxFiles *= float(taskParamMap['nEventsPerFile']) / \
                                                        float(taskParamMap['nEventsPerRange'])
                                                    nMaxFiles = int(math.ceil(nMaxFiles))
                                # use scout
                                useScout = False
                                if datasetSpec.isMaster() and taskSpec.useScout() and datasetSpec.status != 'toupdate':
                                    useScout = True
                                # use files with new attempt numbers
                                useFilesWithNewAttemptNr = False
                                if not datasetSpec.isPseudo() and fileList != [] and \
                                        'useInFilesWithNewAttemptNr' in taskParamMap:
                                    useFilesWithNewAttemptNr = True
                                # feed files to the contents table
                                tmpLog.info('update contents')
                                retDB, missingFileList, nFilesUnique, diagMap = \
                                    self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec, tmpRet,
                                                                                 tmpMetadata['state'],
                                                                                 stateUpdateTime,
                                                                                 nEventsPerFile,
                                                                                 nEventsPerJob,
                                                                                 maxAttempt,
                                                                                 firstEventNumber,
                                                                                 nMaxFiles,
                                                                                 nMaxEvents,
                                                                                 useScout,
                                                                                 fileList,
                                                                                 useFilesWithNewAttemptNr,
                                                                                 nFilesPerJob,
                                                                                 nEventsPerRange,
                                                                                 nChunksForScout,
                                                                                 includePatt,
                                                                                 excludePatt,
                                                                                 xmlConfig,
                                                                                 noWaitParent,
                                                                                 taskSpec.parent_tid,
                                                                                 self.pid)
                                if retDB is False:
                                    taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(
                                        datasetSpec.datasetName, diagMap['errMsg']))
                                    allUpdated = False
                                    taskBroken = True
                                    break
                                elif retDB is None:
                                    # the dataset is locked by another or status is not applicable
                                    allUpdated = False
                                    tmpLog.info('escape since task or dataset is locked')
                                    break
                                elif missingFileList != []:
                                    # files are missing
                                    tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),
                                                                                  datasetSpec.datasetName)
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    allUpdated = False
                                    taskOnHold = True
                                    missingMap[datasetSpec.datasetName] = {'datasetSpec': datasetSpec,
                                                                           'missingFiles': missingFileList}
                                else:
                                    # reduce the number of files to be read
                                    if 'nFiles' in taskParamMap:
                                        if datasetSpec.isMaster():
                                            taskParamMap['nFiles'] -= nFilesUnique
                                    # reduce the number of files for scout
                                    if useScout:
                                        nChunksForScout = diagMap['nChunksForScout']
                                    # number of master input files
                                    if datasetSpec.isMaster():
                                        checkedMaster = True
                                        nFilesMaster += nFilesUnique
                                # running task
                                if diagMap['isRunningTask']:
                                    runningTask = True
                                # no activated pending input for noWait
                                if noWaitParent and diagMap['nActivatedPending'] == 0 and \
                                        not (useScout and nChunksForScout == 0):
                                    tmpErrStr = 'insufficient inputs are ready'
                                    tmpLog.info(tmpErrStr)
                                    taskSpec.setErrDiag(tmpErrStr)
                                    taskOnHold = True
                                    setFrozenTime = False
                                    break
                    tmpLog.info('end loop')
                # no master input
                if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster:
                    tmpErrStr = 'no master input files. input dataset is empty'
                    tmpLog.error(tmpErrStr)
                    taskSpec.setErrDiag(tmpErrStr, None)
                    if taskSpec.allowEmptyInput() or noWaitParent:
                        taskOnHold = True
                    else:
                        taskBroken = True
                # update task status
                if taskBroken:
                    # task is broken
                    taskSpec.status = 'tobroken'
                    tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                    tmpLog.info(tmpMsg)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec, pid=self.pid)
                # change task status unless the task is running
                if not runningTask:
                    if taskOnHold:
                        if not noWaitParent:
                            # initialize task generator
                            taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel)
                            tmpStat = taskGenerator.initializeMods(self.taskBufferIF,
                                                                   self.ddmIF.getInterface(taskSpec.vo))
                            if not tmpStat:
                                tmpErrStr = 'failed to initialize TaskGenerator'
                                tmpLog.error(tmpErrStr)
                                taskSpec.status = 'tobroken'
                                taskSpec.setErrDiag(tmpErrStr)
                            else:
                                # make parent tasks if necessary
                                tmpLog.info('make parent tasks with {0} (if necessary)'.format(
                                    taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel)))
                                tmpStat = taskGenerator.doGenerate(taskSpec, taskParamMap,
                                                                   missingFilesMap=missingMap)
                                if tmpStat == Interaction.SC_FATAL:
                                    # failed to make parent tasks
                                    taskSpec.status = 'tobroken'
                                    tmpLog.error('failed to make parent tasks')
                        # go to pending state
                        if taskSpec.status not in ['broken', 'tobroken']:
                            taskSpec.setOnHold()
                        tmpMsg = 'set task.status={0}'.format(taskSpec.status)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec,
                                                                                     pid=self.pid,
                                                                                     setFrozenTime=setFrozenTime)
                    elif allUpdated:
                        # all OK
                        allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(
                            jediTaskID, getTaskStatus=True, pid=self.pid)
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.info(tmpMsg)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                       errtype.__name__, errvalue))
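# Illustrative sketch (stdlib only; not part of the original module): the
# metadata override in the contents feeder above, where an open dataset is
# treated as 'mutable' while its parent task is still producing input, so
# partial contents can be fed without waiting for the dataset to close.
def _sketch_effective_state(ddm_state, parent_running):
    if parent_running and ddm_state == 'open':
        return 'mutable'
    return ddm_state

assert _sketch_effective_state('open', True) == 'mutable'
assert _sketch_effective_state('closed', True) == 'closed'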
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
    # params
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    minTotalWalltime = 50 * 1000 * 1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    nParallelCap = 5
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueID = workQueue.getID()
    workQueueName = workQueue.queue_name
    workQueueName = '_'.join(workQueue.queue_name.split(' '))
    msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                        workQueueName, resource_name)
    tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))
    # get central configuration values
    config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
    configQueueLimit = config_map[NQUEUELIMIT]['value']
    configQueueCap = config_map[NQUEUECAP]['value']
    configRunningCap = config_map[NRUNNINGCAP]['value']
    tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
                 .format(configQueueLimit, configQueueCap, configRunningCap))
    # check if unthrottled
    if not workQueue.throttled:
        msgBody = "PASS unthrottled since GS_throttled is False"
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
    # get the job statistics for our wq/gs and expand the stats map
    jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
    nRunning_rt = jobstats_map['nRunning_rt']
    nRunning_gs = jobstats_map['nRunning_gs']
    nRunning_runningcap = jobstats_map['nRunning_runningcap']
    nNotRun_rt = jobstats_map['nNotRun_rt']
    nNotRun_gs = jobstats_map['nNotRun_gs']
    nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
    nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
    nDefine_rt = jobstats_map['nDefine_rt']
    nDefine_gs = jobstats_map['nDefine_gs']
    nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
    nDefine_queuecap = jobstats_map['nDefine_queuecap']
    nWaiting_rt = jobstats_map['nWaiting_rt']
    nWaiting_gs = jobstats_map['nWaiting_gs']
    # check if higher prio tasks are waiting
    if workQueue.queue_name in non_rt_wqs:
        # find the highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName, workQueue)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
    else:
        # find the highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName,
                                                                                   workQueue, resource_name)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName,
                                                                         resource_name)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error("{0} {1}".format(msgHeader, msgBody))
        return self.retTmpError
    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB \
            or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(
        msgHeader, highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued))
    # set the maximum number of jobs to be submitted
    if workQueue.queue_name in non_rt_wqs:
        tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
    else:
        tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
    # use the lower limit to avoid creating too many _sub/_dis datasets
    nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)
    if configQueueLimit is not None:
        nQueueLimit = configQueueLimit
    else:
        nQueueLimit = nJobsInBunch * nBunch
    # use nPrestage for reprocessing
    if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
        # reset nJobsInBunch
        if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
            tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
            if tmpRemainingSlot > nJobsInBunch:
                nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)
    # set the number of jobs to be submitted, applying the cap if configured
    if configQueueCap is None:
        self.setMaxNumJobs(nJobsInBunch / nParallel)
    else:
        self.setMaxNumJobs(configQueueCap / nParallelCap)
    # get total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue, resource_name, cloudName)
    # log the current situation and limits
    tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(msgHeader, nQueueLimit,
                                                                       configRunningCap, configQueueCap))
    tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(
        msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs, nRunning_gs))
    tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(
        msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt, nRunning_rt, totWalltime))
    # check the number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    limitPriority = False
    if workQueue.queue_name not in non_rt_wqs \
            and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name in non_rt_wqs \
            and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
            and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
            (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and \
            (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                nNotRun_rt + nDefine_rt, nRunning_rt, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
            and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
            (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nDefine_queuelimit > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    elif configRunningCap and nRunning_runningcap > configRunningCap:
        # cap on running
        msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap, configRunningCap)
        tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
        tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
        return self.retMergeUnThr
    elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(nNotRun_queuecap + nDefine_queuecap,
                                                                         configQueueCap)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning', escapeChar=True)
            return self.retMergeUnThr
    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    else:
        # not enough jobs are queued
        if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
            tmpLog.debug(msgHeader + " not enough jobs queued")
            if workQueue.queue_name not in non_rt_wqs:
                self.notEnoughJobsQueued()
            self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))
    msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
    tmpLog.info(msgHeader + " " + msgBody)
    return self.retUnThrottled
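# Illustrative sketch (stdlib only; not part of the original module): the
# remaining-slot arithmetic used in toBeThrottled above, with made-up job
# counts. Jobs may be queued up to threshold x running; the bunch size is
# clamped to [nJobsInBunchMin, nJobsInBunchMax] to avoid creating too many
# _sub/_dis datasets.
def _sketch_jobs_in_bunch(n_running, n_not_run, threshold=2.0,
                          n_min=500, n_max=600):
    tmp_remaining = int(n_running * threshold - n_not_run)
    return min(max(n_min, tmp_remaining), n_max)

# e.g. 1000 running and 1500 queued leaves 500 slots -> submit 500 jobs
assert _sketch_jobs_in_bunch(1000, 1500) == 500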
def runImpl(self):
    # cutoff for disk in TB
    diskThreshold = self.taskBufferIF.getConfigValue(self.msgType,
                                                     'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name),
                                                     'jedi', 'atlas')
    if diskThreshold is None:
        diskThreshold = 100 * 1024
    # dataset types to ignore in the file availability check
    datasetTypeToSkipCheck = ['log']
    # thresholds for data availability check
    thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas')
    if thrInputSize is None:
        thrInputSize = 1
    thrInputSize *= 1024 * 1024 * 1024
    thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas')
    if thrInputNum is None:
        thrInputNum = 100
    thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas')
    if thrInputSizeFrac is None:
        thrInputSizeFrac = 10
    thrInputSizeFrac = float(thrInputSizeFrac) / 100
    thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas')
    if thrInputNumFrac is None:
        thrInputNumFrac = 10
    thrInputNumFrac = float(thrInputNumFrac) / 100
    cutOffRW = 50
    negWeightTape = 0.001
    minIoIntensityWithLD = self.taskBufferIF.getConfigValue(self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA',
                                                            'jedi', 'atlas')
    if minIoIntensityWithLD is None:
        minIoIntensityWithLD = 200
    minInputSizeWithLD = self.taskBufferIF.getConfigValue(self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA',
                                                          'jedi', 'atlas')
    if minInputSizeWithLD is None:
        minInputSizeWithLD = 10000
    maxTaskPrioWithLD = self.taskBufferIF.getConfigValue(self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA',
                                                         'jedi', 'atlas')
    if maxTaskPrioWithLD is None:
        maxTaskPrioWithLD = 800
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs'.format(
                    self.__class__.__name__, self.numTasks))
                return
            # loop over all tasks
            for taskSpec, inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID),
                                    monToken='jediTaskID={0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac:{3}'.format(
                    thrInputSize, thrInputNum, thrInputSizeFrac, thrInputNumFrac))
                # read task parameters
                try:
                    taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(taskSpec.jediTaskID)
                    taskParamMap = RefinerUtils.decodeJSON(taskParam)
                except Exception:
                    tmpLog.error('failed to read task params')
                    taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                    self.sendLogMessage(tmpLog)
                    continue
                # RW
                taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in siteMapper.nuclei:
                    candidateNucleus = taskSpec.nucleus
                elif taskSpec.nucleus in siteMapper.satellites:
                    nucleusList = siteMapper.satellites
                    candidateNucleus = taskSpec.nucleus
                else:
                    tmpLog.info('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleusSpec.state not in ['ACTIVE']:
                            tmpLog.info(' skip nucleus={0} due to status={1} criteria=-status'.format(
                                tmpNucleus, tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check status of transfer backlog
                    t1Weight = taskSpec.getT1Weight()
                    if t1Weight < 0:
                        tmpLog.info('skip transfer backlog check due to negative T1Weight')
                    else:
                        newNucleusList = {}
                        backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei()
                        for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                            if tmpNucleus in backlogged_nuclei:
                                tmpLog.info(' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'.format(
                                    tmpNucleus))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # check endpoint
                    fractionFreeSpace = {}
                    newNucleusList = {}
                    tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                        taskSpec.jediTaskID, ['output', 'log'])
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) is not None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssociatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP is None:
                                tmpLog.info(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(
                                    tmpNucleus, tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # check state
                            """
                            if tmpEP['state'] not in ['ACTIVE']:
                                tmpLog.info(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(
                                    tmpNucleus, tmpEP['ddm_endpoint_name'], tmpEP['state']))
                                toSkip = True
                                break
                            """
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            tmpSpaceToUse = 0
                            if tmpNucleus in self.fullRW:
                                # 0.25GB per cpuTime/corePower/day
                                tmpSpaceToUse = int(self.fullRW[tmpNucleus] / 10 / 24 / 3600 * 0.25)
                            if tmpSpaceSize - tmpSpaceToUse < diskThreshold:
                                tmpLog.info(' skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space'.format(
                                    tmpNucleus, tmpSpaceSize, tmpSpaceToUse, diskThreshold,
                                    tmpEP['ddm_endpoint_name']))
                                toSkip = True
                                break
                            # keep fraction of free space
                            if tmpNucleus not in fractionFreeSpace:
                                fractionFreeSpace[tmpNucleus] = {'total': 0, 'free': 0}
                            try:
                                tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                tmpOld = None
                            try:
                                tmpNew = float(tmpSpaceSize - tmpSpaceToUse) / float(tmpEP['space_total'])
                            except Exception:
                                tmpNew = None
                            if tmpNew is not None and (tmpOld is None or tmpNew < tmpOld):
                                fractionFreeSpace[tmpNucleus] = {'total': tmpEP['space_total'],
                                                                 'free': tmpSpaceSize - tmpSpaceToUse}
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),
                                                                                     diskThreshold / 1024))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF)
                    tmpSt, tmpRet = jobBroker.doBrokerage(taskSpec, taskSpec.cloud, inputChunk, None, True,
                                                          tmpSiteList, tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.error('no sites can run jobs')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.info(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(
                                tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.info('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # primary only
                        if taskParamMap.get('taskBrokerOnMaster') is True and not datasetSpec.isMaster():
                            continue
                        # use deep scan for primary dataset unless data carousel
                        if datasetSpec.isMaster() and not taskSpec.inputPreStaging():
                            deepScan = True
                        else:
                            deepScan = False
                        # get nuclei where data is available
                        tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper, self.ddmIF,
                                                                           datasetSpec.datasetName,
                                                                           list(nucleusList.keys()), deepScan)
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum
                        for tmpNucleus, tmpVals in iteritems(tmpRet):
                            if tmpNucleus not in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k, v + tmpVals[k])
                                                                 for (k, v) in iteritems(availableData[tmpNucleus]))
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        skipMsgList = []
                        for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                            if taskSpec.inputPreStaging() and availableData[tmpNucleus]['ava_num_any'] > 0:
                                # use incomplete replicas for data carousel since the completeness is guaranteed
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                            elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < \
                                    availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(
                                    tmpNucleus, availableData[tmpNucleus]['ava_size_any'],
                                    availableData[tmpNucleus]['tot_size'], thrInputSizeFrac)
                                skipMsgList.append(tmpMsg)
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < \
                                    availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(
                                    tmpNucleus, availableData[tmpNucleus]['ava_num_any'],
                                    availableData[tmpNucleus]['tot_num'], thrInputNumFrac)
                                skipMsgList.append(tmpMsg)
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        totInputSize = list(availableData.values())[0]['tot_size'] / 1024 / 1024 / 1024
                        data_locality_check_str = ('(ioIntensity ({0}) is None or less than {1} kBPerS '
                                                   'and input size ({2} GB) is less than {3}) '
                                                   'or task.currentPriority ({4}) is higher than or equal to {5}').format(
                            taskSpec.ioIntensity, minIoIntensityWithLD, int(totInputSize),
                            minInputSizeWithLD, taskSpec.currentPriority, maxTaskPrioWithLD)
                        if len(newNucleusList) > 0:
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                        elif ((taskSpec.ioIntensity is None or taskSpec.ioIntensity <= minIoIntensityWithLD)
                              and totInputSize <= minInputSizeWithLD) \
                                or taskSpec.currentPriority >= maxTaskPrioWithLD:
                            availableData = {}
                            tmpLog.info(' disable data locality check since no nucleus has input data, {}'.format(
                                data_locality_check_str))
                        else:
                            # no candidate + unavoidable data locality check
                            nucleusList = newNucleusList
                            for tmpMsg in skipMsgList:
                                tmpLog.info(tmpMsg)
                            tmpLog.info(' the following conditions are required to disable the data locality check: {}'.format(
                                data_locality_check_str))
                    tmpLog.info('{0} candidates passed data check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # weight
                    self.prioRW.acquire()
                    nucleusRW = self.prioRW[taskSpec.currentPriority]
                    self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus, tmpNucleusSpec in iteritems(nucleusList):
                        if tmpNucleus not in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus], cutOffRW)
                        # with data
                        if availableData != {}:
                            if availableData[tmpNucleus]['tot_size'] > 0:
                                weight *= float(availableData[tmpNucleus]['ava_size_any'])
                                weight /= float(availableData[tmpNucleus]['tot_size'])
                                wStr += '* ( available_input_size_DISKTAPE={0} )'.format(
                                    availableData[tmpNucleus]['ava_size_any'])
                                wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size'])
                                # negative weight for tape
                                if availableData[tmpNucleus]['ava_size_any'] > \
                                        availableData[tmpNucleus]['ava_size_disk']:
                                    weight *= negWeightTape
                                    wStr += '*( weight_TAPE={0} )'.format(negWeightTape)
                        # fraction of free space
                        if tmpNucleus in fractionFreeSpace:
                            try:
                                tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \
                                    float(fractionFreeSpace[tmpNucleus]['total'])
                                weight *= tmpFrac
                                wStr += '*( free_space={0} )/( total_space={1} )'.format(
                                    fractionFreeSpace[tmpNucleus]['free'], fractionFreeSpace[tmpNucleus]['total'])
                            except Exception:
                                pass
                        tmpLog.info(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus, weight, wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus, weight))
                    tmpLog.info('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection
                        tgtWeight = random.uniform(0, totalWeight)
                        candidateNucleus = None
                        for tmpNucleus, weight in nucleusweights:
                            tgtWeight -= weight
                            if tgtWeight <= 0:
                                candidateNucleus = tmpNucleus
                                break
                        if candidateNucleus is None:
                            candidateNucleus = nucleusweights[-1][0]
                    ######################################
                    # update
                    nucleusSpec = nucleusList[candidateNucleus]
                    # get output/log datasets
                    tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                        taskSpec.jediTaskID, ['output', 'log'])
                    # get destinations
                    retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,
                                                                                        tmpDatasetSpecs)}
                    tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                    tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus, tmpRet))
                    self.sendLogMessage(tmpLog)
                    if tmpRet:
                        tmpMsg = 'set task_status=ready'
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        # update RW table
                        self.prioRW.acquire()
                        for prio, rwMap in iteritems(self.prioRW):
                            if prio > taskSpec.currentPriority:
                                continue
                            if candidateNucleus in rwMap:
                                rwMap[candidateNucleus] += taskRW
                            else:
                                rwMap[candidateNucleus] = taskRW
                        self.prioRW.release()
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                errMsg = '{0}.runImpl() failed with {1} {2} '.format(
                    self.__class__.__name__, errtype.__name__, errvalue)
                errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
                errMsg += traceback.format_exc()
                logger.error(errMsg)
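# The final selection above is a weighted random draw (roulette-wheel
# selection): each candidate occupies a slice of [0, totalWeight] proportional
# to its weight, and a uniform random number picks the slice it falls into.
# A minimal standalone sketch of the same technique, with hypothetical
# candidate names (assumes a non-empty candidate list):

import random

def pick_weighted(candidates):
    # candidates: list of (name, weight) pairs with non-negative weights
    total = sum(w for _, w in candidates)
    target = random.uniform(0, total)
    for name, weight in candidates:
        target -= weight
        if target <= 0:
            return name
    # guard against floating-point leftovers, as the broker above does
    return candidates[-1][0]

# e.g. pick_weighted([('NUCLEUS_A', 0.7), ('NUCLEUS_B', 0.2), ('NUCLEUS_C', 0.1)])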
    def runImpl(self):
        while True:
            try:
                # get a part of list
                nTasks = 10
                taskList = self.taskList.get(nTasks)
                # no more datasets
                if len(taskList) == 0:
                    self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                    return
                # loop over all tasks
                for jediTaskID,splitRule,taskStatus,parent_tid in taskList:
                    # make logger
                    tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(jediTaskID))
                    tmpLog.info('start')
                    tmpStat = Interaction.SC_SUCCEEDED
                    errStr = ''
                    # read task parameters
                    try:
                        taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                        taskParamMap = RefinerUtils.decodeJSON(taskParam)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue)
                        tmpLog.error(errStr)
                        tmpStat = Interaction.SC_FAILED
                    # get impl
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        tmpLog.info('getting Impl')
                        try:
                            # get VO and sourceLabel
                            vo = taskParamMap['vo']
                            prodSourceLabel = taskParamMap['prodSourceLabel']
                            taskType = taskParamMap['taskType']
                            tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType))
                            # get impl
                            impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType,
                                                                    self.taskBufferIF,self.ddmIF)
                            if impl == None:
                                # task refiner is undefined
                                errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel)
                                tmpLog.error(errStr)
                                tmpStat = Interaction.SC_FAILED
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    # extract common parameters
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        tmpLog.info('extracting common')
                        try:
                            # initialize impl
                            impl.initializeRefiner(tmpLog)
                            # extract common parameters
                            impl.extractCommon(jediTaskID,taskParamMap,self.workQueueMapper,splitRule)
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            errStr = 'failed to extract common parameters with {0}:{1}'.format(errtype.__name__,errvalue)
                            tmpLog.error(errStr)
                            tmpStat = Interaction.SC_FAILED
                    # check parent
                    noWaitParent = False
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        if not parent_tid in [None,jediTaskID]:
                            tmpLog.info('check parent task')
                            try:
                                tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid)
                                if tmpStat == 'completed':
                                    # parent is done
                                    tmpStat = Interaction.SC_SUCCEEDED
                                elif tmpStat == 'running':
                                    if not impl.taskSpec.noWaitParent():
                                        # parent is running
                                        errStr = 'pending until parent task {0} is done'.format(parent_tid)
                                        impl.taskSpec.status = taskStatus
                                        impl.taskSpec.setOnHold()
                                        impl.taskSpec.setErrDiag(errStr)
                                        tmpLog.info(errStr)
                                        self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                        continue
                                    else:
                                        # do not wait for parent
                                        tmpStat = Interaction.SC_SUCCEEDED
                                        noWaitParent = True
                                else:
                                    # parent is corrupted
                                    tmpStat = Interaction.SC_FAILED
                                    tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid)
                                    impl.taskSpec.setErrDiag(tmpErrStr)
                            except:
                                errtype,errvalue = sys.exc_info()[:2]
                                errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue)
                                tmpLog.error(errStr)
                                tmpStat = Interaction.SC_FAILED
                    # refine
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        tmpLog.info('refining with {0}'.format(impl.__class__.__name__))
                        try:
                            tmpStat = impl.doRefine(jediTaskID,taskParamMap)
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            # no wait for parent
                            if impl.taskSpec.noWaitParent() and errtype == JediException.UnknownDatasetError:
                                impl.taskSpec.status = taskStatus
                                impl.taskSpec.setOnHold()
                                errStr = 'pending until parent produces input'
                                tmpLog.info(errStr)
                                self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                continue
                            else:
                                errStr = 'failed to refine task'
                                tmpLog.error(errStr)
                                tmpStat = Interaction.SC_FAILED
                    # register
                    if tmpStat != Interaction.SC_SUCCEEDED:
                        tmpLog.error('failed to refine the task')
                        if impl == None or impl.taskSpec == None:
                            tmpTaskSpec = JediTaskSpec()
                            tmpTaskSpec.jediTaskID = jediTaskID
                        else:
                            tmpTaskSpec = impl.taskSpec
                        tmpTaskSpec.status = 'tobroken'
                        if errStr != '':
                            tmpTaskSpec.setErrDiag(errStr,True)
                        self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID})
                    else:
                        tmpLog.info('registering')
                        # fill JEDI tables
                        try:
                            # enable protection against task duplication
                            if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \
                                    not impl.taskSpec.checkPreProcessed():
                                uniqueTaskName = True
                            else:
                                uniqueTaskName = False
                            strTaskParams = None
                            if impl.updatedTaskParams != None:
                                strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams)
                            if taskStatus == 'registered':
                                # unset pre-process flag
                                if impl.taskSpec.checkPreProcessed():
                                    impl.taskSpec.setPostPreProcess()
                                # full registration
                                tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec,
                                                                                                     impl.inMasterDatasetSpec,
                                                                                                     impl.inSecDatasetSpecList,
                                                                                                     impl.outDatasetSpecList,
                                                                                                     impl.outputTemplateMap,
                                                                                                     impl.jobParamsTemplate,
                                                                                                     strTaskParams,
                                                                                                     impl.unmergeMasterDatasetSpec,
                                                                                                     impl.unmergeDatasetSpecMap,
                                                                                                     uniqueTaskName)
                                if not tmpStat:
                                    tmpErrStr = 'failed to register the task to JEDI in a single shot'
                                    tmpLog.error(tmpErrStr)
                                    impl.taskSpec.status = 'tobroken'
                                    impl.taskSpec.setErrDiag(tmpErrStr,True)
                                    self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID})
                                tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                                tmpLog.info(tmpMsg)
                                tmpLog.sendMsg(tmpMsg,self.msgType)
                            else:
                                # appending for incremental execution
                                tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec,
                                                                                impl.inSecDatasetSpecList)
                                if not tmpStat:
                                    tmpLog.error('failed to append datasets for incexec')
                        except:
                            errtype,errvalue = sys.exc_info()[:2]
                            tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue)
                            tmpLog.error(tmpErrStr)
                        else:
                            tmpLog.info('done')
            except:
                errtype,errvalue = sys.exc_info()[:2]
                logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
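# The refiner above guards each step with a status flag instead of nesting try
# blocks, so a failure in any step short-circuits the rest. A minimal sketch
# of that pattern, with a hypothetical fetch_task_params() standing in for
# taskBufferIF.getTaskParamsWithID_JEDI:

import json

SC_SUCCEEDED, SC_FAILED = 0, 1

def refine_step_pattern(fetch_task_params, jedi_task_id):
    status = SC_SUCCEEDED
    task_param_map = None
    # step 1: read and decode task parameters
    try:
        task_param_map = json.loads(fetch_task_params(jedi_task_id))
    except Exception as exc:
        print('conversion to map from json failed with {0}'.format(exc))
        status = SC_FAILED
    # step 2 runs only if everything before it succeeded
    if status == SC_SUCCEEDED:
        print('vo={0}'.format(task_param_map.get('vo')))
    return status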
    def undo_preassign(self):
        tmp_log = MsgWrapper(logger, 'undo_preassign')
        # refresh
        self.refresh()
        # busy sites
        busy_sites_dict = self.get_busy_sites()
        # loop to undo preassignment
        for prod_source_label in self.prodSourceLabelList:
            # parameters from GDP config
            max_preassigned_tasks = self.taskBufferIF.getConfigValue(
                'queue_filler', 'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi', self.vo)
            if max_preassigned_tasks is None:
                max_preassigned_tasks = 3
            min_files_ready = self.taskBufferIF.getConfigValue(
                'queue_filler', 'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi', self.vo)
            if min_files_ready is None:
                min_files_ready = 50
            min_files_remaining = self.taskBufferIF.getConfigValue(
                'queue_filler', 'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi', self.vo)
            if min_files_remaining is None:
                min_files_remaining = 100
            # clean up outdated blacklist
            blacklist_duration_hours = 12
            blacklisted_tasks_map_orig = self._get_from_bt_cache()
            blacklisted_tasks_map = copy.deepcopy(blacklisted_tasks_map_orig)
            now_time = datetime.datetime.utcnow()
            min_allowed_time = now_time - datetime.timedelta(hours=blacklist_duration_hours)
            min_allowed_ts = int(min_allowed_time.timestamp())
            for ts_str in blacklisted_tasks_map_orig:
                ts = int(ts_str)
                if ts < min_allowed_ts:
                    del blacklisted_tasks_map[ts_str]
            self._update_to_bt_cache(blacklisted_tasks_map)
            n_bt_old = sum([len(bt_list) for bt_list in blacklisted_tasks_map_orig.values()])
            n_bt = sum([len(bt_list) for bt_list in blacklisted_tasks_map.values()])
            tmp_log.debug('done cleanup blacklist; before {n_bt_old} , now {n_bt} tasks in blacklist'.format(
                n_bt_old=n_bt_old, n_bt=n_bt))
            # get a copy of preassigned_tasks_map from cache
            preassigned_tasks_map_orig = self._get_from_pt_cache()
            preassigned_tasks_map = copy.deepcopy(preassigned_tasks_map_orig)
            # clean up task_orig_attr_map in cache
            task_orig_attr_map_orig = self._get_from_attr_cache()
            task_orig_attr_map = copy.deepcopy(task_orig_attr_map_orig)
            all_preassigned_taskids = set()
            for taskid_list in preassigned_tasks_map_orig.values():
                all_preassigned_taskids |= set(taskid_list)
            for taskid_str in task_orig_attr_map_orig:
                taskid = int(taskid_str)
                if taskid not in all_preassigned_taskids:
                    del task_orig_attr_map[taskid_str]
            self._update_to_attr_cache(task_orig_attr_map)
            # loop on preassigned tasks in cache
            for key_name in preassigned_tasks_map_orig:
                # parse key name = site + resource_type
                site, resource_type = key_name.split('|')
                # preassigned tasks in cache
                preassigned_tasks_cached = preassigned_tasks_map.get(key_name, [])
                # force_undo=True for all tasks in busy sites, and force_undo=False for tasks not in status to generate jobs
                force_undo = False
                if site in busy_sites_dict or len(preassigned_tasks_cached) > max_preassigned_tasks:
                    force_undo = True
                reason_str = 'site busy or offline or with too many preassigned tasks' if force_undo \
                    else 'task paused/terminated or without enough files to process'
                # parameters for undo, kinda ugly
                params_map = {
                    ':min_files_ready': min_files_ready,
                    ':min_files_remaining': min_files_remaining,
                }
                # undo preassign
                had_undo = False
                updated_tasks = []
                if DRY_RUN:
                    if force_undo:
                        updated_tasks = list(preassigned_tasks_cached)
                        n_tasks = len(updated_tasks)
                    else:
                        preassigned_tasks_list = []
                        preassigned_tasks_params_map = {}
                        for j, taskid in enumerate(preassigned_tasks_cached):
                            pt_param = ':pt_{0}'.format(j + 1)
                            preassigned_tasks_list.append(pt_param)
                            preassigned_tasks_params_map[pt_param] = taskid
                        if not preassigned_tasks_list:
                            continue
                        preassigned_tasks_params_str = ','.join(preassigned_tasks_list)
                        dry_sql_query = (
                            "SELECT t.jediTaskID "
                            "FROM {jedi_schema}.JEDI_Tasks t "
                            "WHERE t.jediTaskID IN ({preassigned_tasks_params_str}) "
                            "AND t.site IS NOT NULL "
                            "AND NOT ( "
                            "t.status IN ('ready','running') "
                            "AND EXISTS ( "
                            "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d "
                            "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' "
                            "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready AND d.nFiles-d.nFilesUsed>=:min_files_remaining "
                            ") "
                            ") "
                        ).format(jedi_schema=jedi_config.db.schemaJEDI,
                                 preassigned_tasks_params_str=preassigned_tasks_params_str)
                        res = self.taskBufferIF.querySQL(dry_sql_query, preassigned_tasks_params_map)
                        n_tasks = 0 if res is None else len(res)
                        if n_tasks > 0:
                            updated_tasks = [x[0] for x in res]
                    # tmp_log.debug('[dry run] {} {} force={}'.format(key_name, str(updated_tasks), force_undo))
                    had_undo = True
                    if n_tasks > 0:
                        tmp_log.debug('[dry run] {key_name:<64} {n_tasks:>3} preassigned tasks would be undone ({reason_str}) '.format(
                            key_name=key_name, n_tasks=n_tasks, reason_str=reason_str))
                else:
                    updated_tasks = self.taskBufferIF.undoPreassignedTasks_JEDI(
                        preassigned_tasks_cached, task_orig_attr_map=task_orig_attr_map,
                        params_map=params_map, force=force_undo)
                    if updated_tasks is None:
                        # dbproxy method failed
                        tmp_log.error('{key_name:<64} failed to undo preassigned tasks (force={force_undo})'.format(
                            key_name=key_name, force_undo=force_undo))
                    else:
                        had_undo = True
                        n_tasks = len(updated_tasks)
                        if n_tasks > 0:
                            tmp_log.info('{key_name:<64} {n_tasks:>3} preassigned tasks undone ({reason_str}) : {updated_tasks} '.format(
                                key_name=key_name, n_tasks=str(n_tasks), reason_str=reason_str, updated_tasks=updated_tasks))
                            # Kibana log
                            for taskid in updated_tasks:
                                tmp_log.debug('#ATM #KV jediTaskID={taskid} action=undo_preassign site={site} rtype={rtype} un-preassigned since {reason_str}'.format(
                                    taskid=taskid, site=site, rtype=resource_type, reason_str=reason_str))
                # update preassigned_tasks_map into cache
                if had_undo:
                    if force_undo:
                        del preassigned_tasks_map[key_name]
                    else:
                        tmp_tasks_set = set(preassigned_tasks_cached) - set(updated_tasks)
                        if not tmp_tasks_set:
                            del preassigned_tasks_map[key_name]
                        else:
                            preassigned_tasks_map[key_name] = list(tmp_tasks_set)
                    self._update_to_pt_cache(preassigned_tasks_map)
                # update blacklisted_tasks_map into cache
                if had_undo and not force_undo:
                    blacklisted_tasks_map_orig = self._get_from_bt_cache()
                    blacklisted_tasks_map = copy.deepcopy(blacklisted_tasks_map_orig)
                    now_time = datetime.datetime.utcnow()
                    now_rounded_ts = int(now_time.replace(minute=0, second=0, microsecond=0).timestamp())
                    ts_str = str(now_rounded_ts)
                    if ts_str in blacklisted_tasks_map_orig:
                        tmp_bt_list = blacklisted_tasks_map[ts_str]
                        blacklisted_tasks_map[ts_str] = list(set(tmp_bt_list) | set(updated_tasks))
                    else:
                        blacklisted_tasks_map[ts_str] = list(updated_tasks)
                    self._update_to_bt_cache(blacklisted_tasks_map)
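# The blacklist cache above is a dict keyed by hour-rounded epoch timestamps
# (stored as strings), each holding a list of task IDs, so "expiry" is just
# dropping keys older than a cutoff. A minimal sketch of that cleanup,
# assuming the same 12-hour retention and mirroring the naive-UTC timestamp
# handling used above:

import datetime

def clean_blacklist(blacklist_map, retention_hours=12):
    # blacklist_map: {str(epoch_ts_rounded_to_hour): [taskID, ...]}
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=retention_hours)
    cutoff_ts = int(cutoff.timestamp())
    return {ts_str: ids for ts_str, ids in blacklist_map.items()
            if int(ts_str) >= cutoff_ts}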
    def doSetup(self,taskSpec,datasetToRegister):
        # make logger
        tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID))
        tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType))
        tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister)))
        # returns
        retFatal = self.SC_FATAL
        retTmpError = self.SC_FAILED
        retOK = self.SC_SUCCEEDED
        try:
            if datasetToRegister != []:
                # prod vs anal
                userSetup = False
                if taskSpec.prodSourceLabel in ['user']:
                    userSetup = True
                # get DDM I/F
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site mapper
                siteMapper = self.taskBufferIF.getSiteMapper()
                # loop over all datasets
                avDatasetList = []
                cnDatasetMap = {}
                for datasetID in datasetToRegister:
                    # get output and log datasets
                    tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID))
                    tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID,
                                                                                  datasetID)
                    if not tmpStat:
                        tmpLog.error('failed to get output and log datasets')
                        return retFatal
                    tmpLog.info('checking {0}'.format(datasetSpec.datasetName))
                    # check if dataset and container are available in DDM
                    for targetName in [datasetSpec.datasetName,datasetSpec.containerName]:
                        if targetName == None:
                            continue
                        if not targetName in avDatasetList:
                            # check dataset/container in DDM
                            tmpList = ddmIF.listDatasets(targetName)
                            if tmpList == []:
                                # register dataset/container
                                tmpLog.info('registering {0}'.format(targetName))
                                tmpStat = ddmIF.registerNewDataset(targetName)
                                if not tmpStat:
                                    tmpLog.error('failed to register {0}'.format(targetName))
                                    return retFatal
                                # procedures for user
                                if userSetup:
                                    # set owner
                                    tmpLog.info('setting owner={0}'.format(taskSpec.userName))
                                    tmpStat = ddmIF.setDatasetOwner(targetName,taskSpec.userName)
                                    if not tmpStat:
                                        tmpLog.error('failed to set ownership {0} with {1}'.format(targetName,
                                                                                                   taskSpec.userName))
                                        return retFatal
                                    # register location
                                    if targetName == datasetSpec.datasetName and not datasetSpec.site in ['',None]:
                                        location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken)
                                        tmpLog.info('registering location={0}'.format(location))
                                        tmpStat = ddmIF.registerDatasetLocation(targetName,location,owner=taskSpec.userName)
                                        if not tmpStat:
                                            tmpLog.error('failed to register location {0} for {1}'.format(location,
                                                                                                          targetName))
                                            return retFatal
                                avDatasetList.append(targetName)
                            else:
                                tmpLog.info('{0} already registered'.format(targetName))
                    # check if dataset is in the container
                    if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName:
                        # get list of constituent datasets in the container
                        if not cnDatasetMap.has_key(datasetSpec.containerName):
                            cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName)
                        # add dataset
                        if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]:
                            tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                            tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName])
                            if not tmpStat:
                                tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName,
                                                                               datasetSpec.containerName))
                                return retFatal
                            cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName)
                        else:
                            tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName))
                    # update dataset
                    datasetSpec.status = 'registered'
                    self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID,
                                                                      'datasetID':datasetID})
            # return
            tmpLog.info('done')
            return retOK
        except:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue))
            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
            return retFatal
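# doSetup treats "already exists" as success: it lists the dataset first,
# registers only when the lookup comes back empty, and caches names it has
# already handled. A minimal sketch of that idempotent check-then-register
# pattern, with a hypothetical ddm object standing in for the DDM interface:

def ensure_registered(ddm, names, seen=None):
    # names may contain None (e.g. a dataset with no container), as above
    seen = set() if seen is None else seen
    for name in (n for n in names if n is not None):
        if name in seen:
            continue
        if ddm.listDatasets(name) == []:
            if not ddm.registerNewDataset(name):
                raise RuntimeError('failed to register {0}'.format(name))
        seen.add(name)
    return seen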
    def getAvailableFiles(self,datasetSpec,siteEndPointMap,siteMapper,ngGroup=[],checkLFC=False):
        # make logger
        methodName = 'getAvailableFiles'
        methodName += ' <datasetID={0}>'.format(datasetSpec.datasetID)
        tmpLog = MsgWrapper(logger,methodName)
        tmpLog.info('start datasetName={0}'.format(datasetSpec.datasetName))
        try:
            # list of NG endpoints
            ngEndPoints = []
            if 1 in ngGroup:
                ngEndPoints += ['_SCRATCHDISK$','_LOCALGROUPDISK$','_LOCALGROUPTAPE$','_USERDISK$',
                                '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
            if 2 in ngGroup:
                ngEndPoints += ['_LOCALGROUPTAPE$',
                                '_DAQ$','_TMPDISK$','_TZERO$','_GRIDFTP$','MOCKTEST$']
            # get all associated endpoints
            siteAllEndPointsMap = {}
            for siteName,endPointPattList in siteEndPointMap.iteritems():
                # get all endpoints matching with patterns
                allEndPointList = []
                for endPointPatt in endPointPattList:
                    if '*' in endPointPatt:
                        # wildcard
                        endPointPatt = endPointPatt.replace('*','.*')
                        for endPointToA in TiersOfATLAS.getAllDestinationSites():
                            if re.search('^'+endPointPatt+'$',endPointToA) != None:
                                if not endPointToA in allEndPointList:
                                    allEndPointList.append(endPointToA)
                    else:
                        # normal endpoint
                        if endPointPatt in TiersOfATLAS.getAllDestinationSites() and \
                                not endPointPatt in allEndPointList:
                            allEndPointList.append(endPointPatt)
                # get associated endpoints
                siteAllEndPointsMap[siteName] = []
                for endPoint in allEndPointList:
                    # append
                    if not self.checkNGEndPoint(endPoint,ngEndPoints) and \
                            not endPoint in siteAllEndPointsMap[siteName]:
                        siteAllEndPointsMap[siteName].append(endPoint)
                    else:
                        # already checked
                        continue
                    # get alternate name
                    altName = TiersOfATLAS.getSiteProperty(endPoint,'alternateName')
                    if altName != None and altName != ['']:
                        for assEndPoint in TiersOfATLAS.resolveGOC({altName[0]:None})[altName[0]]:
                            if not assEndPoint in siteAllEndPointsMap[siteName] and \
                                    not self.checkNGEndPoint(assEndPoint,ngEndPoints):
                                siteAllEndPointsMap[siteName].append(assEndPoint)
            # get replica map
            tmpStat,tmpOut = self.listDatasetReplicas(datasetSpec.datasetName)
            if tmpStat != self.SC_SUCCEEDED:
                tmpLog.error('failed to get dataset replicas with {0}'.format(tmpOut))
                raise tmpStat,tmpOut
            datasetReplicaMap = tmpOut
            # collect SE, LFC hosts, storage path, storage type
            lfcSeMap = {}
            storagePathMap = {}
            completeReplicaMap = {}
            siteHasCompleteReplica = False
            for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
                tmpLfcSeMap = {}
                tmpStoragePathMap = {}
                tmpSiteSpec = siteMapper.getSite(siteName)
                for tmpEndPoint in allEndPointList:
                    # storage type
                    if TiersOfATLAS.isTapeSite(tmpEndPoint):
                        storageType = 'localtape'
                    else:
                        storageType = 'localdisk'
                    # no scan when site has complete replicas
                    if datasetReplicaMap.has_key(tmpEndPoint) and datasetReplicaMap[tmpEndPoint][-1]['found'] != None \
                            and datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']:
                        completeReplicaMap[tmpEndPoint] = storageType
                        siteHasCompleteReplica = True
                    # no LFC scan for many-time datasets
                    if datasetSpec.isManyTime():
                        continue
                    # get LFC
                    lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint)
                    # add map
                    if not tmpLfcSeMap.has_key(lfc):
                        tmpLfcSeMap[lfc] = []
                    # get SE
                    seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                    tmpMatch = re.search('://([^:/]+):*\d*/',seStr)
                    if tmpMatch != None:
                        se = tmpMatch.group(1)
                        if not se in tmpLfcSeMap[lfc]:
                            tmpLfcSeMap[lfc].append(se)
                    else:
                        tmpLog.error('failed to extract SE from %s for %s:%s' % \
                                     (seStr,siteName,tmpEndPoint))
                    # get SE + path
                    seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm')
                    tmpMatch = re.search('(srm://.+)$',seStr)
                    if tmpMatch == None:
                        tmpLog.error('failed to extract SE+PATH from %s for %s:%s' % \
                                     (seStr,siteName,tmpEndPoint))
                        continue
                    # add full path to storage map
                    tmpSePath = tmpMatch.group(1)
                    tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
                    # add compact path
                    tmpSePath = re.sub('(:\d+)*/srm/[^\?]+\?SFN=','',tmpSePath)
                    tmpStoragePathMap[tmpSePath] = {'siteName':siteName,'storageType':storageType}
                # add to map to trigger LFC scan if complete replica is missing at the site
                if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                    pass
                elif not siteHasCompleteReplica or checkLFC:
                    for tmpKey,tmpVal in tmpLfcSeMap.iteritems():
                        if not lfcSeMap.has_key(tmpKey):
                            lfcSeMap[tmpKey] = []
                        lfcSeMap[tmpKey] += tmpVal
                    for tmpKey,tmpVal in tmpStoragePathMap.iteritems():
                        storagePathMap[tmpKey] = tmpVal
            # collect GUIDs and LFNs
            fileMap = {}
            lfnMap = {}
            lfnFileSpecMap = {}
            scopeMap = {}
            for tmpFile in datasetSpec.Files:
                fileMap[tmpFile.GUID] = tmpFile.lfn
                lfnMap[tmpFile.lfn] = tmpFile
                lfnFileSpecMap[tmpFile.lfn] = tmpFile
                scopeMap[tmpFile.lfn] = tmpFile.scope
            # get SURLs
            surlMap = {}
            for lfcHost,seList in lfcSeMap.iteritems():
                tmpLog.debug('lookup in LFC:{0} for {1}'.format(lfcHost,str(seList)))
                tmpStat,tmpRetMap = self.getSURLsFromLFC(fileMap,lfcHost,seList,scopes=scopeMap)
                tmpLog.debug(str(tmpStat))
                if tmpStat != self.SC_SUCCEEDED:
                    raise RuntimeError,tmpRetMap
                for lfn,surls in tmpRetMap.iteritems():
                    if not surlMap.has_key(lfn):
                        surlMap[lfn] = surls
                    else:
                        surlMap[lfn] += surls
            # make return
            returnMap = {}
            for siteName,allEndPointList in siteAllEndPointsMap.iteritems():
                # set default return values
                if not returnMap.has_key(siteName):
                    returnMap[siteName] = {'localdisk':[],'localtape':[],'cache':[],'remote':[]}
                # loop over all files
                tmpSiteSpec = siteMapper.getSite(siteName)
                # check if the file is cached
                if DataServiceUtils.isCachedFile(datasetSpec.datasetName,tmpSiteSpec):
                    for tmpFileSpec in datasetSpec.Files:
                        # add to cached file list
                        returnMap[siteName]['cache'].append(tmpFileSpec)
                # complete replicas
                if not checkLFC:
                    for tmpEndPoint in allEndPointList:
                        if completeReplicaMap.has_key(tmpEndPoint):
                            storageType = completeReplicaMap[tmpEndPoint]
                            returnMap[siteName][storageType] += datasetSpec.Files
            # loop over all available LFNs
            avaLFNs = surlMap.keys()
            avaLFNs.sort()
            for tmpLFN in avaLFNs:
                tmpFileSpec = lfnFileSpecMap[tmpLFN]
                # loop over all SURLs
                for tmpSURL in surlMap[tmpLFN]:
                    for tmpSePath in storagePathMap.keys():
                        # check SURL
                        if tmpSURL.startswith(tmpSePath):
                            # add
                            siteName = storagePathMap[tmpSePath]['siteName']
                            storageType = storagePathMap[tmpSePath]['storageType']
                            if not tmpFileSpec in returnMap[siteName][storageType]:
                                returnMap[siteName][storageType].append(tmpFileSpec)
                            break
            # dump
            dumpStr = ''
            for siteName,storageTypeFile in returnMap.iteritems():
                dumpStr += '{0}:('.format(siteName)
                for storageType,fileList in storageTypeFile.iteritems():
                    dumpStr += '{0}:{1},'.format(storageType,len(fileList))
                dumpStr = dumpStr[:-1]
                dumpStr += ') '
            dumpStr = dumpStr[:-1]
            tmpLog.debug(dumpStr)
            # return
            tmpLog.info('done')
            return self.SC_SUCCEEDED,returnMap
        except:
            errtype,errvalue = sys.exc_info()[:2]
            errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
            tmpLog.error(errMsg)
            return self.SC_FAILED,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
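# The endpoint patterns above mix literal names with '*' wildcards; wildcards
# are translated to regular expressions and anchored at both ends before
# matching. A minimal sketch of that matching, with made-up endpoint names:

import re

def expand_endpoint_patterns(patterns, known_endpoints):
    matched = []
    for patt in patterns:
        if '*' in patt:
            # wildcard: '*' becomes '.*', anchored with ^ and $ as above
            regex = '^' + patt.replace('*', '.*') + '$'
            matched += [ep for ep in known_endpoints
                        if re.search(regex, ep) is not None and ep not in matched]
        elif patt in known_endpoints and patt not in matched:
            # literal endpoint name
            matched.append(patt)
    return matched

# e.g. expand_endpoint_patterns(['SITE_*DISK'], ['SITE_DATADISK', 'SITE_TAPE'])
# -> ['SITE_DATADISK']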
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskList = self.taskList.get(nTasks) # no more datasets if len(taskList) == 0: self.logger.debug( '{0} terminating since no more items'.format( self.__class__.__name__)) return # loop over all tasks for jediTaskID, commandMap in taskList: # make logger tmpLog = MsgWrapper( self.logger, ' < jediTaskID={0} >'.format(jediTaskID)) commandStr = commandMap['command'] commentStr = commandMap['comment'] oldStatus = commandMap['oldStatus'] tmpLog.info('start for {0}'.format(commandStr)) tmpStat = Interaction.SC_SUCCEEDED if commandStr in ['kill', 'finish', 'reassign']: tmpMsg = 'executing {0}'.format(commandStr) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) # loop twice to see immediate result for iLoop in range(2): # get active PandaIDs to be killed if commandStr == 'reassign' and commentStr is not None and 'soft reassign' in commentStr: pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI( jediTaskID) elif commandStr == 'reassign' and commentStr is not None and 'nokill reassign' in commentStr: pandaIDs = [] else: pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI( jediTaskID, True) if pandaIDs is None: tmpLog.error( 'failed to get PandaIDs for jediTaskID={0}' .format(jediTaskID)) tmpStat = Interaction.SC_FAILED # kill jobs or update task if tmpStat == Interaction.SC_SUCCEEDED: if pandaIDs == []: # done since no active jobs tmpMsg = 'completed cleaning jobs' tmpLog.sendMsg(tmpMsg, self.msgType) tmpLog.info(tmpMsg) tmpTaskSpec = JediTaskSpec() tmpTaskSpec.jediTaskID = jediTaskID updateTaskStatus = True if commandStr != 'reassign': # reset oldStatus # keep oldStatus for task reassignment since it is reset when actually reassigned tmpTaskSpec.forceUpdate('oldStatus') else: # extract cloud or site if commentStr is not None: tmpItems = commentStr.split(':') if tmpItems[0] == 'cloud': tmpTaskSpec.cloud = tmpItems[1] elif tmpItems[0] == 'nucleus': tmpTaskSpec.nucleus = tmpItems[ 1] else: tmpTaskSpec.site = tmpItems[1] tmpMsg = 'set {0}={1}'.format( tmpItems[0], tmpItems[1]) tmpLog.sendMsg( tmpMsg, self.msgType) tmpLog.info(tmpMsg) # back to oldStatus if necessary if tmpItems[2] == 'y': tmpTaskSpec.status = oldStatus tmpTaskSpec.forceUpdate( 'oldStatus') updateTaskStatus = False if commandStr == 'reassign': tmpTaskSpec.forceUpdate('errorDialog') if commandStr == 'finish': # update datasets tmpLog.info( 'updating datasets to finish') tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI( jediTaskID, self.pid) if not tmpStat: tmpLog.info( 'wait until datasets are updated to finish' ) # ignore failGoalUnreached when manually finished tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI( jediTaskID) tmpTaskSpec.splitRule = taskSpec.splitRule tmpTaskSpec.unsetFailGoalUnreached() if updateTaskStatus: tmpTaskSpec.status = JediTaskSpec.commandStatusMap( )[commandStr]['done'] tmpMsg = 'set task_status={0}'.format( tmpTaskSpec.status) tmpLog.sendMsg(tmpMsg, self.msgType) tmpLog.info(tmpMsg) tmpRet = self.taskBufferIF.updateTask_JEDI( tmpTaskSpec, {'jediTaskID': jediTaskID}, setOldModTime=True) tmpLog.info('done with {0}'.format( str(tmpRet))) break else: # kill only in the first loop if iLoop > 0: break # wait or kill jobs if commentStr and 'soft finish' in commentStr: queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI( jediTaskID) tmpMsg = "trying to kill {0} queued jobs for soft finish".format( len(queuedPandaIDs)) tmpLog.info(tmpMsg) tmpRet = self.taskBufferIF.killJobs( queuedPandaIDs, commentStr, 
'52', True) tmpMsg = "wating {0} jobs for soft finish".format( len(pandaIDs)) tmpLog.info(tmpMsg) tmpRet = True tmpLog.info('done with {0}'.format( str(tmpRet))) break else: tmpMsg = "trying to kill {0} jobs".format( len(pandaIDs)) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) if commandStr in ['finish']: # force kill tmpRet = self.taskBufferIF.killJobs( pandaIDs, commentStr, '52', True) elif commandStr in ['reassign']: # force kill tmpRet = self.taskBufferIF.killJobs( pandaIDs, commentStr, '51', True) else: # normal kill tmpRet = self.taskBufferIF.killJobs( pandaIDs, commentStr, '50', True) tmpLog.info('done with {0}'.format( str(tmpRet))) elif commandStr in ['retry', 'incexec']: tmpMsg = 'executing {0}'.format(commandStr) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) # change task params for incexec if commandStr == 'incexec': try: # read task params taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI( jediTaskID) taskParamMap = RefinerUtils.decodeJSON( taskParam) # remove some params for newKey in ['nFiles', 'fixedSandbox']: try: del taskParamMap[newKey] except Exception: pass # convert new params newParamMap = RefinerUtils.decodeJSON( commentStr) # change params for newKey, newVal in iteritems(newParamMap): if newVal is None: # delete if newKey in taskParamMap: del taskParamMap[newKey] else: # change taskParamMap[newKey] = newVal # overwrite sandbox if 'fixedSandbox' in taskParamMap: # noBuild for tmpParam in taskParamMap[ 'jobParameters']: if tmpParam[ 'type'] == 'constant' and re.search( '^-a [^ ]+$', tmpParam['value'] ) is not None: tmpParam['value'] = '-a {0}'.format( taskParamMap['fixedSandbox']) # build if 'buildSpec' in taskParamMap: taskParamMap['buildSpec'][ 'archiveName'] = taskParamMap[ 'fixedSandbox'] # merge if 'mergeSpec' in taskParamMap: taskParamMap['mergeSpec']['jobParameters'] = \ re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters']) # encode new param strTaskParams = RefinerUtils.encodeJSON( taskParamMap) tmpRet = self.taskBufferIF.updateTaskParams_JEDI( jediTaskID, strTaskParams) if tmpRet is not True: tmpLog.error( 'failed to update task params') continue except Exception as e: tmpLog.error( 'failed to change task params with {} {}'. format(str(e), traceback.format_exc())) continue # retry child tasks if 'sole ' in commentStr: retryChildTasks = False else: retryChildTasks = True # discard events if 'discard ' in commentStr: discardEvents = True else: discardEvents = False # release un-staged files if 'staged ' in commentStr: releaseUnstaged = True else: releaseUnstaged = False tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI( jediTaskID, commandStr, retryChildTasks=retryChildTasks, discardEvents=discardEvents, release_unstaged=releaseUnstaged) if tmpRet is True: tmpMsg = 'set task_status={0}'.format( newTaskStatus) tmpLog.sendMsg(tmpMsg, self.msgType) tmpLog.info(tmpMsg) tmpLog.info('done with {0}'.format(tmpRet)) else: tmpLog.error('unknown command') except Exception as e: errStr = '{} failed in runImpl() with {} {} '.format( self.__class__.__name__, str(e), traceback.format_exc()) logger.error(errStr)
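# The command processor above chooses the job-kill code from the task command:
# '52' when finishing, '51' when reassigning, '50' for a plain kill, with the
# same trailing flag in every call. A minimal sketch of that dispatch,
# assuming a killJobs-like callable with the argument order used above:

KILL_CODE_BY_COMMAND = {'finish': '52', 'reassign': '51', 'kill': '50'}

def kill_for_command(kill_jobs, panda_ids, command, comment):
    # falling back to the plain-kill code for unknown commands is an
    # assumption of this sketch, not behaviour taken from the code above
    code = KILL_CODE_BY_COMMAND.get(command, '50')
    return kill_jobs(panda_ids, comment, code, True)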
def runImpl(self): # cutoff for disk in TB diskThreshold = self.taskBufferIF.getConfigValue(self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi', 'atlas') if diskThreshold is None: diskThreshold = 100 * 1024 # dataset type to ignore file availability check datasetTypeToSkipCheck = ['log'] # thresholds for data availability check thrInputSize = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas') if thrInputSize is None: thrInputSize = 1 thrInputSize *= 1024*1024*1024 thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas') if thrInputNum is None: thrInputNum = 100 thrInputSizeFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas') if thrInputSizeFrac is None: thrInputSizeFrac = 10 thrInputSizeFrac = float(thrInputSizeFrac) / 100 thrInputNumFrac = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas') if thrInputNumFrac is None: thrInputNumFrac = 10 thrInputNumFrac = float(thrInputNumFrac) / 100 cutOffRW = 50 negWeightTape = 0.001 # main lastJediTaskID = None siteMapper = self.taskBufferIF.getSiteMapper() while True: try: taskInputList = self.inputList.get(1) # no more datasets if len(taskInputList) == 0: self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__, self.numTasks)) return # loop over all tasks for taskSpec,inputChunk in taskInputList: lastJediTaskID = taskSpec.jediTaskID # make logger tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='jediTaskID={0}'.format(taskSpec.jediTaskID)) tmpLog.debug('start') tmpLog.info('thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}'.format(thrInputSize, thrInputNum, thrInputSizeFrac, thrInputNumFrac)) # RW taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID) # get nuclei nucleusList = siteMapper.nuclei if taskSpec.nucleus in nucleusList: candidateNucleus = taskSpec.nucleus else: tmpLog.info('got {0} candidates'.format(len(nucleusList))) ###################################### # check status newNucleusList = {} for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if not tmpNucleusSpec.state in ['ACTIVE']: tmpLog.info(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus, tmpNucleusSpec.state)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info('{0} candidates passed status check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check status of transfer backlog t1Weight = taskSpec.getT1Weight() if t1Weight < 0: tmpLog.info('skip transfer backlog check due to negative T1Weight') else: newNucleusList = {} backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei() for tmpNucleus, tmpNucleusSpec in nucleusList.iteritems(): if tmpNucleus in backlogged_nuclei: tmpLog.info(' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog'. 
format(tmpNucleus)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info('{0} candidates passed transfer backlog check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check endpoint fractionFreeSpace = {} newNucleusList = {} tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output','log']) for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): toSkip = False for tmpDatasetSpec in tmpDatasetSpecList: # ignore distributed datasets if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None: continue # get endpoint with the pattern tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken) if tmpEP == None: tmpLog.info(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus, tmpDatasetSpec.storageToken)) toSkip = True break # check state """ if not tmpEP['state'] in ['ACTIVE']: tmpLog.info(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus, tmpEP['ddm_endpoint_name'], tmpEP['state'])) toSkip = True break """ # check space tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired'] tmpSpaceToUse = 0 if tmpNucleus in self.fullRW: # 0.25GB per cpuTime/corePower/day tmpSpaceToUse = long(self.fullRW[tmpNucleus]/10/24/3600*0.25) if tmpSpaceSize-tmpSpaceToUse < diskThreshold: tmpLog.info(' skip nucleus={0} since disk shortage (free {1} - reserved {2} < thr {3}) at endpoint {4} criteria=-space'.format(tmpNucleus, tmpSpaceSize, tmpSpaceToUse, diskThreshold, tmpEP['ddm_endpoint_name'])) toSkip = True break # keep fraction of free space if not tmpNucleus in fractionFreeSpace: fractionFreeSpace[tmpNucleus] = {'total':0,'free':0} try: tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \ float(fractionFreeSpace[tmpNucleus]['total']) except: tmpOld = None try: tmpNew = float(tmpSpaceSize-tmpSpaceToUse)/float(tmpEP['space_total']) except: tmpNew = None if tmpNew != None and (tmpOld == None or tmpNew < tmpOld): fractionFreeSpace[tmpNucleus] = {'total':tmpEP['space_total'], 'free':tmpSpaceSize-tmpSpaceToUse} if not toSkip: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info('{0} candidates passed endpoint check {1} TB'.format(len(nucleusList),diskThreshold/1024)) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # ability to execute jobs newNucleusList = {} # get all panda sites tmpSiteList = [] for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): tmpSiteList += tmpNucleusSpec.allPandaSites tmpSiteList = list(set(tmpSiteList)) tmpLog.debug('===== start for job check') jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF) tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True, tmpSiteList,tmpLog) tmpLog.debug('===== done for job check') if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error('no sites can run jobs') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue okNuclei = set() for tmpSite in tmpRet: siteSpec = siteMapper.getSite(tmpSite) okNuclei.add(siteSpec.pandasite) for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if tmpNucleus in okNuclei: newNucleusList[tmpNucleus] = tmpNucleusSpec else: 
tmpLog.info(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus)) nucleusList = newNucleusList tmpLog.info('{0} candidates passed job check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # data locality toSkip = False availableData = {} for datasetSpec in inputChunk.getDatasets(): # only for real datasets if datasetSpec.isPseudo(): continue # ignore DBR if DataServiceUtils.isDBR(datasetSpec.datasetName): continue # skip locality check if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck: continue # use deep scan for primary dataset if datasetSpec.isMaster(): deepScan = True else: deepScan = False # get nuclei where data is available tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF, datasetSpec.datasetName, nucleusList.keys(), deepScan) if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) toSkip = True break # sum for tmpNucleus,tmpVals in tmpRet.iteritems(): if not tmpNucleus in availableData: availableData[tmpNucleus] = tmpVals else: availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems()) if toSkip: continue if availableData != {}: newNucleusList = {} # skip if no data skipMsgList = [] for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if len(nucleusList) == 1: tmpLog.info(' disable data locality check for nucleus={0} since no other candidate'.format(tmpNucleus)) newNucleusList[tmpNucleus] = tmpNucleusSpec elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \ availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac: tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus, availableData[tmpNucleus]['ava_size_any'], availableData[tmpNucleus]['tot_size'], thrInputSizeFrac) skipMsgList.append(tmpMsg) elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \ availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac: tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus, availableData[tmpNucleus]['ava_num_any'], availableData[tmpNucleus]['tot_num'], thrInputNumFrac) skipMsgList.append(tmpMsg) else: newNucleusList[tmpNucleus] = tmpNucleusSpec if len(newNucleusList) > 0: nucleusList = newNucleusList for tmpMsg in skipMsgList: tmpLog.info(tmpMsg) else: tmpLog.info(' disable data locality check since no nucleus has input data') tmpLog.info('{0} candidates passed data check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # weight self.prioRW.acquire() nucleusRW = self.prioRW[taskSpec.currentPriority] self.prioRW.release() totalWeight = 0 nucleusweights = [] for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if not tmpNucleus in nucleusRW: nucleusRW[tmpNucleus] = 0 wStr = '1' # with RW if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW: weight = 1 / float(nucleusRW[tmpNucleus]) wStr += '/( RW={0} )'.format(nucleusRW[tmpNucleus]) else: weight = 1 wStr 
+= '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW) # with data if availableData != {}: if availableData[tmpNucleus]['tot_size'] > 0: weight *= float(availableData[tmpNucleus]['ava_size_any']) weight /= float(availableData[tmpNucleus]['tot_size']) wStr += '* ( available_input_size_DISKTAPE={0} )'.format(availableData[tmpNucleus]['ava_size_any']) wStr += '/ ( total_input_size={0} )'.format(availableData[tmpNucleus]['tot_size']) # negative weight for tape if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']: weight *= negWeightTape wStr += '*( weight_TAPE={0} )'.format(negWeightTape) # fraction of free space if tmpNucleus in fractionFreeSpace: try: tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \ float(fractionFreeSpace[tmpNucleus]['total']) weight *= tmpFrac wStr += '*( free_space={0} )/( total_space={1} )'.format(fractionFreeSpace[tmpNucleus]['free'], fractionFreeSpace[tmpNucleus]['total']) except: pass tmpLog.info(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr)) totalWeight += weight nucleusweights.append((tmpNucleus,weight)) tmpLog.info('final {0} candidates'.format(len(nucleusList))) ###################################### # final selection tgtWeight = random.uniform(0,totalWeight) candidateNucleus = None for tmpNucleus,weight in nucleusweights: tgtWeight -= weight if tgtWeight <= 0: candidateNucleus = tmpNucleus break if candidateNucleus == None: candidateNucleus = nucleusweights[-1][0] ###################################### # update nucleusSpec = nucleusList[candidateNucleus] # get output/log datasets tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output','log']) # get destinations retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)} tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap) tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet)) self.sendLogMessage(tmpLog) if tmpRet: tmpMsg = 'set task.status=ready' tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) # update RW table self.prioRW.acquire() for prio,rwMap in self.prioRW.iteritems(): if prio > taskSpec.currentPriority: continue if candidateNucleus in rwMap: rwMap[candidateNucleus] += taskRW else: rwMap[candidateNucleus] = taskRW self.prioRW.release() except: errtype,errvalue = sys.exc_info()[:2] errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue) errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID) errMsg += traceback.format_exc() logger.error(errMsg)
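# The endpoint check above treats free-plus-expired space as usable, reserves
# 0.25 GB per unit of cpuTime/corePower/day for work already assigned to the
# nucleus, and skips the nucleus when the remainder drops below the threshold.
# A minimal sketch of that arithmetic (sizes in GB, default threshold as in
# the broker above):

def endpoint_has_headroom(space_free, space_expired, full_rw, disk_threshold=100 * 1024):
    usable = space_free + space_expired
    # 0.25 GB per cpuTime/corePower/day, as in the broker above
    reserved = int(full_rw / 10 / 24 / 3600 * 0.25) if full_rw else 0
    return usable - reserved >= disk_threshold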
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskDsList = self.taskDsList.get(nTasks) # no more datasets if len(taskDsList) == 0: self.logger.debug("%s terminating since no more items" % self.__class__.__name__) return # loop over all tasks for jediTaskID, dsList in taskDsList: allUpdated = True taskBroken = False taskOnHold = False runningTask = False missingMap = {} # make logger tmpLog = MsgWrapper(self.logger, "<jediTaskID={0}>".format(jediTaskID)) # get task tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID, False, True, None, 10) if not tmpStat or taskSpec == None: tmpLog.error("failed to get taskSpec for jediTaskID={0}".format(jediTaskID)) continue try: # get task parameters taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( "task param conversion from json failed with {0}:{1}".format(errtype.__name__, errvalue) ) taskBroken = True # renaming of parameters if taskParamMap.has_key("nEventsPerInputFile"): taskParamMap["nEventsPerFile"] = taskParamMap["nEventsPerInputFile"] # the number of files per job nFilesPerJob = None if taskParamMap.has_key("nFilesPerJob"): nFilesPerJob = taskParamMap["nFilesPerJob"] # the number of files used by scout nFilesForScout = 0 if nFilesPerJob != None: nFilesForScout = 10 * nFilesPerJob else: nFilesForScout = 10 # load XML if taskSpec.useLoadXML(): try: loadXML = taskParamMap["loadXML"] xmlConfig = ParseJobXML.dom_parser(xmlStr=loadXML) except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error("failed to load XML config with {0}:{1}".format(errtype.__name__, errvalue)) taskBroken = True else: xmlConfig = None # check no wait noWaitParent = False if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None, taskSpec.jediTaskID]: tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid) if tmpStat == "running": noWaitParent = True # loop over all datasets nFilesMaster = 0 if not taskBroken: ddmIF = self.ddmIF.getInterface(taskSpec.vo) origNumFiles = None if taskParamMap.has_key("nFiles"): origNumFiles = taskParamMap["nFiles"] for datasetSpec in dsList: tmpLog.info( "start loop for {0}(id={1})".format(datasetSpec.datasetName, datasetSpec.datasetID) ) # get dataset metadata tmpLog.info("get metadata") gotMetadata = False stateUpdateTime = datetime.datetime.utcnow() try: if not datasetSpec.isPseudo(): tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName) else: # dummy metadata for pseudo dataset tmpMetadata = {"state": "closed"} # set mutable when parent is running and the dataset is open if noWaitParent and tmpMetadata["state"] == "open": # dummy metadata when parent is running tmpMetadata = {"state": "mutable"} gotMetadata = True except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( "{0} failed to get metadata with {1}:{2}".format( self.__class__.__name__, errtype.__name__, errvalue ) ) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = "broken" taskBroken = True # update dataset status self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog) else: # temporary error taskOnHold = True taskSpec.setErrDiag("failed to get metadata for {0}".format(datasetSpec.datasetName)) allUpdated = False else: # get file list specified in task parameters fileList, includePatt, excludePatt = RefinerUtils.extractFileList( taskParamMap, datasetSpec.datasetName ) # get the number of events in metadata if taskParamMap.has_key("getNumEventsInMetadata"): 
getNumEvents = True else: getNumEvents = False # get file list from DDM tmpLog.info("get files") try: useInFilesWithNewAttemptNr = False skipDuplicate = not datasetSpec.useDuplicatedFiles() if not datasetSpec.isPseudo(): if ( fileList != [] and taskParamMap.has_key("useInFilesInContainer") and not datasetSpec.containerName in ["", None] ): # read files from container if file list is specified in task parameters tmpDatasetName = datasetSpec.containerName else: tmpDatasetName = datasetSpec.datasetName tmpRet = ddmIF.getFilesInDataset( tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate ) # remove lost files tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName, tmpRet) if tmpLostFiles != {}: tmpLog.info( "found {0} lost files in {1}".format(len(tmpLostFiles), tmpDatasetName) ) for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems(): tmpLog.info("removed {0}".format(tmpLostLFN)) del tmpRet[tmpListGUID] else: if not taskSpec.useListPFN(): # dummy file list for pseudo dataset tmpRet = { str(uuid.uuid4()): { "lfn": "pseudo_lfn", "scope": None, "filesize": 0, "checksum": None, } } else: # make dummy file list for PFN list if taskParamMap.has_key("nFiles"): nPFN = taskParamMap["nFiles"] else: nPFN = 1 tmpRet = {} for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = { "lfn": "{0:06d}:{1}".format( iPFN, taskParamMap["pfnList"][iPFN].split("/")[-1] ), "scope": None, "filesize": 0, "checksum": None, } except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( "failed to get files due to {0}:{1} {2}".format( self.__class__.__name__, errtype.__name__, errvalue ) ) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = "broken" taskBroken = True # update dataset status self.updateDatasetStatus(datasetSpec, datasetStatus, tmpLog) else: # temporary error taskOnHold = True taskSpec.setErrDiag("failed to get files for {0}".format(datasetSpec.datasetName)) allUpdated = False else: # the number of events per file nEventsPerFile = None nEventsPerJob = None nEventsPerRange = None if (datasetSpec.isMaster() and taskParamMap.has_key("nEventsPerFile")) or ( datasetSpec.isPseudo() and taskParamMap.has_key("nEvents") ): if taskParamMap.has_key("nEventsPerFile"): nEventsPerFile = taskParamMap["nEventsPerFile"] elif datasetSpec.isPseudo() and taskParamMap.has_key("nEvents"): # use nEvents as nEventsPerFile for pseudo input nEventsPerFile = taskParamMap["nEvents"] if taskParamMap.has_key("nEventsPerJob"): nEventsPerJob = taskParamMap["nEventsPerJob"] elif taskParamMap.has_key("nEventsPerRange"): nEventsPerRange = taskParamMap["nEventsPerRange"] # max attempts and first event number maxAttempt = None firstEventNumber = None if datasetSpec.isMaster(): # max attempts if taskSpec.disableAutoRetry(): # disable auto retry maxAttempt = 1 elif taskParamMap.has_key("maxAttempt"): maxAttempt = taskParamMap["maxAttempt"] else: # use default value maxAttempt = 3 # first event number firstEventNumber = 1 + taskSpec.getFirstEventOffset() # nMaxEvents nMaxEvents = None if datasetSpec.isMaster() and taskParamMap.has_key("nEvents"): nMaxEvents = taskParamMap["nEvents"] # nMaxFiles nMaxFiles = None if taskParamMap.has_key("nFiles"): if datasetSpec.isMaster(): nMaxFiles = taskParamMap["nFiles"] else: # calculate for secondary nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles) # multiplied by the number of jobs per file for event-level splitting if nMaxFiles != None and taskParamMap.has_key("nEventsPerFile"): if taskParamMap.has_key("nEventsPerJob"): if taskParamMap["nEventsPerFile"] > 
taskParamMap["nEventsPerJob"]: nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float( taskParamMap["nEventsPerJob"] ) nMaxFiles = int(math.ceil(nMaxFiles)) elif taskParamMap.has_key("nEventsPerRange"): if taskParamMap["nEventsPerFile"] > taskParamMap["nEventsPerRange"]: nMaxFiles *= float(taskParamMap["nEventsPerFile"]) / float( taskParamMap["nEventsPerRange"] ) nMaxFiles = int(math.ceil(nMaxFiles)) # use scout useScout = False if datasetSpec.isMaster() and taskSpec.useScout(): useScout = True # use files with new attempt numbers useFilesWithNewAttemptNr = False if ( not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key("useInFilesWithNewAttemptNr") ): useFilesWithNewAttemptNr = True # feed files to the contents table tmpLog.info("update contents") retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI( datasetSpec, tmpRet, tmpMetadata["state"], stateUpdateTime, nEventsPerFile, nEventsPerJob, maxAttempt, firstEventNumber, nMaxFiles, nMaxEvents, useScout, fileList, useFilesWithNewAttemptNr, nFilesPerJob, nEventsPerRange, nFilesForScout, includePatt, excludePatt, xmlConfig, noWaitParent, taskSpec.parent_tid, ) if retDB == False: taskSpec.setErrDiag( "failed to insert files for {0}. {1}".format( datasetSpec.datasetName, diagMap["errMsg"] ) ) allUpdated = False taskBroken = True break elif retDB == None: # the dataset is locked by another or status is not applicable allUpdated = False elif missingFileList != []: # files are missing tmpErrStr = "{0} files missing in {1}".format( len(missingFileList), datasetSpec.datasetName ) tmpLog.info(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) allUpdated = False taskOnHold = True missingMap[datasetSpec.datasetName] = { "datasetSpec": datasetSpec, "missingFiles": missingFileList, } else: # reduce the number of files to be read if taskParamMap.has_key("nFiles"): if datasetSpec.isMaster(): taskParamMap["nFiles"] -= nFilesUnique # reduce the number of files for scout if useScout: nFilesForScout = diagMap["nFilesForScout"] # number of master input files if datasetSpec.isMaster(): nFilesMaster += nFilesUnique # running task if diagMap["isRunningTask"]: runningTask = True # no activated pending input for noWait if noWaitParent and diagMap["nActivatedPending"] == 0: tmpErrStr = "insufficient inputs are ready" tmpLog.info(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) taskOnHold = True tmpLog.info("end loop") # no master input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0: tmpErrStr = "no master input files. 
input dataset is empty" tmpLog.error(tmpErrStr) taskSpec.setErrDiag(tmpErrStr, None) if taskSpec.allowEmptyInput() or noWaitParent: taskOnHold = True else: taskBroken = True # update task status if taskBroken: # task is broken taskSpec.status = "tobroken" tmpMsg = "set task.status={0}".format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec) # change task status unless the task is running if not runningTask: if taskOnHold: if not noWaitParent: # initialize task generator taskGenerator = TaskGenerator(taskSpec.vo, taskSpec.prodSourceLabel) tmpStat = taskGenerator.initializeMods( self.taskBufferIF, self.ddmIF.getInterface(taskSpec.vo) ) if not tmpStat: tmpErrStr = "failed to initialize TaskGenerator" tmpLog.error(tmpErrStr) taskSpec.status = "tobroken" taskSpec.setErrDiag(tmpErrStr) else: # make parent tasks if necessary tmpLog.info( "make parent tasks with {0} (if necessary)".format( taskGenerator.getClassName(taskSpec.vo, taskSpec.prodSourceLabel) ) ) tmpStat = taskGenerator.doGenerate( taskSpec, taskParamMap, missingFilesMap=missingMap ) if tmpStat == Interaction.SC_FATAL: # failed to make parent tasks taskSpec.status = "tobroken" tmpLog.error("failed to make parent tasks") # go to pending state if not taskSpec.status in ["broken", "tobroken"]: taskSpec.setOnHold() tmpMsg = "set task.status={0}".format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID, taskSpec) elif allUpdated: # all OK allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, getTaskStatus=True ) tmpMsg = "set task.status={0}".format(newTaskStatus) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) tmpLog.info("done") except: errtype, errvalue = sys.exc_info()[:2] logger.error( "{0} failed in runImpl() with {1}:{2}".format(self.__class__.__name__, errtype.__name__, errvalue) )
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskList = self.taskList.get(nTasks) # no more datasets if len(taskList) == 0: self.logger.info('{0} terminating since no more items'.format(self.__class__.__name__)) return # loop over all tasks for jediTaskID,splitRule,taskStatus,parent_tid in taskList: # make logger tmpLog = MsgWrapper(self.logger,'< jediTaskID={0} >'.format(jediTaskID)) tmpLog.debug('start') tmpStat = Interaction.SC_SUCCEEDED errStr = '' # read task parameters try: taskParam = None taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except: errtype,errvalue = sys.exc_info()[:2] errStr = 'conversion to map from json failed with {0}:{1}'.format(errtype.__name__,errvalue) tmpLog.debug(taskParam) tmpLog.error(errStr) tmpStat = Interaction.SC_FAILED continue # get impl if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('getting Impl') try: # get VO and sourceLabel vo = taskParamMap['vo'] prodSourceLabel = taskParamMap['prodSourceLabel'] taskType = taskParamMap['taskType'] tmpLog.info('vo={0} sourceLabel={1} taskType={2}'.format(vo,prodSourceLabel,taskType)) # get impl impl = self.implFactory.instantiateImpl(vo,prodSourceLabel,taskType, self.taskBufferIF,self.ddmIF) if impl == None: # task refiner is undefined errStr = 'task refiner is undefined for vo={0} sourceLabel={1}'.format(vo,prodSourceLabel) tmpLog.error(errStr) tmpStat = Interaction.SC_FAILED except: errtype,errvalue = sys.exc_info()[:2] errStr = 'failed to get task refiner with {0}:{1}'.format(errtype.__name__,errvalue) tmpLog.error(errStr) tmpStat = Interaction.SC_FAILED # extract common parameters if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('extracting common') try: # initialize impl impl.initializeRefiner(tmpLog) impl.oldTaskStatus = taskStatus # extract common parameters impl.extractCommon(jediTaskID, taskParamMap, self.workQueueMapper, splitRule) # set parent tid if not parent_tid in [None,jediTaskID]: impl.taskSpec.parent_tid = parent_tid except: errtype,errvalue = sys.exc_info()[:2] errStr = 'failed to extract common parameters with {0}:{1} {2}'.format(errtype.__name__,errvalue, traceback.format_exc()) tmpLog.error(errStr) tmpStat = Interaction.SC_FAILED # check attribute length if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('checking attribute length') if not impl.taskSpec.checkAttrLength(): tmpLog.error(impl.taskSpec.errorDialog) tmpStat = Interaction.SC_FAILED # staging if tmpStat == Interaction.SC_SUCCEEDED: if 'toStaging' in taskParamMap and taskStatus != 'staged': errStr = 'wait until staging is done' impl.taskSpec.status = 'staging' impl.taskSpec.oldStatus = taskStatus impl.taskSpec.setErrDiag(errStr) # not to update some task attributes impl.taskSpec.resetRefinedAttrs() tmpLog.info(errStr) self.taskBufferIF.updateTask_JEDI(impl.taskSpec, {'jediTaskID':impl.taskSpec.jediTaskID}, oldStatus=[taskStatus], updateDEFT=False, setFrozenTime=False) continue # check parent noWaitParent = False parentState = None if tmpStat == Interaction.SC_SUCCEEDED: if parent_tid not in [None,jediTaskID]: tmpLog.info('check parent task') try: tmpStat = self.taskBufferIF.checkParentTask_JEDI(parent_tid) parentState = tmpStat if tmpStat == 'completed': # parent is done tmpStat = Interaction.SC_SUCCEEDED elif tmpStat == 'running': if not impl.taskSpec.noWaitParent(): # parent is running errStr = 'pending until parent task {0} is done'.format(parent_tid) impl.taskSpec.status = taskStatus impl.taskSpec.setOnHold() 
impl.taskSpec.setErrDiag(errStr) # not to update some task attributes impl.taskSpec.resetRefinedAttrs() tmpLog.info(errStr) self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID}, oldStatus=[taskStatus],setFrozenTime=False) continue else: # not wait for parent tmpStat = Interaction.SC_SUCCEEDED noWaitParent = True else: # parent is corrupted tmpStat = Interaction.SC_FAILED tmpErrStr = 'parent task {0} failed to complete'.format(parent_tid) impl.taskSpec.setErrDiag(tmpErrStr) except: errtype,errvalue = sys.exc_info()[:2] errStr = 'failed to check parent task with {0}:{1}'.format(errtype.__name__,errvalue) tmpLog.error(errStr) tmpStat = Interaction.SC_FAILED # refine if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('refining with {0}'.format(impl.__class__.__name__)) try: tmpStat = impl.doRefine(jediTaskID,taskParamMap) except: errtype,errvalue = sys.exc_info()[:2] # wait unknown input if noWaitParent or waitInput if ((impl.taskSpec.noWaitParent() or impl.taskSpec.waitInput()) \ and errtype == JediException.UnknownDatasetError) or parentState == 'running' \ or errtype == Interaction.JEDITemporaryError: if impl.taskSpec.noWaitParent() or parentState == 'running': tmpErrStr = 'pending until parent produces input' setFrozenTime=False elif errtype == Interaction.JEDITemporaryError: tmpErrStr = 'pending due to DDM problem. {0}'.format(errvalue) setFrozenTime=True else: tmpErrStr = 'pending until input is staged' setFrozenTime=True impl.taskSpec.status = taskStatus impl.taskSpec.setOnHold() impl.taskSpec.setErrDiag(tmpErrStr) # not to update some task attributes impl.taskSpec.resetRefinedAttrs() tmpLog.info(tmpErrStr) self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID}, oldStatus=[taskStatus], insertUnknown=impl.unknownDatasetList, setFrozenTime=setFrozenTime) continue else: errStr = 'failed to refine task with {0}:{1}'.format(errtype.__name__,errvalue) tmpLog.error(errStr) tmpStat = Interaction.SC_FAILED # register if tmpStat != Interaction.SC_SUCCEEDED: tmpLog.error('failed to refine the task') if impl == None or impl.taskSpec == None: tmpTaskSpec = JediTaskSpec() tmpTaskSpec.jediTaskID = jediTaskID else: tmpTaskSpec = impl.taskSpec tmpTaskSpec.status = 'tobroken' if errStr != '': tmpTaskSpec.setErrDiag(errStr,True) self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':tmpTaskSpec.jediTaskID},oldStatus=[taskStatus]) else: tmpLog.info('registering') # fill JEDI tables try: # enable protection against task duplication if taskParamMap.has_key('uniqueTaskName') and taskParamMap['uniqueTaskName'] and \ not impl.taskSpec.checkPreProcessed(): uniqueTaskName = True else: uniqueTaskName = False strTaskParams = None if impl.updatedTaskParams != None: strTaskParams = RefinerUtils.encodeJSON(impl.updatedTaskParams) if taskStatus in ['registered', 'staged']: # unset pre-process flag if impl.taskSpec.checkPreProcessed(): impl.taskSpec.setPostPreProcess() # full registration tmpStat,newTaskStatus = self.taskBufferIF.registerTaskInOneShot_JEDI(jediTaskID,impl.taskSpec, impl.inMasterDatasetSpec, impl.inSecDatasetSpecList, impl.outDatasetSpecList, impl.outputTemplateMap, impl.jobParamsTemplate, strTaskParams, impl.unmergeMasterDatasetSpec, impl.unmergeDatasetSpecMap, uniqueTaskName, taskStatus) if not tmpStat: tmpErrStr = 'failed to register the task to JEDI in a single shot' tmpLog.error(tmpErrStr) impl.taskSpec.status = newTaskStatus impl.taskSpec.setErrDiag(tmpErrStr,True) 
self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID}, oldStatus=[taskStatus]) tmpMsg = 'set task_status={0}'.format(newTaskStatus) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) else: # disable scouts if previous attempt didn't use it if not impl.taskSpec.useScout(splitRule): impl.taskSpec.setUseScout(False) # disallow resetting some attributes for attName in ['ramCount', 'walltime', 'cpuTime', 'startTime']: impl.taskSpec.resetChangedAttr(attName) # update task with new params self.taskBufferIF.updateTask_JEDI(impl.taskSpec,{'jediTaskID':impl.taskSpec.jediTaskID}, oldStatus=[taskStatus]) # append datasets for incremental execution tmpStat = self.taskBufferIF.appendDatasets_JEDI(jediTaskID,impl.inMasterDatasetSpec, impl.inSecDatasetSpecList) if not tmpStat: tmpLog.error('failed to append datasets for incexec') except: errtype,errvalue = sys.exc_info()[:2] tmpErrStr = 'failed to register the task to JEDI with {0}:{1}'.format(errtype.__name__,errvalue) tmpLog.error(tmpErrStr) else: tmpLog.info('done') except: errtype,errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
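# --- illustrative sketch (not part of the original source) ---
# Task parameters travel through the refiner as a JSON string, decoded and
# re-encoded via RefinerUtils.decodeJSON/encodeJSON above. Assuming those are
# thin wrappers around the standard json module, the round-trip looks like:
import json

task_param_str = '{"vo": "atlas", "prodSourceLabel": "managed", "taskType": "prod"}'
task_param_map = json.loads(task_param_str)          # decode: str -> dict
task_param_map['nFilesPerJob'] = 5                   # refine a parameter in place
updated_task_param_str = json.dumps(task_param_map)  # encode: dict -> str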
def findLostFiles(self,datasetName,fileMap): methodName = 'findLostFiles' methodName += ' <datasetName={0}>'.format(datasetName) tmpLog = MsgWrapper(logger,methodName) tmpLog.info('start') try: # get replicas tmpStat,tmpOut = self.listDatasetReplicas(datasetName) if tmpStat != self.SC_SUCCEEDED: tmpLog.error('failed to get dataset replicas with {0}'.format(tmpOut)) raise tmpStat,tmpOut # check if complete replica is available hasCompReplica = False datasetReplicaMap = tmpOut for tmpEndPoint in datasetReplicaMap.keys(): if datasetReplicaMap[tmpEndPoint][-1]['found'] != None and \ datasetReplicaMap[tmpEndPoint][-1]['total'] == datasetReplicaMap[tmpEndPoint][-1]['found']: hasCompReplica = True break # no lost files if hasCompReplica: tmpLog.info('done with no lost files') return self.SC_SUCCEEDED,{} # get LFNs and scopes lfnMap = {} scopeMap = {} for tmpGUID in fileMap.keys(): tmpLFN = fileMap[tmpGUID]['lfn'] lfnMap[tmpGUID] = tmpLFN scopeMap[tmpLFN] = fileMap[tmpGUID]['scope'] # get LFC and SE lfcSeMap = {} for tmpEndPoint in datasetReplicaMap.keys(): # get LFC lfc = TiersOfATLAS.getLocalCatalog(tmpEndPoint) # add map if not lfcSeMap.has_key(lfc): lfcSeMap[lfc] = [] # get SE seStr = TiersOfATLAS.getSiteProperty(tmpEndPoint, 'srm') tmpMatch = re.search('://([^:/]+):*\d*/',seStr) if tmpMatch != None: se = tmpMatch.group(1) if not se in lfcSeMap[lfc]: lfcSeMap[lfc].append(se) # get SURLs for lfcHost,seList in lfcSeMap.iteritems(): tmpStat,tmpRetMap = self.getSURLsFromLFC(lfnMap,lfcHost,seList,scopes=scopeMap) if tmpStat != self.SC_SUCCEEDED: tmpLog.error('failed to get SURLs with {0}'.format(tmpRetMap)) raise tmpStat,tmpRetMap # look for missing files newLfnMap = {} for tmpGUID,tmpLFN in lfnMap.iteritems(): if not tmpLFN in tmpRetMap: newLfnMap[tmpGUID] = tmpLFN lfnMap = newLfnMap tmpLog.info('done with lost '+','.join(str(tmpLFN) for tmpLFN in lfnMap.values())) return self.SC_SUCCEEDED,lfnMap except: errtype,errvalue = sys.exc_info()[:2] errCode = self.checkError(errtype) errMsg = '{0} {1}'.format(errtype.__name__,errvalue) tmpLog.error(errMsg) return errCode,'{0} : {1}'.format(methodName,errMsg)
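# --- illustrative sketch (not part of the original source) ---
# findLostFiles extracts the SE hostname from each endpoint's srm string with
# re.search('://([^:/]+):*\d*/', seStr). A quick standalone check of that
# pattern; the endpoint value is invented for illustration.
import re

seStr = 'srm://srm-atlas.example.org:8443/srm/managerv2?SFN=/pnfs/'
tmpMatch = re.search(r'://([^:/]+):*\d*/', seStr)
if tmpMatch is not None:
    se = tmpMatch.group(1)   # -> 'srm-atlas.example.org'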
def getLatestDBRelease(self): methodName = 'getLatestDBRelease' tmpLog = MsgWrapper(logger,methodName) tmpLog.info('trying to get the latest version number of DBR') # get ddo datasets tmpStat,ddoDatasets = self.listDatasets('ddo.*') if tmpStat != self.SC_SUCCEEDED or ddoDatasets == []: tmpLog.error('failed to get a list of DBRelease datasets from DQ2') return self.SC_FAILED,None # reverse sort to avoid redundant lookup ddoDatasets.sort() ddoDatasets.reverse() # extract version number latestVerMajor = 0 latestVerMinor = 0 latestVerBuild = 0 latestVerRev = 0 latestDBR = '' for tmpName in ddoDatasets: # ignore CDRelease if ".CDRelease." in tmpName: continue # ignore user if tmpName.startswith('ddo.user'): continue # use Atlas.Ideal if not ".Atlas.Ideal." in tmpName: continue match = re.search('\.v(\d+)(_*[^\.]*)$',tmpName) if match == None: tmpLog.warning('cannot extract version number from %s' % tmpName) continue # ignore special DBRs if match.group(2) != '': continue # get major,minor,build,revision numbers tmpVerStr = match.group(1) tmpVerMajor = 0 tmpVerMinor = 0 tmpVerBuild = 0 tmpVerRev = 0 try: tmpVerMajor = int(tmpVerStr[0:2]) except: pass try: tmpVerMinor = int(tmpVerStr[2:4]) except: pass try: tmpVerBuild = int(tmpVerStr[4:6]) except: pass try: tmpVerRev = int(tmpVerStr[6:]) # use only three digit DBR continue except: pass # compare if latestVerMajor > tmpVerMajor: continue elif latestVerMajor == tmpVerMajor: if latestVerMinor > tmpVerMinor: continue elif latestVerMinor == tmpVerMinor: if latestVerBuild > tmpVerBuild: continue elif latestVerBuild == tmpVerBuild: if latestVerRev > tmpVerRev: continue # check if well replicated tmpStat,ddoReplicas = self.listDatasetReplicas(tmpName) if len(ddoReplicas) < 10: continue # higher or equal version latestVerMajor = tmpVerMajor latestVerMinor = tmpVerMinor latestVerBuild = tmpVerBuild latestVerRev = tmpVerRev latestDBR = tmpName # failed if latestDBR == '': tmpLog.error('failed to get the latest version of DBRelease dataset from DQ2') return self.SC_FAILED,None tmpLog.info('use {0}'.format(latestDBR)) return self.SC_SUCCEEDED,latestDBR
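# --- illustrative sketch (not part of the original source) ---
# getLatestDBRelease slices the v-number of a DBRelease dataset into two-digit
# major/minor/build fields plus a revision tail. Standalone sketch of that
# slicing; the dataset name is invented for illustration.
import re

tmpName = 'ddo.000001.Atlas.Ideal.DBRelease.v310801'
match = re.search(r'\.v(\d+)(_*[^\.]*)$', tmpName)
tmpVerStr = match.group(1)         # '310801'
tmpVerMajor = int(tmpVerStr[0:2])  # 31
tmpVerMinor = int(tmpVerStr[2:4])  # 8
tmpVerBuild = int(tmpVerStr[4:6])  # 1
# tmpVerStr[6:] is '' here, so int() raises and the revision stays 0;
# names with a longer version string are skipped as non-three-digit DBRs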
def start(self): # start base classes JediKnight.start(self) FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF) # go into main loop while True: startTime = datetime.datetime.utcnow() try: # get logger tmpLog = MsgWrapper(logger) tmpLog.info('start') # loop over all vos for vo in self.vos: # loop over all sourceLabels for prodSourceLabel in self.prodSourceLabels: # rescue picked files tmpLog.info( 'rescue tasks with picked files for vo={0} label={1}' .format(vo, prodSourceLabel)) tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI( vo, prodSourceLabel, jedi_config.watchdog.waitForPicked) if tmpRet == None: # failed tmpLog.error('failed to rescue') else: tmpLog.info('rescued {0} tasks'.format(tmpRet)) # reactivate pending tasks tmpLog.info( 'reactivate pending tasks for vo={0} label={1}'. format(vo, prodSourceLabel)) tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI( vo, prodSourceLabel, jedi_config.watchdog.waitForPending, jedi_config.watchdog.timeoutForPending) if tmpRet == None: # failed tmpLog.error('failed to reactivate') else: tmpLog.info('reactivated {0} tasks'.format(tmpRet)) # unlock tasks tmpLog.info('unlock tasks for vo={0} label={1}'.format( vo, prodSourceLabel)) tmpRet = self.taskBufferIF.unlockTasks_JEDI( vo, prodSourceLabel, jedi_config.watchdog.waitForLocked) if tmpRet == None: # failed tmpLog.error('failed to unlock') else: tmpLog.info('unlocked {0} tasks'.format(tmpRet)) # restart contents update tmpLog.info( 'restart contents update for vo={0} label={1}'. format(vo, prodSourceLabel)) tmpRet = self.taskBufferIF.restartTasksForContentsUpdate_JEDI( vo, prodSourceLabel) if tmpRet == None: # failed tmpLog.error('failed to restart') else: tmpLog.info('restarted {0} tasks'.format(tmpRet)) # kick exhausted tasks tmpLog.info( 'kick exhausted tasks for vo={0} label={1}'.format( vo, prodSourceLabel)) tmpRet = self.taskBufferIF.kickExhaustedTasks_JEDI( vo, prodSourceLabel, jedi_config.watchdog.waitForExhausted) if tmpRet == None: # failed tmpLog.error('failed to kick') else: tmpLog.info('kicked {0} tasks'.format(tmpRet)) # finish tasks when goal is reached tmpLog.info( 'finish achieved tasks for vo={0} label={1}'. format(vo, prodSourceLabel)) tmpRet = self.taskBufferIF.getAchievedTasks_JEDI( vo, prodSourceLabel, jedi_config.watchdog.waitForAchieved) if tmpRet == None: # failed tmpLog.error('failed to finish') else: for jediTaskID in tmpRet: self.taskBufferIF.sendCommandTaskPanda( jediTaskID, 'JEDI. Goal reached', True, 'finish', comQualifier='soft') tmpLog.info('finished {0} tasks'.format(tmpRet)) # vo/prodSourceLabel specific action impl = self.getImpl(vo, prodSourceLabel) if impl != None: tmpLog.info( 'special action for vo={0} label={1} with {2}'. format(vo, prodSourceLabel, impl.__class__.__name__)) tmpStat = impl.doAction() if tmpStat != Interaction.SC_SUCCEEDED: tmpLog.error( 'failed to run special action for vo={0} label={1}' .format(vo, prodSourceLabel)) else: tmpLog.info('done for vo={0} label={1}'.format( vo, prodSourceLabel)) tmpLog.info('done') except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed in {0}.start() with {1} {2}'.format( self.__class__.__name__, errtype.__name__, errvalue)) # sleep if needed loopCycle = jedi_config.watchdog.loopCycle timeDelta = datetime.datetime.utcnow() - startTime sleepPeriod = loopCycle - timeDelta.seconds if sleepPeriod > 0: time.sleep(sleepPeriod) # randomize cycle self.randomSleep()
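# --- illustrative sketch (not part of the original source) ---
# Each watchdog cycle sleeps for whatever is left of loopCycle. Note that
# timedelta.seconds wraps at one day; total_seconds(), shown here, is safe for
# arbitrarily long cycles. A minimal sketch of the same pacing logic:
import datetime
import time

loopCycle = 60
startTime = datetime.datetime.utcnow()
# ... one cycle of work runs here ...
elapsed = (datetime.datetime.utcnow() - startTime).total_seconds()
sleepPeriod = loopCycle - elapsed
if sleepPeriod > 0:
    time.sleep(sleepPeriod)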
def doGenerate(self, taskSpec, taskParamMap, **varMap): # make logger tmpLog = MsgWrapper(logger, "<jediTaskID={0}>".format(taskSpec.jediTaskID)) tmpLog.info('start taskType={0}'.format(taskSpec.taskType)) tmpLog.info(str(varMap)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # check prodSourceLabel if taskSpec.prodSourceLabel in ['managed', 'test']: # check taskType if taskSpec.taskType == 'recov': # generate parent tasks for lost file recovery if it is not yet generated if 'parentGenerated' in taskParamMap: tmpLog.info( 'skip since already generated parent tasks') else: tmpLog.info( 'generating parent tasks for lost file recovery') # missing files are undefined if 'missingFilesMap' not in varMap: tmpLog.error('missing files are undefined') return retFatal missingFilesMap = varMap['missingFilesMap'] # check datasets for datasetName, datasetValMap in iteritems( missingFilesMap): # the dataset needs to specify a container datasetSpec = datasetValMap['datasetSpec'] if datasetSpec.containerName in ['', None]: errStr = 'cannot make parent tasks due to undefined container for datasetID={0}:{1}'.format( datasetSpec.datasetID, datasetName) taskSpec.setErrDiag(errStr) tmpLog.error(errStr) return retFatal # make parameters for new task newJsonStrList = [] for datasetName, datasetValMap in iteritems( missingFilesMap): datasetSpec = datasetValMap['datasetSpec'] newTaskParamMap = {} newTaskParamMap['oldDatasetName'] = datasetName newTaskParamMap['lostFiles'] = datasetValMap[ 'missingFiles'] newTaskParamMap['vo'] = taskSpec.vo newTaskParamMap['cloud'] = taskSpec.cloud newTaskParamMap[ 'taskPriority'] = taskSpec.taskPriority newTaskParamMap['taskType'] = taskSpec.taskType newTaskParamMap[ 'prodSourceLabel'] = taskSpec.prodSourceLabel logDatasetName = 'panda.jedi{0}.log.{1}'.format( taskSpec.taskType, uuid.uuid4()) newTaskParamMap['log'] = { 'dataset': logDatasetName, 'type': 'template', 'param_type': 'log', 'token': 'ATLASDATADISK', 'value': '{0}.${{SN}}.log.tgz'.format(logDatasetName) } # make new datasetname outDatasetName = datasetName # remove / outDatasetName = re.sub('/$', '', outDatasetName) # remove extension outDatasetName = re.sub( '\.{0}\d+$'.format(taskSpec.taskType), '', outDatasetName) # add extension outDatasetName = outDatasetName + '.{0}{1}'.format( taskSpec.taskType, taskSpec.jediTaskID) newTaskParamMap['output'] = { 'dataset': outDatasetName } if datasetSpec.containerName not in ['', None]: newTaskParamMap['output'][ 'container'] = datasetSpec.containerName # make json jsonStr = json.dumps(newTaskParamMap) newJsonStrList.append(jsonStr) # change original task parameters to not repeat the same procedure and to use newly produced files taskParamMap['parentGenerated'] = True taskParamMap['useInFilesInContainer'] = True taskParamMap['useInFilesWithNewAttemptNr'] = True jsonStr = json.dumps(taskParamMap) # insert and update task parameters sTmp, newJediTaskIDs = self.taskBufferIF.insertUpdateTaskParams_JEDI( taskSpec.jediTaskID, taskSpec.vo, taskSpec.prodSourceLabel, jsonStr, newJsonStrList) if sTmp: tmpLog.info( 'inserted/updated tasks in DB : new jediTaskIDs={0}' .format(str(newJediTaskIDs))) else: tmpLog.error('failed to insert/update tasks in DB') return retFatal # return tmpLog.info('done') return retOK except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('doGenerate failed with {0}:{1}'.format( errtype.__name__, errvalue)) return retFatal
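# --- illustrative sketch (not part of the original source) ---
# doGenerate derives the recovery output name by stripping a trailing slash and
# any previous ".<taskType>NNN" extension before appending the new task ID.
# Standalone sketch of that renaming chain; the dataset name is invented.
import re

taskType, jediTaskID = 'recov', 12345
outDatasetName = 'mc12.data.merge.AOD.recov678/'
outDatasetName = re.sub('/$', '', outDatasetName)                           # drop container slash
outDatasetName = re.sub(r'\.{0}\d+$'.format(taskType), '', outDatasetName)  # drop old extension
outDatasetName += '.{0}{1}'.format(taskType, jediTaskID)                    # 'mc12.data.merge.AOD.recov12345'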
def do_preassign(self): tmp_log = MsgWrapper(logger, 'do_preassign') # refresh self.refresh() # list of resource type resource_type_list = [ rt.resource_name for rt in self.taskBufferIF.load_resource_types() ] # threshold of time duration in seconds that the queue keeps empty to trigger preassignment empty_duration_threshold = 1800 # return map ret_map = { 'to_reassign': {}, } # loop for prod_source_label in self.prodSourceLabelList: # site-rse map site_rse_map = self.get_site_rse_map(prod_source_label) # parameters from GDP config max_preassigned_tasks = self.taskBufferIF.getConfigValue( 'queue_filler', 'MAX_PREASSIGNED_TASKS_{0}'.format(prod_source_label), 'jedi', self.vo) if max_preassigned_tasks is None: max_preassigned_tasks = 3 min_files_ready = self.taskBufferIF.getConfigValue( 'queue_filler', 'MIN_FILES_READY_{0}'.format(prod_source_label), 'jedi', self.vo) if min_files_ready is None: min_files_ready = 50 min_files_remaining = self.taskBufferIF.getConfigValue( 'queue_filler', 'MIN_FILES_REMAINING_{0}'.format(prod_source_label), 'jedi', self.vo) if min_files_remaining is None: min_files_remaining = 100 # load site empty-since map from cache site_empty_since_map_orig = self._get_from_ses_cache() # available sites available_sites_list = self.get_available_sites_list() # now timestamp now_time = datetime.datetime.utcnow() now_time_ts = int(now_time.timestamp()) # update site empty-since map site_empty_since_map = copy.deepcopy(site_empty_since_map_orig) available_site_name_list = [x[0] for x in available_sites_list] for site in site_empty_since_map_orig: # remove sites that are no longer empty if site not in available_site_name_list: del site_empty_since_map[site] for site in available_site_name_list: # add newly found empty sites if site not in site_empty_since_map_orig: site_empty_since_map[site] = now_time_ts self._update_to_ses_cache(site_empty_since_map) # evaluate sites to preassign according to cache # get blacklisted_tasks_map from cache blacklisted_tasks_map = self._get_from_bt_cache() blacklisted_tasks_set = set() for bt_list in blacklisted_tasks_map.values(): blacklisted_tasks_set |= set(bt_list) # loop over available sites to preassign for (site, tmpSiteSpec, n_jobs_to_fill) in available_sites_list: # rses of the available site available_rses = set() try: available_rses.update(set(site_rse_map[site])) except KeyError: tmp_log.debug( 'skipped {site} since no good RSE'.format(site=site)) continue # do not consider TAPE rses for rse in set(available_rses): if 'TAPE' in str(rse): available_rses.remove(rse) # skip if no rse for available site if not available_rses: tmp_log.debug( 'skipped {site} since no available RSE'.format( site=site)) continue # skip if no coreCount set if not tmpSiteSpec.coreCount or not tmpSiteSpec.coreCount > 0: tmp_log.debug( 'skipped {site} since coreCount is not set'.format( site=site)) continue # now timestamp now_time = datetime.datetime.utcnow() now_time_ts = int(now_time.timestamp()) # skip if not empty for long enough if site not in site_empty_since_map: tmp_log.error( 'skipped {site} since not in empty-since map (should not happen)' .format(site=site)) continue empty_duration = now_time_ts - site_empty_since_map[site] tmp_num_slots = tmpSiteSpec.getNumStandby(None, None) if empty_duration < empty_duration_threshold and not tmp_num_slots: tmp_log.debug( 'skipped {site} since not empty for enough time ({ed}s < {edt}s)' .format(site=site, ed=empty_duration, edt=empty_duration_threshold)) continue # only simul tasks if site has fairsharePolicy setup 
processing_type_constraint = '' if tmpSiteSpec.fairsharePolicy not in ('NULL', None): if 'type=simul:0%' in tmpSiteSpec.fairsharePolicy: # skip if zero share of simul tmp_log.debug( 'skipped {site} since with fairshare but zero for simul' .format(site=site)) continue else: processing_type_constraint = "AND t.processingType='simul' " # site attributes site_maxrss = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss not in ( 0, None) else 999999 site_corecount = tmpSiteSpec.coreCount site_capability = str(tmpSiteSpec.capability).lower() # make sql parameters of rses available_rses = list(available_rses) rse_params_list = [] rse_params_map = {} for j, rse in enumerate(available_rses): rse_param = ':rse_{0}'.format(j + 1) rse_params_list.append(rse_param) rse_params_map[rse_param] = rse rse_params_str = ','.join(rse_params_list) # sql sql_query = ( "SELECT t.jediTaskID, t.workQueue_ID " "FROM {jedi_schema}.JEDI_Tasks t " "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL " "AND t.prodSourceLabel=:prodSourceLabel " "AND t.resource_type=:resource_type " "AND site IS NULL " "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss " "AND t.eventService=0 " "AND EXISTS ( " "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl " "WHERE dl.jediTaskID=t.jediTaskID " "AND dl.rse IN ({rse_params_str}) " ") " "{processing_type_constraint} " "AND EXISTS ( " "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d " "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' " "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready " "AND d.nFiles-d.nFilesUsed>=:min_files_remaining " ") " "ORDER BY t.currentPriority DESC " "FOR UPDATE ").format( jedi_schema=jedi_config.db.schemaJEDI, rse_params_str=rse_params_str, processing_type_constraint=processing_type_constraint) # loop over resource type for resource_type in resource_type_list: # key name for preassigned_tasks_map = site + resource_type key_name = '{0}|{1}'.format(site, resource_type) # skip if resource type does not match site capability if site_capability == 'score' and not resource_type.startswith( 'SCORE'): continue elif site_capability == 'mcore' and not resource_type.startswith( 'MCORE'): continue # params map params_map = { ':prodSourceLabel': prod_source_label, ':resource_type': resource_type, ':site_maxrss': site_maxrss, ':site_corecount': site_corecount, ':min_files_ready': min_files_ready, ':min_files_remaining': min_files_remaining, } params_map.update(rse_params_map) # get preassigned_tasks_map from cache preassigned_tasks_map = self._get_from_pt_cache() preassigned_tasks_cached = preassigned_tasks_map.get( key_name, []) # get task_orig_attr_map from cache task_orig_attr_map = self._get_from_attr_cache() # number of tasks already preassigned n_preassigned_tasks = len(preassigned_tasks_cached) # number of tasks to preassign n_tasks_to_preassign = max( max_preassigned_tasks - n_preassigned_tasks, 0) # preassign if n_tasks_to_preassign <= 0: tmp_log.debug( '{key_name:<64} already has enough preassigned tasks ({n_tasks:>3}) ; skipped ' .format(key_name=key_name, n_tasks=n_preassigned_tasks)) elif DRY_RUN: dry_sql_query = ( "SELECT t.jediTaskID, t.workQueue_ID " "FROM {jedi_schema}.JEDI_Tasks t " "WHERE t.status IN ('ready','running') AND t.lockedBy IS NULL " "AND t.prodSourceLabel=:prodSourceLabel " "AND t.resource_type=:resource_type " "AND site IS NULL " "AND (COALESCE(t.baseRamCount, 0) + (CASE WHEN t.ramUnit IN ('MBPerCore','MBPerCoreFixed') THEN 
t.ramCount*:site_corecount ELSE t.ramCount END))*0.95 < :site_maxrss " "AND t.eventService=0 " "AND EXISTS ( " "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl " "WHERE dl.jediTaskID=t.jediTaskID " "AND dl.rse IN ({rse_params_str}) " ") " "{processing_type_constraint} " "AND EXISTS ( " "SELECT d.datasetID FROM {jedi_schema}.JEDI_Datasets d " "WHERE t.jediTaskID=d.jediTaskID AND d.type='input' " "AND d.nFilesToBeUsed-d.nFilesUsed>=:min_files_ready " "AND d.nFiles-d.nFilesUsed>=:min_files_remaining " ") " "ORDER BY t.currentPriority DESC ").format( jedi_schema=jedi_config.db.schemaJEDI, rse_params_str=rse_params_str, processing_type_constraint= processing_type_constraint) # tmp_log.debug('[dry run] {} {}'.format(dry_sql_query, params_map)) res = self.taskBufferIF.querySQL( dry_sql_query, params_map) n_tasks = 0 if res is None else len(res) if n_tasks > 0: result = [ x[0] for x in res if x[0] not in preassigned_tasks_cached ] updated_tasks = result[:n_tasks_to_preassign] tmp_log.debug( '[dry run] {key_name:<64} {n_tasks:>3} tasks would be preassigned ' .format(key_name=key_name, n_tasks=n_tasks_to_preassign)) # update preassigned_tasks_map into cache preassigned_tasks_map[key_name] = list( set(updated_tasks) | set(preassigned_tasks_cached)) tmp_log.debug('{} ; {}'.format( str(updated_tasks), str(preassigned_tasks_map[key_name]))) self._update_to_pt_cache(preassigned_tasks_map) else: updated_tasks_orig_attr = self.taskBufferIF.queryTasksToPreassign_JEDI( sql_query, params_map, site, blacklist=blacklisted_tasks_set, limit=n_tasks_to_preassign) if updated_tasks_orig_attr is None: # dbproxy method failed tmp_log.error( '{key_name:<64} failed to preassign tasks '. format(key_name=key_name)) else: n_tasks = len(updated_tasks_orig_attr) if n_tasks > 0: updated_tasks = [ x[0] for x in updated_tasks_orig_attr ] tmp_log.info( '{key_name:<64} {n_tasks:>3} tasks preassigned : {updated_tasks}' .format(key_name=key_name, n_tasks=str(n_tasks), updated_tasks=updated_tasks)) # update preassigned_tasks_map into cache preassigned_tasks_map[key_name] = list( set(updated_tasks) | set(preassigned_tasks_cached)) self._update_to_pt_cache(preassigned_tasks_map) # update task_orig_attr_map into cache and return map for taskid, orig_attr in updated_tasks_orig_attr: taskid_str = str(taskid) task_orig_attr_map[taskid_str] = orig_attr ret_map['to_reassign'][taskid] = { 'site': site, 'n_jobs_to_fill': n_jobs_to_fill, } self._update_to_attr_cache(task_orig_attr_map) # Kibana log for taskid in updated_tasks: tmp_log.debug( '#ATM #KV jediTaskID={taskid} action=do_preassign site={site} rtype={rtype} preassigned ' .format(taskid=taskid, site=site, rtype=resource_type)) else: tmp_log.debug( '{key_name:<64} found no proper task to preassign' .format(key_name=key_name)) # total preassigned tasks preassigned_tasks_map = self._get_from_pt_cache() n_pt_tot = sum( [len(pt_list) for pt_list in preassigned_tasks_map.values()]) tmp_log.debug('now {n_pt_tot} tasks preassigned in total'.format( n_pt_tot=n_pt_tot)) # return return ret_map
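# --- illustrative sketch (not part of the original source) ---
# do_preassign tracks, per site, the timestamp when the site was first seen
# empty, and only preassigns once that has lasted past the threshold. Minimal
# sketch of the map maintenance with the cache I/O replaced by plain dicts:
import copy
import datetime

def update_empty_since(old_map, available_sites, now_ts):
    new_map = copy.deepcopy(old_map)
    for site in old_map:
        if site not in available_sites:  # site is busy again; forget it
            del new_map[site]
    for site in available_sites:
        if site not in old_map:          # newly empty site; start the clock
            new_map[site] = now_ts
    return new_map

now_ts = int(datetime.datetime.utcnow().timestamp())
ses_map = update_empty_since({'SITE_A': now_ts - 3600}, ['SITE_A', 'SITE_B'], now_ts)
# SITE_A has now been empty 3600s (past the 1800s threshold); SITE_B just started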
def doSetup(self, taskSpec, datasetToRegister, pandaJobs): # make logger tmpLog = MsgWrapper(logger, "< jediTaskID={0} >".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format( taskSpec.prodSourceLabel, taskSpec.taskType)) # returns retFatal = self.SC_FATAL retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo, taskSpec.cloud) # skip if DDM I/F is inactive if not ddmIF: tmpLog.info('skip due to inactive DDM I/F') return retOK # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output', 'log']: if tmpFileSpec.datasetID not in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) # register datasets if datasetToRegister: tmpLog.info('datasetToRegister={0}'.format( str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} ddmBackEnd = 'rucio' for datasetID in datasetToRegister: # get output and log datasets tmpLog.info( 'getting datasetSpec with datasetID={0}'.format( datasetID)) tmpStat, datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI( taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal if datasetSpec.isPseudo(): tmpLog.info('skip pseudo dataset') continue tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [ datasetSpec.datasetName, datasetSpec.containerName ]: if not targetName: continue if targetName in avDatasetList: tmpLog.info( '{0} already registered'.format(targetName)) continue # set lifetime lifetime = None # check dataset/container in DDM tmpList = ddmIF.listDatasets(targetName) if not tmpList: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset tmpLog.info('dest={0}'.format( datasetSpec.destination)) if datasetSpec.destination: if siteMapper.checkSite( datasetSpec.destination): location = siteMapper.getSite( 'BNL_OSG_SPHENIX' ).ddm_output['default'] else: location = datasetSpec.destination if locForRule is None: locForRule = location # set metadata if targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if taskSpec.campaign: metaData['campaign'] = taskSpec.campaign else: metaData = None # register dataset/container tmpLog.info( 'registering {0} with location={1} backend={2} lifetime={3} meta={4}' .format(targetName, location, ddmBackEnd, lifetime, str(metaData))) tmpStat = ddmIF.registerNewDataset( targetName, backEnd=ddmBackEnd, location=location, lifetime=lifetime, metaData=metaData) if not tmpStat: tmpLog.error('failed to register {0}'.format( targetName)) return retFatal # register location if locForRule: """ if taskSpec.workingGroup: userName = taskSpec.workingGroup else: userName = taskSpec.userName """ userName = None activity = None grouping = None tmpLog.info( 'registering location={} lifetime={} days activity={} grouping={} ' 'owner={}'.format(locForRule, lifetime, activity, grouping, userName)) tmpStat = ddmIF.registerDatasetLocation( targetName, locForRule, owner=userName, lifetime=lifetime, backEnd=ddmBackEnd, activity=activity, grouping=grouping) if not tmpStat: tmpLog.error( 'failed to register location {0} for {1}' .format(locForRule, targetName)) return retFatal avDatasetList.append(targetName) # check if dataset is in 
the container if datasetSpec.containerName and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if datasetSpec.containerName not in cnDatasetMap: cnDatasetMap[ datasetSpec. containerName] = ddmIF.listDatasetsInContainer( datasetSpec.containerName) # add dataset if datasetSpec.datasetName not in cnDatasetMap[ datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format( datasetSpec.datasetName, datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer( datasetSpec.containerName, [datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format( datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append( datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format( datasetSpec.datasetName, datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI( datasetSpec, { 'jediTaskID': taskSpec.jediTaskID, 'datasetID': datasetID }) # return tmpLog.info('done') return retOK except Exception as e: errStr = 'doSetup failed with {}'.format(str(e)) tmpLog.error(errStr + traceback.format_exc()) taskSpec.setErrDiag(errStr) return retFatal
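# --- illustrative sketch (not part of the original source) ---
# doSetup caches each container's constituents in cnDatasetMap so DDM is asked
# once per container rather than once per dataset. The pattern, using the same
# ddmIF call names as above, extracted into a hypothetical helper:
cnDatasetMap = {}

def ensure_dataset_in_container(ddmIF, containerName, datasetName, ddmBackEnd='rucio'):
    if containerName not in cnDatasetMap:               # first hit: one catalog lookup
        cnDatasetMap[containerName] = ddmIF.listDatasetsInContainer(containerName)
    if datasetName not in cnDatasetMap[containerName]:  # append only when missing
        ddmIF.addDatasetsToContainer(containerName, [datasetName], backEnd=ddmBackEnd)
        cnDatasetMap[containerName].append(datasetName)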
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskList = self.taskList.get(nTasks) # no more datasets if len(taskList) == 0: self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__)) return # loop over all tasks for jediTaskID,commandMap in taskList: # make logger tmpLog = MsgWrapper(self.logger,' < jediTaskID={0} >'.format(jediTaskID)) commandStr = commandMap['command'] commentStr = commandMap['comment'] oldStatus = commandMap['oldStatus'] tmpLog.info('start for {0}'.format(commandStr)) tmpStat = Interaction.SC_SUCCEEDED if commandStr in ['kill','finish','reassign']: tmpMsg = 'executing {0}'.format(commandStr) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) # loop twice to see immediate result for iLoop in range(2): # get active PandaIDs to be killed if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr: pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID) elif commandStr == 'reassign' and commentStr != None and 'nokill reassign' in commentStr: pandaIDs = [] else: pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True) if pandaIDs == None: tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID)) tmpStat = Interaction.SC_FAILED # kill jobs or update task if tmpStat == Interaction.SC_SUCCEEDED: if pandaIDs == []: # done since no active jobs tmpMsg = 'completed cleaning jobs' tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) tmpTaskSpec = JediTaskSpec() tmpTaskSpec.jediTaskID = jediTaskID updateTaskStatus = True if commandStr != 'reassign': # reset oldStatus # keep oldStatus for task reassignment since it is reset when actually reassigned tmpTaskSpec.forceUpdate('oldStatus') else: # extract cloud or site if commentStr != None: tmpItems = commentStr.split(':') if tmpItems[0] == 'cloud': tmpTaskSpec.cloud = tmpItems[1] elif tmpItems[0] == 'nucleus': tmpTaskSpec.nucleus = tmpItems[1] else: tmpTaskSpec.site = tmpItems[1] tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1]) tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) # back to oldStatus if necessary if tmpItems[2] == 'y': tmpTaskSpec.status = oldStatus tmpTaskSpec.forceUpdate('oldStatus') updateTaskStatus = False if commandStr == 'reassign': tmpTaskSpec.forceUpdate('errorDialog') if commandStr == 'finish': # update datasets tmpLog.info('updating datasets to finish') tmpStat = self.taskBufferIF.updateDatasetsToFinishTask_JEDI(jediTaskID, self.pid) if not tmpStat: tmpLog.info('wait until datasets are updated to finish') # ignore failGoalUnreached when manually finished tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID) tmpTaskSpec.splitRule = taskSpec.splitRule tmpTaskSpec.unsetFailGoalUnreached() if updateTaskStatus: tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done'] tmpMsg = 'set task_status={0}'.format(tmpTaskSpec.status) tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID}, setOldModTime=True) tmpLog.info('done with {0}'.format(str(tmpRet))) break else: # kill only in the first loop if iLoop > 0: break # wait or kill jobs if 'soft finish' in commentStr: queuedPandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID) tmpMsg = "trying to kill {0} queued jobs for soft finish".format(len(queuedPandaIDs)) tmpLog.info(tmpMsg) tmpRet = self.taskBufferIF.killJobs(queuedPandaIDs,commentStr,'52',True) tmpMsg = "waiting for {0} jobs for soft 
finish".format(len(pandaIDs)) tmpLog.info(tmpMsg) tmpRet = True tmpLog.info('done with {0}'.format(str(tmpRet))) break else: tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs)) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) if commandStr in ['finish']: # force kill tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True) elif commandStr in ['reassign']: # force kill tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'51',True) else: # normal kill tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True) tmpLog.info('done with {0}'.format(str(tmpRet))) elif commandStr in ['retry','incexec']: tmpMsg = 'executing {0}'.format(commandStr) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) # change task params for incexec if commandStr == 'incexec': try: # read task params taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) # remove some params for newKey in ['nFiles','fixedSandbox']: try: del taskParamMap[newKey] except: pass # convert new params newParamMap = RefinerUtils.decodeJSON(commentStr) # change params for newKey,newVal in newParamMap.iteritems(): if newVal == None: # delete if newKey in taskParamMap: del taskParamMap[newKey] else: # change taskParamMap[newKey] = newVal # overwrite sandbox if 'fixedSandbox' in taskParamMap: # noBuild for tmpParam in taskParamMap['jobParameters']: if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None: tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox']) # build if taskParamMap.has_key('buildSpec'): taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox'] # merge if taskParamMap.has_key('mergeSpec'): taskParamMap['mergeSpec']['jobParameters'] = \ re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters']) # encode new param strTaskParams = RefinerUtils.encodeJSON(taskParamMap) tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams) if tmpRet != True: tmpLog.error('failed to update task params') continue except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue)) continue # retry child tasks if 'sole ' in commentStr: retryChildTasks = False else: retryChildTasks = True # discard events if 'discard ' in commentStr: discardEvents = True else: discardEvents = False tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr, retryChildTasks=retryChildTasks, discardEvents=discardEvents) if tmpRet == True: tmpMsg = 'set task_status={0}'.format(newTaskStatus) tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) tmpLog.info('done with {0}'.format(tmpRet)) else: tmpLog.error('unknown command') except: errtype,errvalue = sys.exc_info()[:2] errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue) errStr += traceback.format_exc() logger.error(errStr)
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskDsList = self.taskDsList.get(nTasks) # no more datasets if len(taskDsList) == 0: self.logger.debug('%s terminating since no more items' % self.__class__.__name__) return # loop over all tasks for jediTaskID,dsList in taskDsList: allUpdated = True taskBroken = False taskOnHold = False runningTask = False missingMap = {} datasetsIdxConsistency = [] # get task tmpStat,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False,True,self.pid,10) if not tmpStat or taskSpec == None: self.logger.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID)) continue # make logger try: gshare = '_'.join(taskSpec.gshare.split(' ')) except: gshare = 'Undefined' tmpLog = MsgWrapper(self.logger,'<jediTaskID={0} gshare={1}>'.format(jediTaskID, gshare)) try: # get task parameters taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('task param conversion from json failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskBroken = True # renaming of parameters if taskParamMap.has_key('nEventsPerInputFile'): taskParamMap['nEventsPerFile'] = taskParamMap['nEventsPerInputFile'] # the number of files per job nFilesPerJob = taskSpec.getNumFilesPerJob() # the number of chunks used by scout nChunksForScout = 10 # load XML if taskSpec.useLoadXML(): xmlConfig = taskParamMap['loadXML'] else: xmlConfig = None # skip files used by another task if 'skipFilesUsedBy' in taskParamMap: skipFilesUsedBy = taskParamMap['skipFilesUsedBy'] else: skipFilesUsedBy = None # check no wait noWaitParent = False parentOutDatasets = set() if taskSpec.noWaitParent() and not taskSpec.parent_tid in [None,taskSpec.jediTaskID]: tmpStat = self.taskBufferIF.checkParentTask_JEDI(taskSpec.parent_tid) if tmpStat == 'running': noWaitParent = True # get output datasets from parent task tmpParentStat,tmpParentOutDatasets = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.parent_tid, ['output','log']) # collect dataset names for tmpParentOutDataset in tmpParentOutDatasets: parentOutDatasets.add(tmpParentOutDataset.datasetName) # loop over all datasets nFilesMaster = 0 checkedMaster = False setFrozenTime = True if not taskBroken: ddmIF = self.ddmIF.getInterface(taskSpec.vo) origNumFiles = None if taskParamMap.has_key('nFiles'): origNumFiles = taskParamMap['nFiles'] for datasetSpec in dsList: tmpLog.debug('start loop for {0}(id={1})'.format(datasetSpec.datasetName,datasetSpec.datasetID)) # index consistency if datasetSpec.indexConsistent(): datasetsIdxConsistency.append(datasetSpec.datasetID) # get dataset metadata tmpLog.debug('get metadata') gotMetadata = False stateUpdateTime = datetime.datetime.utcnow() try: if not datasetSpec.isPseudo(): tmpMetadata = ddmIF.getDatasetMetaData(datasetSpec.datasetName) else: # dummy metadata for pseudo dataset tmpMetadata = {'state':'closed'} # set mutable when the dataset is open and the parent is running, or the task is configured to run until the dataset is closed if (noWaitParent or taskSpec.runUntilClosed()) and \ (tmpMetadata['state'] == 'open' \ or datasetSpec.datasetName in parentOutDatasets \ or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets): # dummy metadata when parent is running tmpMetadata = {'state':'mutable'} gotMetadata = True except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('{0} failed to get metadata with {1}:{2}'.format(self.__class__.__name__, 
errtype.__name__,errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog) else: if not taskSpec.ignoreMissingInDS(): # temporary error taskOnHold = True else: # ignore missing datasetStatus = 'failed' # update dataset status self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog) taskSpec.setErrDiag('failed to get metadata for {0}'.format(datasetSpec.datasetName)) if not taskSpec.ignoreMissingInDS(): allUpdated = False else: # get file list specified in task parameters fileList,includePatt,excludePatt = RefinerUtils.extractFileList(taskParamMap,datasetSpec.datasetName) # get the number of events in metadata if taskParamMap.has_key('getNumEventsInMetadata'): getNumEvents = True else: getNumEvents = False # get file list from DDM tmpLog.debug('get files') try: useInFilesWithNewAttemptNr = False skipDuplicate = not datasetSpec.useDuplicatedFiles() if not datasetSpec.isPseudo(): if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \ not datasetSpec.containerName in ['',None]: # read files from container if file list is specified in task parameters tmpDatasetName = datasetSpec.containerName else: tmpDatasetName = datasetSpec.datasetName # use long format for LB longFormat = False if taskSpec.respectLumiblock() or taskSpec.orderByLB(): longFormat = True tmpRet = ddmIF.getFilesInDataset(tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate, longFormat=longFormat ) tmpLog.debug('got {0} files in {1}'.format(len(tmpRet),tmpDatasetName)) # remove lost files tmpLostFiles = ddmIF.findLostFiles(tmpDatasetName,tmpRet) if tmpLostFiles != {}: tmpLog.debug('found {0} lost files in {1}'.format(len(tmpLostFiles),tmpDatasetName)) for tmpListGUID,tmpLostLFN in tmpLostFiles.iteritems(): tmpLog.debug('removed {0}'.format(tmpLostLFN)) del tmpRet[tmpListGUID] else: if datasetSpec.isSeqNumber(): # make dummy files for seq_number if datasetSpec.getNumRecords() != None: nPFN = datasetSpec.getNumRecords() elif origNumFiles != None: nPFN = origNumFiles if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \ and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']: nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerJob'] elif taskParamMap.has_key('nEventsPerFile') and taskParamMap.has_key('nEventsPerRange'): nPFN = nPFN * taskParamMap['nEventsPerFile'] / taskParamMap['nEventsPerRange'] elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap: nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerJob'] elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \ and taskSpec.getNumFilesPerJob() is not None: nPFN = taskParamMap['nEvents'] / taskParamMap['nEventsPerFile'] / taskSpec.getNumFilesPerJob() else: # the default number of records for seq_number seqDefNumRecords = 10000 # get nFiles of the master tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI(datasetSpec.jediTaskID, datasetSpec.masterID, ['nFiles']) # use nFiles of the master as the number of records if it is larger than the default if 'nFiles' in tmpMasterAtt and tmpMasterAtt['nFiles'] > seqDefNumRecords: nPFN = tmpMasterAtt['nFiles'] else: nPFN = seqDefNumRecords # check usedBy if skipFilesUsedBy != None: for tmpJediTaskID in str(skipFilesUsedBy).split(','): tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI(tmpJediTaskID, {'datasetName':datasetSpec.datasetName}, ['nFiles']) if 
'nFiles' in tmpParentAtt and tmpParentAtt['nFiles']: nPFN += tmpParentAtt['nFiles'] tmpRet = {} # get offset tmpOffset = datasetSpec.getOffset() tmpOffset += 1 for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = {'lfn':iPFN+tmpOffset, 'scope':None, 'filesize':0, 'checksum':None, } elif not taskSpec.useListPFN(): # dummy file list for pseudo dataset tmpRet = {str(uuid.uuid4()):{'lfn':'pseudo_lfn', 'scope':None, 'filesize':0, 'checksum':None, } } else: # make dummy file list for PFN list if taskParamMap.has_key('nFiles'): nPFN = taskParamMap['nFiles'] else: nPFN = 1 tmpRet = {} for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = {'lfn':'{0:06d}:{1}'.format(iPFN,taskParamMap['pfnList'][iPFN].split('/')[-1]), 'scope':None, 'filesize':0, 'checksum':None, } except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('failed to get files due to {0}:{1} {2}'.format(self.__class__.__name__, errtype.__name__,errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus(datasetSpec,datasetStatus,tmpLog) else: # temporary error taskOnHold = True taskSpec.setErrDiag('failed to get files for {0}'.format(datasetSpec.datasetName)) allUpdated = False else: # parameters for master input respectLB = False useRealNumEvents = False if datasetSpec.isMaster(): # respect LB boundaries respectLB = taskSpec.respectLumiblock() # use real number of events useRealNumEvents = taskSpec.useRealNumEvents() # the number of events per file nEventsPerFile = None nEventsPerJob = None nEventsPerRange = None tgtNumEventsPerJob = None if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \ (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()): if taskParamMap.has_key('nEventsPerFile'): nEventsPerFile = taskParamMap['nEventsPerFile'] elif datasetSpec.isMaster() and datasetSpec.isPseudo() and taskParamMap.has_key('nEvents'): # use nEvents as nEventsPerFile for pseudo input nEventsPerFile = taskParamMap['nEvents'] if taskParamMap.has_key('nEventsPerJob'): nEventsPerJob = taskParamMap['nEventsPerJob'] elif taskParamMap.has_key('nEventsPerRange'): nEventsPerRange = taskParamMap['nEventsPerRange'] if 'tgtNumEventsPerJob' in taskParamMap: tgtNumEventsPerJob = taskParamMap['tgtNumEventsPerJob'] # reset nEventsPerJob nEventsPerJob = None # max attempts maxAttempt = None maxFailure = None if datasetSpec.isMaster() or datasetSpec.toKeepTrack(): # max attempts if taskSpec.disableAutoRetry(): # disable auto retry maxAttempt = 1 elif taskParamMap.has_key('maxAttempt'): maxAttempt = taskParamMap['maxAttempt'] else: # use default value maxAttempt = 3 # max failure if 'maxFailure' in taskParamMap: maxFailure = taskParamMap['maxFailure'] # first event number firstEventNumber = None if datasetSpec.isMaster(): # first event number firstEventNumber = 1 + taskSpec.getFirstEventOffset() # nMaxEvents nMaxEvents = None if datasetSpec.isMaster() and taskParamMap.has_key('nEvents'): nMaxEvents = taskParamMap['nEvents'] # nMaxFiles nMaxFiles = None if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): nMaxFiles = taskParamMap['nFiles'] else: # calculate for secondary nMaxFiles = datasetSpec.getNumMultByRatio(origNumFiles) # multiplied by the number of jobs per file for event-level splitting if nMaxFiles != None and taskParamMap.has_key('nEventsPerFile'): if taskParamMap.has_key('nEventsPerJob'): if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']: nMaxFiles 
*= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerJob']) nMaxFiles = int(math.ceil(nMaxFiles)) elif taskParamMap.has_key('nEventsPerRange'): if taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerRange']: nMaxFiles *= float(taskParamMap['nEventsPerFile'])/float(taskParamMap['nEventsPerRange']) nMaxFiles = int(math.ceil(nMaxFiles)) # use scout useScout = False if datasetSpec.isMaster() and taskSpec.useScout() and (datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()): useScout = True # use files with new attempt numbers useFilesWithNewAttemptNr = False if not datasetSpec.isPseudo() and fileList != [] and taskParamMap.has_key('useInFilesWithNewAttemptNr'): useFilesWithNewAttemptNr = True # ramCount ramCount = 0 # skip short input if datasetSpec.isMaster() and not datasetSpec.isPseudo() \ and nEventsPerFile is not None and nEventsPerJob is not None \ and nEventsPerFile >= nEventsPerJob \ and 'skipShortInput' in taskParamMap and taskParamMap['skipShortInput'] == True: skipShortInput = True else: skipShortInput = False # feed files to the contents table tmpLog.debug('update contents') retDB,missingFileList,nFilesUnique,diagMap = self.taskBufferIF.insertFilesForDataset_JEDI(datasetSpec,tmpRet, tmpMetadata['state'], stateUpdateTime, nEventsPerFile, nEventsPerJob, maxAttempt, firstEventNumber, nMaxFiles, nMaxEvents, useScout, fileList, useFilesWithNewAttemptNr, nFilesPerJob, nEventsPerRange, nChunksForScout, includePatt, excludePatt, xmlConfig, noWaitParent, taskSpec.parent_tid, self.pid, maxFailure, useRealNumEvents, respectLB, tgtNumEventsPerJob, skipFilesUsedBy, ramCount, taskSpec, skipShortInput) if retDB == False: taskSpec.setErrDiag('failed to insert files for {0}. {1}'.format(datasetSpec.datasetName, diagMap['errMsg'])) allUpdated = False taskBroken = True break elif retDB == None: # the dataset is locked by another or status is not applicable allUpdated = False tmpLog.debug('escape since task or dataset is locked') break elif missingFileList != []: # files are missing tmpErrStr = '{0} files missing in {1}'.format(len(missingFileList),datasetSpec.datasetName) tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) allUpdated = False taskOnHold = True missingMap[datasetSpec.datasetName] = {'datasetSpec':datasetSpec, 'missingFiles':missingFileList} else: # reduce the number of files to be read if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): taskParamMap['nFiles'] -= nFilesUnique # reduce the number of files for scout if useScout: nChunksForScout = diagMap['nChunksForScout'] # number of master input files if datasetSpec.isMaster(): checkedMaster = True nFilesMaster += nFilesUnique # running task if diagMap['isRunningTask']: runningTask = True # no activated pending input for noWait if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout <= 0) \ and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster(): tmpErrStr = 'insufficient inputs are ready. ' tmpErrStr += diagMap['errMsg'] tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) taskOnHold = True setFrozenTime = False break tmpLog.debug('end loop') # no master input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster: tmpErrStr = 'no master input files. 
input dataset is empty' tmpLog.error(tmpErrStr) taskSpec.setErrDiag(tmpErrStr,None) if taskSpec.allowEmptyInput() or noWaitParent: taskOnHold = True else: taskBroken = True # index consistency if not taskOnHold and not taskBroken and len(datasetsIdxConsistency) > 0: self.taskBufferIF.removeFilesIndexInconsistent_JEDI(jediTaskID,datasetsIdxConsistency) # update task status if taskBroken: # task is broken taskSpec.status = 'tobroken' tmpMsg = 'set task_status={0}'.format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid) # change task status unless the task is running if not runningTask: if taskOnHold: # go to pending state if not taskSpec.status in ['broken','tobroken']: taskSpec.setOnHold() tmpMsg = 'set task_status={0}'.format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,taskSpec,pid=self.pid,setFrozenTime=setFrozenTime) elif allUpdated: # all OK allRet,newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI(jediTaskID,getTaskStatus=True,pid=self.pid, useWorldCloud=taskSpec.useWorldCloud()) tmpMsg = 'set task_status={0}'.format(newTaskStatus) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid) tmpLog.debug('unlock not-running task with {0}'.format(retUnlock)) else: # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI(jediTaskID,self.pid) tmpLog.debug('unlock task with {0}'.format(retUnlock)) tmpLog.debug('done') except: errtype,errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
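# Minimal standalone sketch of the dummy-record generation used above for seq_number/pseudo
# datasets: records are keyed by a fresh uuid and the lfn runs from offset+1 to offset+n,
# mirroring the iPFN+tmpOffset numbering. The helper name and arguments are illustrative,
# not part of the module.
import uuid

def make_dummy_records(n_records, offset=0):
    records = {}
    for i in range(n_records):
        records[str(uuid.uuid4())] = {'lfn': i + offset + 1,
                                      'scope': None,
                                      'filesize': 0,
                                      'checksum': None}
    return records

# e.g. five sequence numbers after an offset of 100 -> lfn 101..105
print(sorted(rec['lfn'] for rec in make_dummy_records(5, offset=100).values()))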
def do_for_data_locality(self): tmp_log = MsgWrapper(logger) # refresh self.refresh() # list of resource type # resource_type_list = [ rt.resource_name for rt in self.taskBufferIF.load_resource_types() ] # loop for prod_source_label in self.prodSourceLabelList: # site-rse map and blacklisted rses site_rse_map, blacklisted_rse_set = self.get_site_rse_map_and_blacklisted_rse_set( prod_source_label) tmp_log.debug('Found {0} blacklisted RSEs : {1}'.format( len(blacklisted_rse_set), ','.join(list(blacklisted_rse_set)))) # parameter from GDP config upplimit_ioIntensity = self.taskBufferIF.getConfigValue( 'task_withholder', 'LIMIT_IOINTENSITY_{0}'.format(prod_source_label), 'jedi', self.vo) lowlimit_currentPriority = self.taskBufferIF.getConfigValue( 'task_withholder', 'LIMIT_PRIORITY_{0}'.format(prod_source_label), 'jedi', self.vo) if upplimit_ioIntensity is None: upplimit_ioIntensity = 999999 if lowlimit_currentPriority is None: lowlimit_currentPriority = -999999 upplimit_ioIntensity = max(upplimit_ioIntensity, 100) # get work queue for gshare work_queue_list = self.workQueueMapper.getAlignedQueueList( self.vo, prod_source_label) # loop over work queue for work_queue in work_queue_list: gshare = work_queue.queue_name # get cutoff cutoff = self.taskBufferIF.getConfigValue( 'jobbroker', 'NQUEUELIMITSITE_{}'.format(gshare), 'jedi', self.vo) if not cutoff: cutoff = 20 # busy sites busy_sites_list = self.get_busy_sites(gshare, cutoff) # rses of busy sites busy_rses = set() for site in busy_sites_list: try: busy_rses.update(set(site_rse_map[site])) except KeyError: continue # make sql parameters of rses to_exclude_rses = list(busy_rses | blacklisted_rse_set) rse_params_list = [] rse_params_map = {} for j, rse in enumerate(to_exclude_rses): rse_param = ':rse_{0}'.format(j + 1) rse_params_list.append(rse_param) rse_params_map[rse_param] = rse rse_params_str = ','.join(rse_params_list) # sql sql_query = ( "SELECT t.jediTaskID " "FROM {jedi_schema}.JEDI_Tasks t " "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL " "AND t.gshare=:gshare " "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority " "AND EXISTS ( " "SELECT * FROM {jedi_schema}.JEDI_Datasets d " "WHERE d.jediTaskID=t.jediTaskID " "AND d.type='input' " ") " "AND NOT EXISTS ( " "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl " "WHERE dl.jediTaskID=t.jediTaskID " "AND dl.rse NOT IN ({rse_params_str}) " ") " "FOR UPDATE ").format( jedi_schema=jedi_config.db.schemaJEDI, rse_params_str=rse_params_str) # params map params_map = { ':gshare': gshare, ':ioIntensity': upplimit_ioIntensity, ':currentPriority': lowlimit_currentPriority, } params_map.update(rse_params_map) # pending reason reason = 'no local input data, ioIntensity>={ioIntensity}, currentPriority<{currentPriority},'\ 'nQueue>max({cutOff},nRunning*2) at all sites where the task can run'.format( ioIntensity=upplimit_ioIntensity,currentPriority=lowlimit_currentPriority, cutOff=cutoff) # set pending dry_run = False if dry_run: dry_sql_query = ( "SELECT t.jediTaskID " "FROM {jedi_schema}.JEDI_Tasks t " "WHERE t.status IN ('ready','running','scouting') AND t.lockedBy IS NULL " "AND t.gshare=:gshare " "AND t.ioIntensity>=:ioIntensity AND t.currentPriority<:currentPriority " "AND EXISTS ( " "SELECT * FROM {jedi_schema}.JEDI_Datasets d " "WHERE d.jediTaskID=t.jediTaskID " "AND d.type='input' " ") " "AND NOT EXISTS ( " "SELECT * FROM {jedi_schema}.JEDI_Dataset_Locality dl " "WHERE dl.jediTaskID=t.jediTaskID " "AND dl.rse NOT IN ({rse_params_str}) " ") 
").format(jedi_schema=jedi_config.db.schemaJEDI, rse_params_str=rse_params_str) res = self.taskBufferIF.querySQL(dry_sql_query, params_map) n_tasks = 0 if res is None else len(res) if n_tasks > 0: result = [x[0] for x in res] tmp_log.debug( '[dry run] gshare: {gshare:<16} {n_tasks:>5} tasks would be pending : {result} ; reason="{reason}" ' .format(gshare=gshare, n_tasks=n_tasks, result=result, reason=reason)) else: n_tasks = self.taskBufferIF.queryTasksToBePending_JEDI( sql_query, params_map, reason) if n_tasks is not None and n_tasks > 0: tmp_log.info( 'gshare: {gshare:<16} {n_tasks:>5} tasks got pending ; reason="{reason}" ' .format(gshare=gshare, n_tasks=str(n_tasks), reason=reason))
def doSetup(self,taskSpec,datasetToRegister,pandaJobs): # make logger tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # register datasets if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']: # prod vs anal userSetup = False if taskSpec.prodSourceLabel in ['user']: userSetup = True # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.datasetID not in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} for datasetID in datasetToRegister: # get output and log datasets tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID)) tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal if datasetSpec.isPseudo(): tmpLog.info('skip pseudo dataset') continue # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [datasetSpec.datasetName,datasetSpec.containerName]: if targetName is None: continue if targetName not in avDatasetList: # set lifetime if targetName.startswith('panda'): if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed': lifetime = 365 else: lifetime = 14 else: lifetime = None # check dataset/container in DDM tmpList = ddmIF.listDatasets(targetName) if tmpList == []: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset if datasetSpec.site in ['',None]: if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: locForRule = datasetSpec.destination elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None: location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken) elif taskSpec.cloud is not None: # use T1 SE tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source'] location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) else: tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken)) location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) if locForRule is None: locForRule = location # set metadata if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if taskSpec.campaign not in [None,'']: metaData['campaign'] = taskSpec.campaign if datasetSpec.getTransient() is not None: metaData['transient'] = datasetSpec.getTransient() else: metaData = None # register dataset/container tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName, location, ddmBackEnd, lifetime, 
str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location, lifetime=lifetime,metaData=metaData) if not tmpStat: tmpLog.error('failed to register {0}'.format(targetName)) return retFatal # procedures for user if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: # register location tmpToRegister = False if userSetup and targetName == datasetSpec.datasetName and datasetSpec.site not in ['',None]: if taskSpec.workingGroup: userName = taskSpec.workingGroup else: userName = taskSpec.userName grouping = None tmpToRegister = True elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: userName = None grouping = 'NONE' tmpToRegister = True if tmpToRegister: activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) tmpLog.info('registering location={} lifetime={} days activity={} grouping={} ' 'owner={}'.format(locForRule, lifetime, activity, grouping, userName)) tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName, lifetime=lifetime,backEnd=ddmBackEnd, activity=activity,grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} for {1}'.format(locForRule, targetName)) return retFatal # double copy if userSetup and datasetSpec.type == 'output': if datasetSpec.destination != datasetSpec.site: tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination, datasetSpec.site)) else: second_copy = True try: if taskSpec.site: panda_site = siteMapper.getSite(taskSpec.site) if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall: tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site)) second_copy = False except Exception: second_copy = True if second_copy: locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True' tmpMsg = 'registering double copy ' tmpMsg += 'location="{0}" lifetime={1} days activity={2} for dataset={3}'.format(locForDouble,lifetime, activity,targetName) tmpLog.info(tmpMsg) tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName, lifetime=lifetime,activity=activity, grouping='NONE',weight='freespace', ignore_availability=False) if not tmpStat: tmpLog.error('failed to register double copy location {0} for {1}'.format(locForDouble, targetName)) return retFatal avDatasetList.append(targetName) else: tmpLog.info('{0} already registered'.format(targetName)) # check if dataset is in the container if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if datasetSpec.containerName not in cnDatasetMap: cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName) # add dataset if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID, 
'datasetID':datasetID}) # register ES datasets if taskSpec.registerEsFiles(): targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID) location = None metaData = {} metaData['task_id'] = taskSpec.jediTaskID metaData['hidden'] = True tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName, location, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData, resurrect=True) if not tmpStat: tmpLog.error('failed to register ES dataset {0}'.format(targetName)) return retFatal # register rule location = 'type=DATADISK' activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) grouping = 'NONE' tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location, activity, grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity, grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} with {2} for {1}'.format(location, targetName, activity)) return retFatal # open datasets if taskSpec.prodSourceLabel in ['managed','test']: # get the list of output/log datasets outDatasetList = [] for tmpPandaJob in pandaJobs: for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.destinationDBlock not in outDatasetList: outDatasetList.append(tmpFileSpec.destinationDBlock) # open datasets for outDataset in outDatasetList: tmpLog.info('open {0}'.format(outDataset)) ddmIF.openDataset(outDataset) # unset lifetime ddmIF.setDatasetMetadata(outDataset,'lifetime',None) # return tmpLog.info('done') return retOK except Exception: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal
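# Minimal sketch of the lifetime rule applied when registering datasets in doSetup above;
# dataset_type and source_label stand in for datasetSpec.type and taskSpec.prodSourceLabel
# and are illustrative names, not part of the module.
def dataset_lifetime(target_name, dataset_type, source_label):
    if target_name.startswith('panda'):
        if dataset_type == 'trn_log' and source_label == 'managed':
            return 365  # days: production transformation logs are kept longer
        return 14       # days: other panda.* datasets are short-lived
    return None         # no explicit lifetime for other dataset names

assert dataset_lifetime('panda.jeditaskid.log', 'trn_log', 'managed') == 365
assert dataset_lifetime('panda.jeditaskid.log', 'trn_log', 'user') == 14
assert dataset_lifetime('mc16.simul.HITS', 'output', 'managed') is None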
def runImpl(self): # cutoff for disk in TB diskThreshold = 5 * 1024 # dataset type to ignore file availability check datasetTypeToSkipCheck = ['log'] thrInputSize = 1024*1024*1024 thrInputNum = 100 thrInputSizeFrac = 0.1 thrInputNumFrac = 0.1 cutOffRW = 50 negWeightTape = 0.001 # main lastJediTaskID = None siteMapper = self.taskBufferIF.getSiteMapper() while True: try: taskInputList = self.inputList.get(1) # no more datasets if len(taskInputList) == 0: self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__, self.numTasks)) return # loop over all tasks for taskSpec,inputChunk in taskInputList: lastJediTaskID = taskSpec.jediTaskID # make logger tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID)) tmpLog.debug('start') # get nuclei nucleusList = siteMapper.nuclei if taskSpec.nucleus in nucleusList: candidateNucleus = taskSpec.nucleus else: tmpLog.debug('got {0} candidates'.format(len(nucleusList))) ###################################### # check status newNucleusList = {} for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if not tmpNucleusSpec.state in ['ACTIVE']: tmpLog.debug(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus, tmpNucleusSpec.state)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check endpoint newNucleusList = {} tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output','log']) for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): toSkip = False for tmpDatasetSpec in tmpDatasetSpecList: # ignore distributed datasets if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) != None: continue # get endpoint with the pattern tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken) if tmpEP == None: tmpLog.debug(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus, tmpDatasetSpec.storageToken)) toSkip = True break # check state """ if not tmpEP['state'] in ['ACTIVE']: tmpLog.debug(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus, tmpEP['ddm_endpoint_name'], tmpEP['state'])) toSkip = True break """ # check space tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired'] if tmpSpaceSize < diskThreshold: tmpLog.debug(' skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus, tmpSpaceSize, diskThreshold, tmpEP['state'])) toSkip = True break if not toSkip: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # data locality toSkip = False availableData = {} for datasetSpec in inputChunk.getDatasets(): # only for real datasets if datasetSpec.isPseudo(): continue # ignore DBR if DataServiceUtils.isDBR(datasetSpec.datasetName): continue # skip locality check if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck: continue # get nuclei 
where data is available tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF, datasetSpec.datasetName, nucleusList.keys()) if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) toSkip = True break # sum for tmpNucleus,tmpVals in tmpRet.iteritems(): if not tmpNucleus in availableData: availableData[tmpNucleus] = tmpVals else: availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems()) if toSkip: continue if availableData != {}: newNucleusList = {} # skip if no data for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if availableData[tmpNucleus]['tot_size'] > thrInputSize and \ availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac: tmpLog.debug(' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus, availableData[tmpNucleus]['ava_size_any'], availableData[tmpNucleus]['tot_size'], thrInputSizeFrac)) elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \ availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac: tmpLog.debug(' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus, availableData[tmpNucleus]['ava_num_any'], availableData[tmpNucleus]['tot_num'], thrInputNumFrac)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # ability to execute jobs newNucleusList = {} # get all panda sites tmpSiteList = [] for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): tmpSiteList += tmpNucleusSpec.allPandaSites tmpSiteList = list(set(tmpSiteList)) tmpLog.debug('===== start for job check') jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF) tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True, tmpSiteList,tmpLog) tmpLog.debug('===== done for job check') if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.debug('failed to get sites where jobs can run. 
Use any nuclei where input is available') # use any nuclei where input is available if no sites can run jobs tmpRet = tmpSiteList okNuclei = set() for tmpSite in tmpRet: siteSpec = siteMapper.getSite(tmpSite) okNuclei.add(siteSpec.pandasite) for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if tmpNucleus in okNuclei: newNucleusList[tmpNucleus] = tmpNucleusSpec else: tmpLog.debug(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus)) nucleusList = newNucleusList tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # RW taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID) ###################################### # weight self.prioRW.acquire() nucleusRW = self.prioRW[taskSpec.currentPriority] self.prioRW.release() totalWeight = 0 nucleusweights = [] for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems(): if not tmpNucleus in nucleusRW: nucleusRW[tmpNucleus] = 0 wStr = '1' # with RW if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW: weight = 1 / float(nucleusRW[tmpNucleus]) wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus]) else: weight = 1 wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW) # with data if availableData != {}: weight *= float(availableData[tmpNucleus]['ava_size_any']) weight /= float(availableData[tmpNucleus]['tot_size']) wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any']) wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size']) # negative weight for tape if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']: weight *= negWeightTape wStr += '*({0}=weight for TAPE)'.format(negWeightTape) tmpLog.debug(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr)) totalWeight += weight nucleusweights.append((tmpNucleus,weight)) tmpLog.debug('final {0} candidates'.format(len(nucleusList))) ###################################### # final selection tgtWeight = random.uniform(0,totalWeight) candidateNucleus = None for tmpNucleus,weight in nucleusweights: tgtWeight -= weight if tgtWeight <= 0: candidateNucleus = tmpNucleus break if candidateNucleus == None: candidateNucleus = nucleusweights[-1][0] ###################################### # update nucleusSpec = nucleusList[candidateNucleus] # get output/log datasets tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID, ['output','log']) # get destinations retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)} tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap) tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet)) # update RW table self.prioRW.acquire() for prio,rwMap in self.prioRW.iteritems(): if prio > taskSpec.currentPriority: continue if candidateNucleus in rwMap: rwMap[candidateNucleus] += taskRW else: rwMap[candidateNucleus] = taskRW self.prioRW.release() except: errtype,errvalue = sys.exc_info()[:2] errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue) errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID) errMsg += traceback.format_exc() logger.error(errMsg)
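# Standalone sketch of the weighted random draw used for the final nucleus selection
# above: an entry is picked with probability proportional to its weight, and the last
# entry serves as a guard against float round-off. Function and sample names are
# illustrative.
import random

def weighted_choice(weighted_items):
    # weighted_items: list of (name, weight) pairs with non-negative weights
    total = sum(weight for _, weight in weighted_items)
    target = random.uniform(0, total)
    for name, weight in weighted_items:
        target -= weight
        if target <= 0:
            return name
    return weighted_items[-1][0]

print(weighted_choice([('NUCLEUS_A', 0.7), ('NUCLEUS_B', 0.2), ('NUCLEUS_C', 0.1)]))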
def doGenerate(self,taskSpec,taskParamMap,**varMap): # make logger tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID)) tmpLog.info('start taskType={0}'.format(taskSpec.taskType)) tmpLog.info(str(varMap)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # check prodSourceLabel if taskSpec.prodSourceLabel in ['managed','test']: # check taskType if taskSpec.taskType == 'recov': # generate parent tasks for lost file recovery if it is not yet generated if taskParamMap.has_key('parentGenerated'): tmpLog.info('skip since already generated parent tasks') else: tmpLog.info('generating parent tasks for lost file recovery') # missing files are undefined if not varMap.has_key('missingFilesMap'): tmpLog.error('missing files are undefined') return retFatal missingFilesMap = varMap['missingFilesMap'] # check datasets for datasetName,datasetValMap in missingFilesMap.iteritems(): # dataset needs specify container datasetSpec = datasetValMap['datasetSpec'] if datasetSpec.containerName in ['',None]: errStr = 'cannot make parent tasks due to undefined container for datasetID={0}:{1}'.format(datasetSpec.datasetID, datasetName) taskSpec.setErrDiag(errStr) tmpLog.error(errStr) return retFatal # make parameters for new task newJsonStrList = [] for datasetName,datasetValMap in missingFilesMap.iteritems(): datasetSpec = datasetValMap['datasetSpec'] newTaskParamMap = {} newTaskParamMap['oldDatasetName'] = datasetName newTaskParamMap['lostFiles'] = datasetValMap['missingFiles'] newTaskParamMap['vo'] = taskSpec.vo newTaskParamMap['cloud'] = taskSpec.cloud newTaskParamMap['taskPriority'] = taskSpec.taskPriority newTaskParamMap['taskType'] = taskSpec.taskType newTaskParamMap['prodSourceLabel'] = taskSpec.prodSourceLabel logDatasetName = 'panda.jedi{0}.log.{1}'.format(taskSpec.taskType,uuid.uuid4()) newTaskParamMap['log'] = {'dataset': logDatasetName, 'type':'template', 'param_type':'log', 'token':'ATLASDATADISK', 'value':'{0}.${{SN}}.log.tgz'.format(logDatasetName)} # make new datasetname outDatasetName = datasetName # remove / outDatasetName = re.sub('/$','',outDatasetName) # remove extension outDatasetName = re.sub('\.{0}\d+$'.format(taskSpec.taskType),'',outDatasetName) # add extension outDatasetName = outDatasetName + '.{0}{1}'.format(taskSpec.taskType,taskSpec.jediTaskID) newTaskParamMap['output'] = {'dataset': outDatasetName} if not datasetSpec.containerName in ['',None]: newTaskParamMap['output']['container'] = datasetSpec.containerName # make json jsonStr = json.dumps(newTaskParamMap) newJsonStrList.append(jsonStr) # change original task parameters to not repeat the same procedure and to use newly produced files taskParamMap['parentGenerated'] = True taskParamMap['useInFilesInContainer'] = True taskParamMap['useInFilesWithNewAttemptNr'] = True jsonStr = json.dumps(taskParamMap) # insert and update task parameters sTmp,newJediTaskIDs = self.taskBufferIF.insertUpdateTaskParams_JEDI(taskSpec.jediTaskID, taskSpec.vo, taskSpec.prodSourceLabel, jsonStr,newJsonStrList) if sTmp: tmpLog.info('inserted/updated tasks in DB : new jediTaskIDs={0}'.format(str(newJediTaskIDs))) else: tmpLog.error('failed to insert/update tasks in DB') return retFatal # return tmpLog.info('done') return retOK except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doGenerate failed with {0}:{1}'.format(errtype.__name__,errvalue)) return retFatal
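# Sketch of the output-dataset renaming performed in doGenerate above, using the same
# re substitutions; task_type and jedi_task_id are illustrative stand-ins for the
# taskSpec fields.
import re

def derive_recovery_dataset_name(dataset_name, task_type, jedi_task_id):
    name = re.sub('/$', '', dataset_name)                    # drop trailing container slash
    name = re.sub(r'\.{0}\d+$'.format(task_type), '', name)  # drop an old <taskType><ID> extension
    return name + '.{0}{1}'.format(task_type, jedi_task_id)  # append the new one

print(derive_recovery_dataset_name('mc16.evgen.recov123/', 'recov', 456))
# -> mc16.evgen.recov456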
def runImpl(self): while True: try: # get a part of list nTasks = 100 taskList = self.taskList.get(nTasks) totalTasks, idxTasks = self.taskList.stat() # no more datasets if len(taskList) == 0: self.logger.debug( '{0} terminating since no more items'.format( self.__class__.__name__)) return # make logger tmpLog = MsgWrapper(self.logger) tmpLog.info( 'start TaskCheckerThread {0}/{1} for jediTaskID={2}'. format(idxTasks, totalTasks, taskList)) tmpStat = Interaction.SC_SUCCEEDED # get TaskSpecs taskSpecList = [] for jediTaskID in taskList: tmpRet, taskSpec = self.taskBufferIF.getTaskWithID_JEDI( jediTaskID, False) if tmpRet and taskSpec is not None: taskSpecList.append(taskSpec) else: tmpLog.error( 'failed to get taskSpec for jediTaskID={0}'.format( jediTaskID)) if taskSpecList != []: # get impl if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('getting Impl') try: impl = self.implFactory.getImpl( self.vo, self.prodSourceLabel) if impl is None: # task brokerage is undefined tmpLog.error( 'task broker is undefined for vo={0} sourceLabel={1}' .format(self.vo, self.prodSourceLabel)) tmpStat = Interaction.SC_FAILED except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('getImpl failed with {0}:{1}'.format( errtype.__name__, errvalue)) tmpStat = Interaction.SC_FAILED # check if tmpStat == Interaction.SC_SUCCEEDED: tmpLog.info('check with {0}'.format( impl.__class__.__name__)) try: tmpStat, taskCloudMap = impl.doCheck(taskSpecList) except Exception: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('doCheck failed with {0}:{1}'.format( errtype.__name__, errvalue)) tmpStat = Interaction.SC_FAILED # update if tmpStat != Interaction.SC_SUCCEEDED: tmpLog.error('failed to check assignment') else: tmpRet = self.taskBufferIF.setCloudToTasks_JEDI( taskCloudMap) tmpLog.info('done with {0} for {1}'.format( tmpRet, str(taskCloudMap))) except Exception: errtype, errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format( self.__class__.__name__, errtype.__name__, errvalue))
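# Rough sketch of the ListWithLock contract the worker loops above rely on: get(n)
# atomically hands out up to n items, and an empty chunk signals that the worker should
# terminate. The real class lives elsewhere in the package; this reimplementation only
# captures the assumed contract.
import threading

class ListWithLockSketch(object):
    def __init__(self, items):
        self._items = list(items)
        self._lock = threading.Lock()

    def get(self, n):
        # atomically take up to n items from the front
        with self._lock:
            chunk, self._items = self._items[:n], self._items[n:]
            return chunk

work = ListWithLockSketch(range(5))
while True:
    part = work.get(2)
    if not part:
        break  # no more items: terminate, as runImpl does
    print(part)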
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskList = self.taskList.get(nTasks) # no more datasets if len(taskList) == 0: self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__)) return # loop over all tasks for jediTaskID,commandMap in taskList: # make logger tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID)) commandStr = commandMap['command'] commentStr = commandMap['comment'] oldStatus = commandMap['oldStatus'] tmpLog.info('start for {0}'.format(commandStr)) tmpStat = Interaction.SC_SUCCEEDED if commandStr in ['kill','finish','reassign']: tmpMsg = 'executing {0}'.format(commandStr) tmpLog.sendMsg(tmpMsg,self.msgType) # loop twice to see immediate result for iLoop in range(2): # get active PandaIDs to be killed if commandStr == 'reassign' and commentStr != None and 'soft reassign' in commentStr: pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID) else: pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True) if pandaIDs == None: tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID)) tmpStat = Interaction.SC_FAILED # kill jobs or update task if tmpStat == Interaction.SC_SUCCEEDED: if pandaIDs == []: # done since no active jobs tmpMsg = 'completed cleaning jobs' tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) tmpTaskSpec = JediTaskSpec() tmpTaskSpec.jediTaskID = jediTaskID updateTaskStatus = True if commandStr != 'reassign': # reset oldStatus # keep oldStatus for task reassignment since it is reset when actually reassigned tmpTaskSpec.forceUpdate('oldStatus') else: # extract cloud or site if commentStr != None: tmpItems = commentStr.split(':') if tmpItems[0] == 'cloud': tmpTaskSpec.cloud = tmpItems[1] else: tmpTaskSpec.site = tmpItems[1] tmpMsg = 'set {0}={1}'.format(tmpItems[0],tmpItems[1]) tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) # back to oldStatus if necessary if tmpItems[2] == 'y': tmpTaskSpec.status = oldStatus tmpTaskSpec.forceUpdate('oldStatus') updateTaskStatus = False if commandStr == 'reassign': tmpTaskSpec.forceUpdate('errorDialog') if updateTaskStatus: tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done'] tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status) tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID}) tmpLog.info('done with {0}'.format(str(tmpRet))) break else: # kill only in the first loop if iLoop > 0: break # wait or kill jobs if commentStr != None and 'soft finish' in commentStr: tmpMsg = "waiting for {0} jobs for soft finish".format(len(pandaIDs)) tmpLog.info(tmpMsg) tmpRet = True tmpLog.info('done with {0}'.format(str(tmpRet))) break else: tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs)) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg,self.msgType) if commandStr in ['reassign','finish']: # force kill tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'52',True) else: # normal kill tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True) tmpLog.info('done with {0}'.format(str(tmpRet))) elif commandStr in ['retry','incexec']: tmpMsg = 'executing {0}'.format(commandStr) tmpLog.sendMsg(tmpMsg,self.msgType) # change task params for incexec if commandStr == 'incexec': try: # read task params taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) # remove some params for newKey in ['nFiles','fixedSandbox']: try: del taskParamMap[newKey] except: pass # 
convert new params newParamMap = RefinerUtils.decodeJSON(commentStr) # change params for newKey,newVal in newParamMap.iteritems(): if newVal == None: # delete if newKey in taskParamMap: del taskParamMap[newKey] else: # change taskParamMap[newKey] = newVal # overwrite sandbox if 'fixedSandbox' in taskParamMap: # noBuild for tmpParam in taskParamMap['jobParameters']: if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) != None: tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox']) # build if taskParamMap.has_key('buildSpec'): taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox'] # merge if taskParamMap.has_key('mergeSpec'): taskParamMap['mergeSpec']['jobParameters'] = \ re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),taskParamMap['mergeSpec']['jobParameters']) # encode new param strTaskParams = RefinerUtils.encodeJSON(taskParamMap) tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams) if tmpRet != True: tmpLog.error('failed to update task params') continue except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue)) continue # retry failed files tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr) if tmpRet == True: tmpMsg = 'set task.status={0}'.format(newTaskStatus) tmpLog.sendMsg(tmpMsg,self.msgType) tmpLog.info(tmpMsg) tmpLog.info('done with {0}'.format(tmpRet)) else: tmpLog.error('unknown command') except: errtype,errvalue = sys.exc_info()[:2] errStr = '{0} failed in runImpl() with {1}:{2} '.format(self.__class__.__name__,errtype.__name__,errvalue) errStr += traceback.format_exc() logger.error(errStr)
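# Minimal sketch of the incexec parameter-merge rule applied above: a None value in the
# new parameter map deletes the key, any other value overwrites it. The helper name is
# illustrative.
def merge_task_params(task_params, new_params):
    merged = dict(task_params)
    for key, value in new_params.items():
        if value is None:
            merged.pop(key, None)  # None means: remove this parameter
        else:
            merged[key] = value    # otherwise overwrite it
    return merged

print(merge_task_params({'nFiles': 10, 'site': 'CERN'}, {'nFiles': None, 'site': 'BNL'}))
# -> {'site': 'BNL'}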
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap): # make logger tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID)) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL, inputChunk retTmpError = self.SC_FAILED, inputChunk # set cloud try: if not taskParamMap: taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI( taskSpec.jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) if not taskSpec.cloud and 'cloud' in taskParamMap: taskSpec.cloud = taskParamMap['cloud'] except Exception: pass # get sites in the cloud site_preassigned = True if taskSpec.site not in ['', None]: tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site)) if self.siteMapper.checkSite(taskSpec.site): scanSiteList = [taskSpec.site] else: scanSiteList = [] for tmpSite in self.siteMapper.getCloud( taskSpec.cloud)['sites']: if re.search(taskSpec.site, tmpSite): scanSiteList.append(tmpSite) if not scanSiteList: tmpLog.error('unknown site={}'.format(taskSpec.site)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError elif inputChunk.getPreassignedSite() is not None: scanSiteList = [inputChunk.getPreassignedSite()] tmpLog.debug('site={0} is pre-assigned in masterDS'.format( inputChunk.getPreassignedSite())) else: site_preassigned = False scanSiteList = self.siteMapper.getCloud(taskSpec.cloud)['sites'] # remove NA if 'NA' in scanSiteList: scanSiteList.remove('NA') tmpLog.debug('cloud=%s has %s candidates' % (taskSpec.cloud, len(scanSiteList))) tmpLog.debug('initial {0} candidates'.format(len(scanSiteList))) ###################################### # selection for status and PandaSite newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status if tmpSiteSpec.status != 'online' and not site_preassigned: tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status)) continue # check PandaSite if 'PandaSite' in taskParamMap and taskParamMap['PandaSite']: if tmpSiteSpec.pandasite != taskParamMap['PandaSite']: tmpLog.debug(' skip %s due to wrong PandaSite=%s <> %s' % (tmpSiteName, tmpSiteSpec.pandasite, taskParamMap['PandaSite'])) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for scratch disk minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize( ) + inputChunk.getMaxAtomSize() minDiskCountS = minDiskCountS // 1024 // 1024 # size for direct IO sites if taskSpec.useLocalIO(): minDiskCountR = minDiskCountS else: minDiskCountR = taskSpec.getOutDiskSize( ) + taskSpec.getWorkDiskSize() minDiskCountR = minDiskCountR // 1024 // 1024 newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir: if JediCoreUtils.use_direct_io_for_job(taskSpec, tmpSiteSpec, inputChunk): minDiskCount = minDiskCountR else: minDiskCount = minDiskCountS if minDiskCount > tmpSiteSpec.maxwdir: tmpLog.debug( ' skip {0} due to small scratch disk={1} < {2}'. 
format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # free space must be >= 200GB diskThreshold = 200 tmpSpaceSize = tmpSiteSpec.space if tmpSiteSpec.space and tmpSpaceSize < diskThreshold: tmpLog.debug( ' skip {0} due to disk shortage in SE = {1} < {2}GB'. format(tmpSiteName, tmpSiteSpec.space, diskThreshold)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for walltime minWalltime = taskSpec.walltime if minWalltime not in [0, None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime: tmpLog.debug( ' skip {0} due to short site walltime={1}(site upper limit) < {2}' .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime)) continue if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime: tmpLog.debug( ' skip {0} due to short job walltime={1}(site lower limit) > {2}' .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format( len(scanSiteList), minWalltime, taskSpec.walltimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # selection for memory origMinRamCount = inputChunk.getMaxRamCount() if not site_preassigned and origMinRamCount: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # job memory requirement if taskSpec.ramPerCore(): minRamCount = origMinRamCount * ( tmpSiteSpec.coreCount if tmpSiteSpec.coreCount else 1) minRamCount += (taskSpec.baseRamCount if taskSpec.baseRamCount else 0) else: minRamCount = origMinRamCount # site max memory requirement site_maxmemory = tmpSiteSpec.maxrss if tmpSiteSpec.maxrss else 0 # check at the site if site_maxmemory and minRamCount and minRamCount > site_maxmemory: tmpMsg = ' skip site={0} due to site RAM shortage {1}(site upper limit) less than {2} '.format( tmpSiteName, site_maxmemory, minRamCount) tmpLog.debug(tmpMsg) continue # site min memory requirement site_minmemory = tmpSiteSpec.minrss if tmpSiteSpec.minrss else 0 if site_minmemory and minRamCount and minRamCount < site_minmemory: tmpMsg = ' skip site={0} due to job RAM shortage {1}(site lower limit) greater than {2} '.format( tmpSiteName, site_minmemory, minRamCount) tmpLog.info(tmpMsg) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return 
retTmpError ###################################### # selection for nPilot nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if tmpSiteName in nWNmap: nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][ 'updateJob'] if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']: tmpLog.debug(' skip %s due to no pilot' % tmpSiteName) #continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # sites already used by task tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI( taskSpec.jediTaskID) if not tmpSt: tmpLog.error('failed to get sites which already used by task') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # get list of available files availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # get list of site to be scanned tmpLog.debug( 'getting the list of available files for {0}'.format( datasetSpec.datasetName)) fileScanSiteList = [] for tmpPseudoSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName) tmpSiteName = tmpSiteSpec.get_unified_name() if tmpSiteName in fileScanSiteList: continue fileScanSiteList.append(tmpSiteName) # mapping between sites and input storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap( fileScanSiteList, self.siteMapper, taskSpec.prodSourceLabel, None) # disable file lookup for merge jobs if inputChunk.isMerging: checkCompleteness = False else: checkCompleteness = True if not datasetSpec.isMaster(): useCompleteOnly = True else: useCompleteOnly = False # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles( datasetSpec, siteStorageEP, self.siteMapper, check_completeness=checkCompleteness, file_scan_in_container=False, complete_only=useCompleteOnly) if tmpAvFileMap is None: raise Interaction.JEDITemporaryError( 'ddmIF.getAvailableFiles failed') availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except Exception as e: tmpLog.error('failed to get available files with {}'.format(e)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # calculate weight tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare( taskSpec.vo) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError ###################################### # final procedure tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) weightMap = {} candidateSpecList = [] preSiteCandidateSpec = None for tmpSiteName in scanSiteList: # get number of jobs in each job status. 
Using workQueueID=None to include non-JEDI jobs nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None) weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1) # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # set weight siteCandidateSpec.weight = weight # files for tmpDatasetName, availableFiles in six.iteritems( availableFileMap): if tmpSiteName in availableFiles: siteCandidateSpec.add_local_disk_files( availableFiles[tmpSiteName]['localdisk']) # append if tmpSiteName in sitesUsedByTask: candidateSpecList.append(siteCandidateSpec) else: if weight not in weightMap: weightMap[weight] = [] weightMap[weight].append(siteCandidateSpec) # limit the number of sites maxNumSites = 5 weightList = list(weightMap.keys()) weightList.sort() weightList.reverse() for weightVal in weightList: if len(candidateSpecList) >= maxNumSites: break sitesWithWeight = weightMap[weightVal] random.shuffle(sitesWithWeight) candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))] # collect site names scanSiteList = [] for siteCandidateSpec in candidateSpecList: scanSiteList.append(siteCandidateSpec.siteName) # append candidates newScanSiteList = [] for siteCandidateSpec in candidateSpecList: # append inputChunk.addSiteCandidate(siteCandidateSpec) newScanSiteList.append(siteCandidateSpec.siteName) tmpLog.debug(' use {} with weight={} nFiles={}'.format( siteCandidateSpec.siteName, siteCandidateSpec.weight, len(siteCandidateSpec.localDiskFiles))) scanSiteList = newScanSiteList if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retTmpError # return tmpLog.debug('done') return self.SC_SUCCEEDED, inputChunk
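# --- Illustrative sketch, not part of the original module ---
# The weight used above ranks sites by throughput versus backlog:
# weight = (nRunning+1) / (nActivated+nAssigned+1) / (nAssigned+1).
# A minimal, self-contained restatement with hypothetical job counts,
# showing how a busy site with an empty queue outranks an idle site
# with a backlog:
def _demo_broker_weight(nRunning, nActivated, nAssigned):
    # same formula as in doBrokerage above
    return float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)

# _demo_broker_weight(100, 0, 0)  -> 101.0
# _demo_broker_weight(10, 50, 20) -> ~0.0074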
def runImpl(self):
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID,commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill','finish','reassign']:
                    # get active PandaIDs to be killed
                    pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID,True)
                    if pandaIDs is None:
                        tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                        tmpStat = Interaction.SC_FAILED
                    # kill jobs or update task
                    if tmpStat == Interaction.SC_SUCCEEDED:
                        if pandaIDs == []:
                            # done since no active jobs
                            tmpLog.info('completed the command')
                            tmpTaskSpec = JediTaskSpec()
                            tmpTaskSpec.jediTaskID = jediTaskID
                            updateTaskStatus = True
                            if commandStr != 'reassign':
                                # keep oldStatus for task reassignment since it is reset when actually reassigned
                                tmpTaskSpec.forceUpdate('oldStatus')
                            else:
                                # extract cloud or site from the comment field, encoded as <cloud|site>:<value>:<y|n>
                                tmpItems = commentStr.split(':')
                                if tmpItems[0] == 'cloud':
                                    tmpTaskSpec.cloud = tmpItems[1]
                                else:
                                    tmpTaskSpec.site = tmpItems[1]
                                # back to oldStatus if necessary
                                if tmpItems[2] == 'y':
                                    tmpTaskSpec.status = oldStatus
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                    updateTaskStatus = False
                            if updateTaskStatus:
                                tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                            tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec,{'jediTaskID':jediTaskID})
                        else:
                            tmpLog.info('sending kill command')
                            tmpRet = self.taskBufferIF.killJobs(pandaIDs,commentStr,'50',True)
                        tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry','incexec']:
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles','fixedSandbox']:
                                try:
                                    del taskParamMap[newKey]
                                except KeyError:
                                    pass
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params
                            for newKey,newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    if newKey in taskParamMap:
                                        del taskParamMap[newKey]
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and re.search('^-a [^ ]+$',tmpParam['value']) is not None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+','-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID,strTaskParams)
                            if tmpRet is not True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception:
                            errtype,errvalue = sys.exc_info()[:2]
                            tmpLog.error('failed to change task params with {0}:{1}'.format(errtype.__name__,errvalue))
                            continue
                    # retry failed files
                    tmpRet,newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID,commandStr)
                    if tmpRet is True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg,self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
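# --- Illustrative sketch, not part of the original module ---
# The 'reassign' branch above expects commentStr to encode its target as
# '<cloud|site>:<value>:<y|n>', where the last field says whether the task
# goes back to oldStatus. A hedged restatement of that decoding (the helper
# name and the 'CERN' value are hypothetical):
def _demo_parse_reassign_comment(commentStr):
    targetType, targetValue, backFlag = commentStr.split(':')
    return {'type': targetType,                # 'cloud' or 'site'
            'value': targetValue,              # e.g. 'CERN'
            'backToOldStatus': backFlag == 'y'}

# _demo_parse_reassign_comment('cloud:CERN:y')
# -> {'type': 'cloud', 'value': 'CERN', 'backToOldStatus': True}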
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap): # make logger tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format( taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL, inputChunk retTmpError = self.SC_FAILED, inputChunk # get primary site candidates sitePreAssigned = False excludeList = [] includeList = None scanSiteList = [] # get list of site access siteAccessList = self.taskBufferIF.listSiteAccess( None, taskSpec.userName) siteAccessMap = {} for tmpSiteName, tmpAccess in siteAccessList: siteAccessMap[tmpSiteName] = tmpAccess # site limitation if taskSpec.useLimitedSites(): if 'excludedSite' in taskParamMap: excludeList = taskParamMap['excludedSite'] # str to list for task retry try: if type(excludeList) != types.ListType: excludeList = excludeList.split(',') except: pass if 'includedSite' in taskParamMap: includeList = taskParamMap['includedSite'] # str to list for task retry if includeList == '': includeList = None try: if type(includeList) != types.ListType: includeList = includeList.split(',') except: pass # loop over all sites for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems(): if tmpSiteSpec.type == 'analysis': scanSiteList.append(siteName) # preassigned if not taskSpec.site in ['', None]: # site is pre-assigned tmpLog.info('site={0} is pre-assigned'.format(taskSpec.site)) sitePreAssigned = True if not taskSpec.site in scanSiteList: scanSiteList.append(taskSpec.site) tmpLog.info('initial {0} candidates'.format(len(scanSiteList))) # allowed remote access protocol allowedRemoteProtocol = 'fax' # MP if taskSpec.coreCount != None and taskSpec.coreCount > 1: # use MCORE only useMP = 'only' elif taskSpec.coreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' ###################################### # selection for status newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # skip unified queues if tmpSiteSpec.is_unified: tmpLog.info( ' skip site=%s due to is_unified=%s criteria=-unified' % (tmpSiteName, tmpSiteSpec.is_unified)) continue # check site status skipFlag = False if tmpSiteSpec.status in ['offline']: skipFlag = True elif tmpSiteSpec.status in ['brokeroff', 'test']: if not sitePreAssigned: skipFlag = True elif tmpSiteName != taskSpec.site: skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.info( ' skip site=%s due to status=%s criteria=-status' % (tmpSiteName, tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.info('{0} candidates passed site status check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for MP newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.info(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount)) scanSiteList = newScanSiteList tmpLog.info('{0} candidates passed for useMP={1}'.format( len(scanSiteList), 
useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: unified_site_list = self.get_unified_sites(scanSiteList) if taskSpec.transHome.startswith('ROOT'): # hack until x86_64-slc6-gcc47-opt is published in installedsw if taskSpec.architecture == 'x86_64-slc6-gcc47-opt': tmpCmtConfig = 'x86_64-slc6-gcc46-opt' else: tmpCmtConfig = taskSpec.architecture siteListWithSW = self.taskBufferIF.checkSitesWithRelease( unified_site_list, cmtConfig=tmpCmtConfig, onlyCmtConfig=True) elif 'AthAnalysis' in taskSpec.transHome or re.search('Ath[a-zA-Z]+Base',taskSpec.transHome) != None \ or 'AnalysisBase' in taskSpec.transHome: # AthAnalysis siteListWithSW = self.taskBufferIF.checkSitesWithRelease( unified_site_list, cmtConfig=taskSpec.architecture, onlyCmtConfig=True) else: # remove AnalysisTransforms- transHome = re.sub('^[^-]+-*', '', taskSpec.transHome) transHome = re.sub('_', '-', transHome) if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \ re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None and \ re.search('_\d+\.\d+\.\d+$',taskSpec.transHome) is None: # cache is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( unified_site_list, caches=transHome, cmtConfig=taskSpec.architecture) elif (transHome == '' and taskSpec.transUses != None) or \ (re.search('_\d+\.\d+\.\d+$',taskSpec.transHome) is not None and \ (taskSpec.transUses is None or re.search('-\d+\.\d+$',taskSpec.transUses) is None)): # remove Atlas- transUses = taskSpec.transUses.split('-')[-1] # release is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( unified_site_list, releases=transUses, cmtConfig=taskSpec.architecture) siteListWithSW += self.taskBufferIF.checkSitesWithRelease( unified_site_list, caches=transHome, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease( unified_site_list, releases='CVMFS') newScanSiteList = [] for tmpSiteName in unified_site_list: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.info(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture)) scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList) tmpLog.info('{0} candidates passed for SW {1}:{2}:{3}'.format( len(scanSiteList), taskSpec.transUses, taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for memory minRamCount = inputChunk.getMaxRamCount() minRamCount = JediCoreUtils.compensateRamCount(minRamCount) if not minRamCount in [0, None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # site max memory requirement if not tmpSiteSpec.maxrss in [0, None]: site_maxmemory = tmpSiteSpec.maxrss else: site_maxmemory = tmpSiteSpec.maxmemory if not site_maxmemory in [ 0, None ] and 
minRamCount != 0 and minRamCount > site_maxmemory: tmpLog.info( ' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory' .format(tmpSiteName, site_maxmemory, minRamCount)) continue # site min memory requirement if not tmpSiteSpec.minrss in [0, None]: site_minmemory = tmpSiteSpec.minrss else: site_minmemory = tmpSiteSpec.minmemory if not site_minmemory in [ 0, None ] and minRamCount != 0 and minRamCount < site_minmemory: tmpLog.info( ' skip site={0} due to job RAM shortage. site_minmemory={1} > job_minramcount={2} criteria=-highmemory' .format(tmpSiteName, site_minmemory, minRamCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList ramUnit = taskSpec.ramUnit if ramUnit is None: ramUnit = 'MB' tmpLog.info('{0} candidates passed memory check = {1} {2}'.format( len(scanSiteList), minRamCount, ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk tmpMaxAtomSize = inputChunk.getMaxAtomSize() tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) tmpOutDiskSize = taskSpec.getOutDiskSize() tmpWorkDiskSize = taskSpec.getWorkDiskSize() minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize minDiskCountS = minDiskCountS / 1024 / 1024 # size for direct IO sites if taskSpec.useLocalIO(): minDiskCountR = minDiskCountS else: minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize minDiskCountR = minDiskCountR / 1024 / 1024 tmpLog.info( 'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}' .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize, tmpWorkDiskSize)) tmpLog.info('minDiskCountScratch={0} minDiskCountRemote={1}'.format( minDiskCountS, minDiskCountR)) newScanSiteList = [] for tmpSiteName in self.get_unified_sites(scanSiteList): tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0: if tmpSiteSpec.isDirectIO(): minDiskCount = minDiskCountR else: minDiskCount = minDiskCountS if minDiskCount > tmpSiteSpec.maxwdir: tmpLog.info( ' skip site={0} due to small scratch disk={1} < {2} criteria=-disk' .format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList) tmpLog.info('{0} candidates passed scratch disk check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in self.get_unified_sites(scanSiteList): # check endpoint tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) tmpEndPoint = tmpSiteSpec.ddm_endpoints_output.getEndPoint( tmpSiteSpec.ddm_output) if tmpEndPoint is not None: # free space must be >= 200GB diskThreshold = 200 tmpSpaceSize = 0 if tmpEndPoint['space_expired'] is not None: tmpSpaceSize += tmpEndPoint['space_expired'] if tmpEndPoint['space_free'] is not None: tmpSpaceSize += tmpEndPoint['space_free'] if tmpSpaceSize < diskThreshold: tmpLog.info( ' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk' .format(tmpSiteName, tmpSpaceSize, diskThreshold)) continue # check if blacklisted if tmpEndPoint['blacklisted'] == 
'Y': tmpLog.info( ' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist' .format(tmpSiteName, tmpSiteSpec.ddm_output)) continue newScanSiteList.append(tmpSiteName) scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList) tmpLog.info('{0} candidates passed SE space check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime minWalltime = taskSpec.walltime if not minWalltime in [0, None] and minWalltime > 0: minWalltime *= tmpEffAtomSize newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime: tmpLog.info( ' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime' .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime)) continue if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime: tmpLog.info( ' skip site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime' .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.info('{0} candidates passed walltime check ={1}{2}'.format( len(scanSiteList), minWalltime, taskSpec.walltimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in self.get_unified_sites(scanSiteList): # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][ 'updateJob'] if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']: tmpLog.info( ' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) if not self.testMode: continue newScanSiteList.append(tmpSiteName) scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList) tmpLog.info('{0} candidates passed pilot activity check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # check inclusion and exclusion newScanSiteList = [] sitesForANY = [] for tmpSiteName in self.get_unified_sites(scanSiteList): autoSite = False # check exclusion if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList): tmpLog.info( ' skip site={0} excluded criteria=-excluded'.format( tmpSiteName)) continue # check inclusion if includeList != None and not AtlasBrokerUtils.isMatched( tmpSiteName, includeList): if 'AUTO' in includeList: autoSite = True else: tmpLog.info( ' skip site={0} not included criteria=-notincluded'. 
format(tmpSiteName)) continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limited access if tmpSiteSpec.accesscontrol == 'grouplist': if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \ siteAccessMap[tmpSiteSpec.sitename] != 'approved': tmpLog.info( ' skip site={0} limited access criteria=-limitedaccess' .format(tmpSiteName)) continue # check cloud if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]: tmpLog.info( ' skip site={0} cloud mismatch criteria=-cloudmismatch'. format(tmpSiteName)) continue if autoSite: sitesForANY.append(tmpSiteName) else: newScanSiteList.append(tmpSiteName) # use AUTO sites if no sites are included if newScanSiteList == []: newScanSiteList = sitesForANY else: for tmpSiteName in sitesForANY: tmpLog.info( ' skip site={0} not included criteria=-notincluded'. format(tmpSiteName)) scanSiteList = self.get_pseudo_sites(newScanSiteList, scanSiteList) tmpLog.info('{0} candidates passed inclusion/exclusion/cloud'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability hasDDS = False dataWeight = {} remoteSourceList = {} if inputChunk.getDatasets() != []: oldScanSiteList = copy.copy(scanSiteList) oldScanUnifiedSiteList = self.get_unified_sites(oldScanSiteList) for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug( 'getting the list of sites where {0} is available'. format(datasetName)) tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData( self.get_unified_sites(scanSiteList), self.siteMapper, self.ddmIF, datasetName) if tmpSt in [ Interaction.JEDITemporaryError, Interaction.JEDITimeoutError ]: tmpLog.error( 'temporary failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError if tmpSt == Interaction.JEDIFatalError: tmpLog.error( 'fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet if datasetName.startswith('ddo'): tmpLog.debug(' {0} sites'.format(len(tmpRet))) else: tmpLog.debug(' {0} sites : {1}'.format( len(tmpRet), str(tmpRet))) # check if distributed if tmpRet != {}: isDistributed = True for tmpMap in tmpRet.values(): for tmpVal in tmpMap.values(): if tmpVal['state'] == 'complete': isDistributed = False break if not isDistributed: break if isDistributed: # check if really distributed isDistributed = self.ddmIF.isDistributedDataset( datasetName) if isDistributed: hasDDS = True datasetSpec.setDistributed() tmpLog.debug(' {0} is distributed'.format( datasetName)) # check if the data is available at somewhere if self.dataSiteMap[datasetName] == {}: for tmpSiteName in scanSiteList: tmpLog.info( ' skip site={0} data is unavailable criteria=-input' .format(tmpSiteName)) tmpLog.error( '{0} is unavailable at any site'.format(datasetName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # get the list of sites where data is available scanSiteList = None scanSiteListOnDisk = 
None normFactor = 0 for datasetName, tmpDataSite in self.dataSiteMap.iteritems(): normFactor += 1 # get sites where replica is available tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=True) tmpDiskSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=False) # get sites which can remotely access source sites if inputChunk.isMerging: # disable remote access for merging tmpSatelliteSites = {} elif (not sitePreAssigned) or ( sitePreAssigned and not taskSpec.site in tmpSiteList): tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites( tmpDiskSiteList, self.taskBufferIF, self.siteMapper, nSites=50, protocol=allowedRemoteProtocol) else: tmpSatelliteSites = {} # make weight map for local for tmpSiteName in tmpSiteList: if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = 0 # give more weight to disk if tmpSiteName in tmpDiskSiteList: dataWeight[tmpSiteName] += 1 else: dataWeight[tmpSiteName] += 0.001 # make weight map for remote for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems( ): # skip since local data is available if tmpSiteName in tmpSiteList: continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # negative weight for remote access wRemote = 50.0 if not tmpSiteSpec.wansinklimit in [0, None]: wRemote /= float(tmpSiteSpec.wansinklimit) # sum weight if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = float( tmpWeightSrcMap['weight']) / wRemote else: dataWeight[tmpSiteName] += float( tmpWeightSrcMap['weight']) / wRemote # make remote source list if not remoteSourceList.has_key(tmpSiteName): remoteSourceList[tmpSiteName] = {} remoteSourceList[tmpSiteName][ datasetName] = tmpWeightSrcMap['source'] # first list if scanSiteList == None: scanSiteList = [] for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys(): if not tmpSiteName in oldScanUnifiedSiteList: continue if not tmpSiteName in scanSiteList: scanSiteList.append(tmpSiteName) scanSiteListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys( ): if not tmpSiteName in oldScanUnifiedSiteList: continue scanSiteListOnDisk.add(tmpSiteName) continue # pickup sites which have all data newScanList = [] for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys(): if tmpSiteName in scanSiteList and not tmpSiteName in newScanList: newScanList.append(tmpSiteName) scanSiteList = newScanList tmpLog.debug('{0} is available at {1} sites'.format( datasetName, len(scanSiteList))) # pickup sites which have all data on DISK newScanListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(): if tmpSiteName in scanSiteListOnDisk: newScanListOnDisk.add(tmpSiteName) scanSiteListOnDisk = newScanListOnDisk tmpLog.debug('{0} is available at {1} sites on DISK'.format( datasetName, len(scanSiteListOnDisk))) # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: scanSiteList = [] tmpLog.info( 'data is unavailable locally or remotely at preassigned site {0}' .format(taskSpec.site)) elif len(scanSiteListOnDisk) > 0: # use only disk sites scanSiteList = list(scanSiteListOnDisk) scanSiteList = self.get_pseudo_sites(scanSiteList, oldScanSiteList) # dump for tmpSiteName in oldScanSiteList: if tmpSiteName not in scanSiteList: tmpLog.info( ' skip site={0} data is unavailable criteria=-input'. 
format(tmpSiteName)) tmpLog.info('{0} candidates have input data'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # sites already used by task tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI( taskSpec.jediTaskID) if not tmpSt: tmpLog.error('failed to get sites which already used by task') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError sitesUsedByTask = self.get_unified_sites(sitesUsedByTask) ###################################### # calculate weight """ fqans = taskSpec.makeFQANs() tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans, taskSpec.workingGroup,True) currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight) currentPriority -= 500 tmpLog.debug('currentPriority={0}'.format(currentPriority)) """ tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsByGlobalShare( taskSpec.vo) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # check for preassigned if sitePreAssigned and (taskSpec.site not in scanSiteList and taskSpec.site not in self.get_unified_sites(scanSiteList)): tmpLog.info("preassigned site {0} did not pass all tests".format( taskSpec.site)) tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # final procedure tmpLog.info('final {0} candidates'.format(len(scanSiteList))) weightMap = {} candidateSpecList = [] timeWindowForFC = 6 preSiteCandidateSpec = None failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI( taskSpec.jediTaskID, timeWindowForFC) problematicSites = set() for tmpPseudoSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName) tmpSiteName = tmpSiteSpec.get_unified_name() # get number of jobs in each job status. 
Using workQueueID=None to include non-JEDI jobs nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None) nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'starting', None, None) nFailed = 0 nClosed = 0 nFinished = 0 if tmpSiteName in failureCounts: if 'failed' in failureCounts[tmpSiteName]: nFailed = failureCounts[tmpSiteName]['failed'] if 'closed' in failureCounts[tmpSiteName]: nClosed = failureCounts[tmpSiteName]['closed'] if 'finished' in failureCounts[tmpSiteName]: nFinished = failureCounts[tmpSiteName]['finished'] # problematic sites if nFailed + nClosed > 2 * nFinished: problematicSites.add(tmpSiteName) # calculate weight weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1) nThrottled = 0 if remoteSourceList.has_key(tmpSiteName): nThrottled = AtlasBrokerUtils.getNumJobs( jobStatPrioMap, tmpSiteName, 'throttled', None, None) weight /= float(nThrottled + 1) # normalize weights by taking data availability into account tmpDataWeight = 1 if dataWeight.has_key(tmpSiteName): weight = weight * dataWeight[tmpSiteName] tmpDataWeight = dataWeight[tmpSiteName] # make candidate siteCandidateSpec = SiteCandidate(tmpPseudoSiteName) # preassigned if sitePreAssigned and tmpSiteName == taskSpec.site: preSiteCandidateSpec = siteCandidateSpec # set weight siteCandidateSpec.weight = weight tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format( tmpPseudoSiteName, nRunning, nAssigned, nActivated, nStarting) tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format( nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight) tmpLog.info(tmpStr) # append if tmpSiteName in sitesUsedByTask: candidateSpecList.append(siteCandidateSpec) else: if not weightMap.has_key(weight): weightMap[weight] = [] weightMap[weight].append(siteCandidateSpec) # sort candidates by weights weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightVal in weightList: sitesWithWeight = weightMap[weightVal] random.shuffle(sitesWithWeight) candidateSpecList += sitesWithWeight # limit the number of sites. 
use all sites for distributed datasets if not hasDDS: maxNumSites = 10 # remove problematic sites candidateSpecList = AtlasBrokerUtils.skipProblematicSites( candidateSpecList, problematicSites, sitesUsedByTask, preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog) # append preassigned if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: candidateSpecList.append(preSiteCandidateSpec) # collect site names scanSiteList = [] for siteCandidateSpec in candidateSpecList: scanSiteList.append(siteCandidateSpec.siteName) # get list of available files availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # get list of site to be scanned fileScanSiteList = [] for tmpPseudoSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName) tmpSiteName = tmpSiteSpec.get_unified_name() if tmpSiteName in fileScanSiteList: continue fileScanSiteList.append(tmpSiteName) if remoteSourceList.has_key( tmpSiteName ) and remoteSourceList[tmpSiteName].has_key( datasetSpec.datasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ datasetSpec.datasetName]: if not tmpRemoteSite in fileScanSiteList: fileScanSiteList.append(tmpRemoteSite) # mapping between sites and input storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteInputStorageEndpointMap( fileScanSiteList, self.siteMapper) # disable file lookup for merge jobs if inputChunk.isMerging: checkCompleteness = False else: checkCompleteness = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles( datasetSpec, siteStorageEP, self.siteMapper, check_completeness=checkCompleteness) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__, errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # append candidates newScanSiteList = [] for siteCandidateSpec in candidateSpecList: tmpPseudoSiteName = siteCandidateSpec.siteName tmpSiteSpec = self.siteMapper.getSite(tmpPseudoSiteName) tmpSiteName = tmpSiteSpec.get_unified_name() # preassigned if sitePreAssigned and tmpSiteName != taskSpec.site: tmpLog.info( ' skip site={0} non pre-assigned site criteria=-nonpreassigned' .format(tmpPseudoSiteName)) continue # set available files if inputChunk.getDatasets() == []: isAvailable = True else: isAvailable = False for tmpDatasetName, availableFiles in availableFileMap.iteritems(): tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName) # check remote files if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[ tmpSiteName].has_key(tmpDatasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ tmpDatasetName]: if availableFiles.has_key(tmpRemoteSite) and \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']): # use only remote disk files siteCandidateSpec.remoteFiles += availableFiles[ tmpRemoteSite]['localdisk'] # set remote site and access protocol siteCandidateSpec.remoteProtocol = allowedRemoteProtocol siteCandidateSpec.remoteSource = tmpRemoteSite isAvailable = True break # local files if availableFiles.has_key(tmpSiteName): if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \ len(tmpDatasetSpec.Files) 
<= len(availableFiles[tmpSiteName]['localtape']) or \ (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0): siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['localdisk'] # add cached files to local list since cached files go to pending when reassigned siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.localTapeFiles += availableFiles[ tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[ tmpSiteName]['remote'] siteCandidateSpec.addAvailableFiles( availableFiles[tmpSiteName]['all']) isAvailable = True else: tmpMsg = '{0} is incomplete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}' tmpLog.debug( tmpMsg.format( tmpDatasetName, tmpPseudoSiteName, len(tmpDatasetSpec.Files), len(availableFiles[tmpSiteName]['localdisk']), len(availableFiles[tmpSiteName]['cache']), len(availableFiles[tmpSiteName]['localtape']), )) if not isAvailable: break # append if not isAvailable: tmpLog.info( ' skip site={0} file unavailable criteria=-fileunavailable' .format(siteCandidateSpec.siteName)) continue inputChunk.addSiteCandidate(siteCandidateSpec) newScanSiteList.append(siteCandidateSpec.siteName) tmpLog.info( ' use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use' .format( siteCandidateSpec.siteName, siteCandidateSpec.weight, len(siteCandidateSpec.localDiskFiles), len(siteCandidateSpec.localTapeFiles), len(siteCandidateSpec.cacheFiles), len(siteCandidateSpec.remoteFiles), )) scanSiteList = newScanSiteList if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # send info to logger self.sendLogMessage(tmpLog) # return tmpLog.debug('done') return self.SC_SUCCEEDED, inputChunk
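# --- Illustrative sketch, not part of the original module ---
# The analysis brokerage weight above combines queue pressure, remote-access
# throttling, and data locality: base = (nRunning+1)/(nActivated+nAssigned+
# nStarting+1), divided by (nThrottled+1) when the site relies on remote
# sources, then multiplied by the data weight built earlier (+1 per dataset
# replica on disk, +0.001 for tape-only replicas). A compact restatement
# with hypothetical counts (the helper name is an assumption):
def _demo_anal_weight(nRunning, nActivated, nAssigned, nStarting,
                      nThrottled=0, dataWeight=None):
    weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1)
    weight /= float(nThrottled + 1)
    if dataWeight is not None:
        weight *= dataWeight
    return weight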
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # rescue picked files
                    tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                      jedi_config.watchdog.waitForPicked)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to rescue')
                    else:
                        tmpLog.info('rescued {0} tasks'.format(tmpRet))
                    # reactivate pending tasks
                    tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                    timeoutForPending = None
                    if hasattr(jedi_config.watchdog,'timeoutForPendingVoLabel'):
                        timeoutForPending = JediCoreUtils.getConfigParam(jedi_config.watchdog.timeoutForPendingVoLabel,vo,prodSourceLabel)
                    if timeoutForPending is None:
                        timeoutForPending = jedi_config.watchdog.timeoutForPending
                    timeoutForPending = int(timeoutForPending)
                    tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                           jedi_config.watchdog.waitForPending,
                                                                           timeoutForPending)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to reactivate')
                    else:
                        tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                    # unlock tasks
                    tmpLog.info('unlock tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.unlockTasks_JEDI(vo,prodSourceLabel,
                                                                jedi_config.watchdog.waitForLocked)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to unlock')
                    else:
                        tmpLog.info('unlocked {0} tasks'.format(tmpRet))
                    # restart contents update
                    tmpLog.info('restart contents update for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.restartTasksForContentsUpdate_JEDI(vo,prodSourceLabel)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to restart')
                    else:
                        tmpLog.info('restarted {0} tasks'.format(tmpRet))
                    # kick exhausted tasks
                    tmpLog.info('kick exhausted tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.kickExhaustedTasks_JEDI(vo,prodSourceLabel,
                                                                       jedi_config.watchdog.waitForExhausted)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to kick')
                    else:
                        tmpLog.info('kicked {0} tasks'.format(tmpRet))
                    # finish tasks when goal is reached
                    tmpLog.info('finish achieved tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.getAchievedTasks_JEDI(vo,prodSourceLabel,
                                                                     jedi_config.watchdog.waitForAchieved)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to finish')
                    else:
                        for jediTaskID in tmpRet:
                            self.taskBufferIF.sendCommandTaskPanda(jediTaskID,'JEDI. Goal reached',True,'finish',comQualifier='soft')
                        tmpLog.info('finished {0} tasks'.format(tmpRet))
                    # vo/prodSourceLabel specific action
                    impl = self.getImpl(vo,prodSourceLabel)
                    if impl is not None:
                        tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                        tmpStat = impl.doAction()
                        if tmpStat != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to run special action for vo={0} label={1}'.format(vo,prodSourceLabel))
                        else:
                            tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
            tmpLog.info('done')
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
        # sleep if needed
        loopCycle = jedi_config.watchdog.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def doSetup(self,taskSpec,datasetToRegister,pandaJobs): # make logger tmpLog = MsgWrapper(logger,"<jediTaskID={0}>".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # register datasets if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']: # prod vs anal userSetup = False if taskSpec.prodSourceLabel in ['user']: userSetup = True # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if not tmpFileSpec.datasetID in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} for datasetID in datasetToRegister: # get output and log datasets tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID)) tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [datasetSpec.datasetName,datasetSpec.containerName]: if targetName == None: continue if not targetName in avDatasetList: # set lifetime if targetName.startswith('panda'): lifetime = 14 else: lifetime = None # check dataset/container in DDM tmpList = ddmIF.listDatasets(targetName) if tmpList == []: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset if datasetSpec.site in ['',None]: if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: locForRule = datasetSpec.destination elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) != None: location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken) elif taskSpec.cloud != None: # use T1 SE tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source'] location = siteMapper.getDdmEndpoint(tmpT1Name,datasetSpec.storageToken) else: location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken) if locForRule == None: locForRule = location # set metadata if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if not taskSpec.campaign in [None,'']: metaData['campaign'] = taskSpec.campaign if datasetSpec.getTransient() != None: metaData['transient'] = datasetSpec.getTransient() else: metaData = None # register dataset/container tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName, location, ddmBackEnd, lifetime, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location, lifetime=lifetime,metaData=metaData) if not tmpStat: tmpLog.error('failed to register {0}'.format(targetName)) return retFatal # procedures for user if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: # register location tmpToRegister = False if userSetup and targetName == datasetSpec.datasetName and not 
datasetSpec.site in ['',None]: userName = taskSpec.userName grouping = None tmpToRegister = True elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None: userName = None grouping = 'NONE' tmpToRegister = True if tmpToRegister: activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) tmpLog.info('registering location={0} lifetime={1}days activity={2} grouping={3}'.format(locForRule,lifetime, activity,grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName, lifetime=lifetime,backEnd=ddmBackEnd, activity=activity,grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} with {2} for {1}'.format(locForRule, targetName, ddmBackEnd)) return retFatal avDatasetList.append(targetName) else: tmpLog.info('{0} already registered'.format(targetName)) # check if dataset is in the container if datasetSpec.containerName != None and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if not cnDatasetMap.has_key(datasetSpec.containerName): cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName) # add dataset if not datasetSpec.datasetName in cnDatasetMap[datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID, 'datasetID':datasetID}) # open datasets if taskSpec.prodSourceLabel in ['managed','test']: # get the list of output/log datasets outDatasetList = [] for tmpPandaJob in pandaJobs: for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if not tmpFileSpec.destinationDBlock in outDatasetList: outDatasetList.append(tmpFileSpec.destinationDBlock) # open datasets for outDataset in outDatasetList: tmpLog.info('open {0}'.format(outDataset)) ddmIF.openDataset(outDataset) # unset lifetime ddmIF.setDatasetMetadata(outDataset,'lifetime',None) # return tmpLog.info('done') return retOK except: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal
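# --- Illustrative sketch, not part of the original module ---
# doSetup above attaches DDM metadata only when registering 'managed'/'test'
# output datasets (not their containers): the owning task ID, the campaign
# when set, and the transient flag when defined. A minimal restatement of
# that decision (function name and arguments are hypothetical):
def _demo_make_dataset_metadata(prodSourceLabel, jediTaskID, campaign=None, transient=None):
    if prodSourceLabel not in ['managed', 'test']:
        return None
    metaData = {'task_id': jediTaskID}
    if campaign not in [None, '']:
        metaData['campaign'] = campaign
    if transient is not None:
        metaData['transient'] = transient
    return metaData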
def start(self):
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # rescue picked files
                    tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                      jedi_config.watchdog.waitForPicked)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to rescue')
                    else:
                        tmpLog.info('rescued {0} tasks'.format(tmpRet))
                    # reactivate pending tasks
                    tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                           jedi_config.watchdog.waitForPending,
                                                                           jedi_config.watchdog.timeoutForPending)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to reactivate')
                    else:
                        tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                    # vo/prodSourceLabel specific action
                    impl = self.getImpl(vo,prodSourceLabel)
                    if impl is not None:
                        tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                        tmpStat = impl.doAction()
                        if tmpStat != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to run special action for vo={0} label={1}'.format(vo,prodSourceLabel))
                        else:
                            tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
            tmpLog.info('done')
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
        # sleep if needed
        loopCycle = jedi_config.watchdog.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        sleepPeriod = loopCycle - timeDelta.seconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def doSplit(self, taskSpec, inputChunk, siteMapper, allow_chunk_size_limit=False): # return for failure retFatal = self.SC_FATAL, [] retTmpError = self.SC_FAILED, [] # make logger tmpLog = MsgWrapper( logger, '< jediTaskID={0} datasetID={1} >'.format( taskSpec.jediTaskID, inputChunk.masterIndexName)) tmpLog.debug( 'start chunk_size_limit={}'.format(allow_chunk_size_limit)) if not inputChunk.isMerging: # set maxNumFiles using taskSpec if specified maxNumFiles = taskSpec.getMaxNumFilesPerJob() # set fsize gradients using taskSpec sizeGradients = taskSpec.getOutDiskSize() # set fsize intercepts using taskSpec sizeIntercepts = taskSpec.getWorkDiskSize() # walltime if not taskSpec.useHS06(): walltimeGradient = taskSpec.walltime else: walltimeGradient = taskSpec.getCpuTime() # number of events per job if defined nEventsPerJob = taskSpec.getNumEventsPerJob() # number of files per job if defined if not taskSpec.dynamicNumEvents(): nFilesPerJob = taskSpec.getNumFilesPerJob() else: nFilesPerJob = None if nFilesPerJob is None and nEventsPerJob is None and inputChunk.useScout() \ and not taskSpec.useLoadXML() and not taskSpec.respectSplitRule(): nFilesPerJob = 1 # grouping with boundaryID useBoundary = taskSpec.useGroupWithBoundaryID() # fsize intercepts per input size sizeGradientsPerInSize = None # max primary output size maxOutSize = None # max size per job maxSizePerJob = taskSpec.getMaxSizePerJob() if maxSizePerJob is not None: maxSizePerJob += InputChunk.defaultOutputSize # dynamic number of events dynNumEvents = taskSpec.dynamicNumEvents() # max number of event ranges maxNumEventRanges = None # multiplicity of jobs if taskSpec.useJobCloning(): multiplicity = 1 else: multiplicity = taskSpec.getNumEventServiceConsumer() # split with fields if taskSpec.getFieldNumToLFN( ) is not None and taskSpec.useFileAsSourceLFN(): splitByFields = taskSpec.getFieldNumToLFN() else: splitByFields = None else: # set parameters for merging maxNumFiles = taskSpec.getMaxNumFilesPerMergeJob() sizeGradients = 0 walltimeGradient = 0 nFilesPerJob = taskSpec.getNumFilesPerMergeJob() nEventsPerJob = taskSpec.getNumEventsPerMergeJob() maxSizePerJob = None useBoundary = {'inSplit': 3} dynNumEvents = False maxNumEventRanges = None multiplicity = None # gradients per input size is 1 + margin sizeGradientsPerInSize = self.sizeGradientsPerInSizeForMerge # intercepts for libDS sizeIntercepts = taskSpec.getWorkDiskSize() # margin of 500MB interceptsMergin = self.interceptsMerginForMerge if sizeIntercepts < interceptsMergin: sizeIntercepts = interceptsMergin maxOutSize = taskSpec.getMaxSizePerMergeJob() if maxOutSize is None: # max output size is 5GB for merging by default maxOutSize = 5 * 1024 * 1024 * 1024 # split with fields if taskSpec.getFieldNumToLFN( ) is not None and taskSpec.useFileAsSourceLFN(): splitByFields = list( range(4 + 1, 4 + 1 + len(taskSpec.getFieldNumToLFN()))) else: splitByFields = None # LB respectLB = taskSpec.respectLumiblock() # dump tmpLog.debug( 'maxNumFiles={0} sizeGradients={1} sizeIntercepts={2} useBoundary={3}' .format(maxNumFiles, sizeGradients, sizeIntercepts, useBoundary)) tmpLog.debug( 'walltimeGradient={0} nFilesPerJob={1} nEventsPerJob={2}'.format( walltimeGradient, nFilesPerJob, nEventsPerJob)) tmpLog.debug('useScout={} isMerging={}'.format(inputChunk.useScout(), inputChunk.isMerging)) tmpLog.debug( 'sizeGradientsPerInSize={0} maxOutSize={1} respectLB={2} dynNumEvents={3}' .format(sizeGradientsPerInSize, maxOutSize, respectLB, dynNumEvents)) tmpLog.debug('multiplicity={0} splitByFields={1} 
nFiles={2}'.format( multiplicity, str(splitByFields), inputChunk.getNumFilesInMaster())) # split returnList = [] subChunks = [] iSubChunks = 0 if inputChunk.useScout() and not inputChunk.isMerging: default_nSubChunks = 2 elif taskSpec.is_hpo_workflow(): default_nSubChunks = 2 else: default_nSubChunks = 25 subChunk = None nSubChunks = default_nSubChunks strict_chunkSize = False while True: # change site if iSubChunks % nSubChunks == 0 or subChunk == []: # append to return map if subChunks != []: # get site names for parallel execution if taskSpec.getNumSitesPerJob( ) > 1 and not inputChunk.isMerging and inputChunk.useJumbo != 'fake': siteName = inputChunk.getParallelSites( taskSpec.getNumSitesPerJob(), nSubChunks, [siteName]) returnList.append({ 'siteName': siteName, 'subChunks': subChunks, 'siteCandidate': siteCandidate, }) try: gshare = taskSpec.gshare.replace(' ', '_') except Exception: gshare = None tmpLog.info('split to nJobs=%s at site=%s gshare=%s' % (len(subChunks), siteName, gshare)) # checkpoint inputChunk.checkpoint_file_usage() # reset subChunks = [] # skip PQs with chunk size limit ngList = [] if not allow_chunk_size_limit: for siteName in inputChunk.get_candidate_names(): siteSpec = siteMapper.getSite(siteName) if siteSpec.get_job_chunk_size() is not None: ngList.append(siteName) # new candidate siteCandidate, getCandidateMsg = inputChunk.getOneSiteCandidate( nSubChunks, ngSites=ngList, get_msg=True) if siteCandidate is None: tmpLog.debug('no candidate: {0}'.format(getCandidateMsg)) break siteName = siteCandidate.siteName siteSpec = siteMapper.getSite(siteName) # set chunk size nSubChunks = siteSpec.get_job_chunk_size() if nSubChunks is None: nSubChunks = default_nSubChunks strict_chunkSize = False else: strict_chunkSize = True # directIO if not JediCoreUtils.use_direct_io_for_job( taskSpec, siteSpec, inputChunk): useDirectIO = False else: useDirectIO = True # get maxSize if it is set in taskSpec maxSize = maxSizePerJob if maxSize is None: # use maxwdir as the default maxSize if not useDirectIO: maxSize = siteCandidate.get_overridden_attribute( 'maxwdir') if maxSize is None: maxSize = siteSpec.maxwdir if maxSize: maxSize *= 1024 * 1024 elif nEventsPerJob is not None or nFilesPerJob is not None: maxSize = None else: maxSize = siteCandidate.get_overridden_attribute( 'maxwdir') if maxSize is None: maxSize = siteSpec.maxwdir maxSize = max(50000, maxSize) * 1024 * 1024 else: # add offset maxSize += sizeIntercepts # max disk size maxDiskSize = siteCandidate.get_overridden_attribute('maxwdir') if maxDiskSize is None: maxDiskSize = siteSpec.maxwdir if maxDiskSize: maxDiskSize *= 1024 * 1024 # max walltime maxWalltime = None if not inputChunk.isMerging: maxWalltime = taskSpec.getMaxWalltime() if maxWalltime is None: maxWalltime = siteSpec.maxtime # core count if siteSpec.coreCount: coreCount = siteSpec.coreCount else: coreCount = 1 # core power corePower = siteSpec.corepower # max num of event ranges for dynNumEvents if dynNumEvents: maxNumEventRanges = int(siteSpec.get_n_sim_events() // taskSpec.get_min_granularity()) if maxNumEventRanges == 0: maxNumEventRanges = 1 tmpLog.debug( 'chosen {0} : {1} : nQueue={2} nRunCap={3}'.format( siteName, getCandidateMsg, siteCandidate.nQueuedJobs, siteCandidate.nRunningJobsCap)) tmpLog.debug('new weight {0}'.format(siteCandidate.weight)) tmpLog.debug( 'maxSize={0} maxWalltime={1} coreCount={2} corePower={3} maxNumEventRanges={4} maxDisk={5}' .format(maxSize, maxWalltime, coreCount, corePower, maxNumEventRanges, maxDiskSize)) 
            tmpLog.debug('useDirectIO={0} label={1}'.format(useDirectIO, taskSpec.prodSourceLabel))
        # get sub chunk
        subChunk = inputChunk.getSubChunk(siteName,
                                          maxSize=maxSize,
                                          maxNumFiles=maxNumFiles,
                                          sizeGradients=sizeGradients,
                                          sizeIntercepts=sizeIntercepts,
                                          nFilesPerJob=nFilesPerJob,
                                          walltimeGradient=walltimeGradient,
                                          maxWalltime=maxWalltime,
                                          nEventsPerJob=nEventsPerJob,
                                          useBoundary=useBoundary,
                                          sizeGradientsPerInSize=sizeGradientsPerInSize,
                                          maxOutSize=maxOutSize,
                                          coreCount=coreCount,
                                          respectLB=respectLB,
                                          corePower=corePower,
                                          dynNumEvents=dynNumEvents,
                                          maxNumEventRanges=maxNumEventRanges,
                                          multiplicity=multiplicity,
                                          splitByFields=splitByFields,
                                          tmpLog=tmpLog,
                                          useDirectIO=useDirectIO,
                                          maxDiskSize=maxDiskSize,
                                          enableLog=True)
        if subChunk is None:
            break
        if subChunk != []:
            # append
            subChunks.append(subChunk)
        iSubChunks += 1
    # append to return map if anything remains
    isSkipped = False
    if subChunks != []:
        # skip if chunk size is not enough
        if allow_chunk_size_limit and strict_chunkSize and len(subChunks) < nSubChunks:
            tmpLog.debug('skip splitting since chunk size {} is less than chunk size limit {} at {}'.format(
                len(subChunks), nSubChunks, siteName))
            inputChunk.rollback_file_usage()
            isSkipped = True
        else:
            # get site names for parallel execution
            if taskSpec.getNumSitesPerJob() > 1 and not inputChunk.isMerging:
                siteName = inputChunk.getParallelSites(taskSpec.getNumSitesPerJob(),
                                                       nSubChunks, [siteName])
            returnList.append({'siteName': siteName,
                               'subChunks': subChunks,
                               'siteCandidate': siteCandidate,
                               })
            try:
                gshare = taskSpec.gshare.replace(' ', '_')
            except Exception:
                gshare = None
            tmpLog.info('split to nJobs=%s at site=%s gshare=%s' % (len(subChunks), siteName, gshare))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, returnList, isSkipped
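
# A hedged usage sketch for doSplit. The caller objects (splitter, taskSpec, inputChunk,
# siteMapper) are assumed to be constructed elsewhere, and generate_jobs is a hypothetical
# placeholder, not a JEDI API.
def _doSplit_usage_sketch(splitter, taskSpec, inputChunk, siteMapper, generate_jobs):
    tmpStat, chunkList, isSkipped = splitter.doSplit(taskSpec, inputChunk, siteMapper,
                                                     allow_chunk_size_limit=True)
    if tmpStat == splitter.SC_SUCCEEDED and not isSkipped:
        for entry in chunkList:
            # each entry maps one site to the sub-chunks that become jobs there
            generate_jobs(entry['siteName'], entry['subChunks'], entry['siteCandidate'])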
def runImpl(self):
    while True:
        try:
            # get a part of the list
            nTasks = 100
            taskList = self.taskList.get(nTasks)
            totalTasks, idxTasks = self.taskList.stat()
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # make logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(idxTasks, totalTasks,
                                                                                   taskList))
            tmpStat = Interaction.SC_SUCCEEDED
            # get TaskSpecs
            tmpListToAssign = []
            for tmpTaskItem in taskList:
                tmpListItem = self.taskBufferIF.getTasksToBeProcessed_JEDI(None, None, None, None, None,
                                                                           simTasks=[tmpTaskItem],
                                                                           readMinFiles=True)
                if tmpListItem is None:
                    # failed
                    tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(tmpTaskItem))
                    tmpStat = Interaction.SC_FAILED
                    break
                tmpListToAssign += tmpListItem
            # get impl
            if tmpStat == Interaction.SC_SUCCEEDED:
                tmpLog.info('getting Impl')
                try:
                    impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                    if impl is None:
                        # task broker is undefined
                        tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(
                            self.vo, self.prodSourceLabel))
                        tmpStat = Interaction.SC_FAILED
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('getImpl failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    tmpStat = Interaction.SC_FAILED
            # brokerage
            if tmpStat == Interaction.SC_SUCCEEDED:
                tmpLog.info('brokerage with {0} for {1} tasks'.format(impl.__class__.__name__,
                                                                      len(tmpListToAssign)))
                try:
                    tmpStat = impl.doBrokerage(tmpListToAssign, self.vo, self.prodSourceLabel,
                                               self.workQueue, self.resource_name)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('doBrokerage failed with {0}:{1}'.format(errtype.__name__, errvalue))
                    tmpStat = Interaction.SC_FAILED
            # register
            if tmpStat != Interaction.SC_SUCCEEDED:
                tmpLog.error('failed')
            else:
                tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,
                                                                       errtype.__name__, errvalue))
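
# A hedged sketch of how TaskBrokerThread workers are typically driven: the worker loop
# above drains a shared, locked list until empty. make_thread is a hypothetical factory
# standing in for the real constructor, whose full signature is not shown in this file.
def _run_task_broker_workers_sketch(task_ids, n_workers, make_thread):
    taskList = ListWithLock(task_ids)
    threadPool = ThreadPool()
    for _ in range(n_workers):
        # each worker pulls batches from taskList via runImpl() until it is empty
        thr = make_thread(taskList, threadPool)
        thr.start()
    # wait until all workers have drained the list and terminated
    threadPool.join()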
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
    # params
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    minTotalWalltime = 50 * 1000 * 1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    nParallelCap = 5
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueID = workQueue.getID()
    workQueueName = '_'.join(workQueue.queue_name.split(' '))
    msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(vo, prodSourceLabel, cloudName,
                                                                        workQueueName, resource_name)
    tmpLog.debug('{0} start workQueueID={1}'.format(msgHeader, workQueueID))
    # get central configuration values
    config_map = self.__getConfiguration(vo, workQueue.queue_name, resource_name)
    configQueueLimit = config_map[NQUEUELIMIT]['value']
    configQueueCap = config_map[NQUEUECAP]['value']
    configRunningCap = config_map[NRUNNINGCAP]['value']
    tmpLog.debug(msgHeader + ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'.format(
        configQueueLimit, configQueueCap, configRunningCap))
    # check if unthrottled
    if not workQueue.throttled:
        msgBody = "PASS unthrottled since GS_throttled is False"
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
    # get the job statistics for our wq/gs and expand the stats map
    jobstats_map = self.__prepareJobStats(workQueue, resource_name, config_map)
    nRunning_rt = jobstats_map['nRunning_rt']
    nRunning_gs = jobstats_map['nRunning_gs']
    nRunning_runningcap = jobstats_map['nRunning_runningcap']
    nNotRun_rt = jobstats_map['nNotRun_rt']
    nNotRun_gs = jobstats_map['nNotRun_gs']
    nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
    nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
    nDefine_rt = jobstats_map['nDefine_rt']
    nDefine_gs = jobstats_map['nDefine_gs']
    nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
    nDefine_queuecap = jobstats_map['nDefine_queuecap']
    nWaiting_rt = jobstats_map['nWaiting_rt']
    nWaiting_gs = jobstats_map['nWaiting_gs']
    # check if higher priority tasks are waiting
    if workQueue.queue_name in non_rt_wqs:
        # find the highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName,
                                                                                   workQueue)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed', cloudName)
    else:
        # find the highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI('managed', cloudName,
                                                                                   workQueue, resource_name)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(vo, workQueue, 'managed',
                                                                         cloudName, resource_name)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error("{0} {1}".format(msgHeader, msgBody))
        return self.retTmpError
    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB \
            or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug("{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}".format(
        msgHeader, highestPrioWaiting, highestPrioInPandaDB, nNotRunHighestPrio, highPrioQueued))
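    # Example with assumed priorities: if the best waiting task has priority 900 while the
    # highest priority already in PanDA is 850, highPrioQueued becomes True and the SKIP
    # branches below are bypassed so that the high-priority work can still enter the queue.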
    # set maximum number of jobs to be submitted
    if workQueue.queue_name in non_rt_wqs:
        tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
    else:
        tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
    # use the lower limit to avoid creating too many _sub/_dis datasets
    nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)
    if configQueueLimit is not None:
        nQueueLimit = configQueueLimit
    else:
        nQueueLimit = nJobsInBunch * nBunch
    # use nPrestage for reprocessing
    if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
        # reset nJobsInBunch
        if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
            tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit + nDefine_queuelimit)
            if tmpRemainingSlot > nJobsInBunch:
                nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)
    # set number of jobs to be submitted
    if configQueueCap is None:
        self.setMaxNumJobs(nJobsInBunch / nParallel)
    else:
        self.setMaxNumJobs(configQueueCap / nParallelCap)
    # get total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(vo, prodSourceLabel, workQueue,
                                                          resource_name, cloudName)
    # log the current situation and limits
    tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(msgHeader, nQueueLimit,
                                                                       configRunningCap, configQueueCap))
    tmpLog.info("{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".format(
        msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs, nRunning_gs))
    tmpLog.info("{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}".format(
        msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt, nRunning_rt, totWalltime))
    # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    limitPriority = False
    if workQueue.queue_name not in non_rt_wqs \
            and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3}".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name in non_rt_wqs \
            and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3}".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
            and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold \
            and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                nNotRun_rt + nDefine_rt, nRunning_rt, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                nQueueLimit, totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
            and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold \
            and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                nNotRun_gs + nDefine_gs, nRunning_gs, threshold, nNotRun_queuelimit + nDefine_queuelimit,
                nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif nDefine_queuelimit > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(nDefine_queuelimit, nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif nWaiting_rt > max(nRunning_rt * nWaitingLimit, nJobsInBunch * nWaitingBunchLimit):
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch, nWaitingBunchLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif configRunningCap and nRunning_runningcap > configRunningCap:
        # cap on running
        msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(nRunning_runningcap,
                                                                          configRunningCap)
        tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
        tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                       escapeChar=True)
        return self.retMergeUnThr
    elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(nNotRun_queuecap + nDefine_queuecap,
                                                                         configQueueCap)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody), self.msgType, msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    else:
        # not enough jobs are queued
        if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
            tmpLog.debug(msgHeader + " not enough jobs queued")
            self.notEnoughJobsQueued()
            self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))
    msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(limitPriorityValue, self.maxNumJobs)
    tmpLog.info(msgHeader + " " + msgBody)
    return self.retUnThrottled
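
# A small self-contained sketch (assumed values, hypothetical helper name) of the slot
# arithmetic used by toBeThrottled: the queue may grow up to threshold x running, and the
# per-cycle bunch is clamped to [nJobsInBunchMin, nJobsInBunchMax].
def _throttle_slot_example():
    threshold = 2.0
    nBunch, nJobsInBunchMin, nJobsInBunchMax = 4, 500, 600
    nRunning_rt, nNotRun_rt = 400, 150
    # remaining slots before the queue reaches threshold x running: 400*2.0 - 150 = 650
    tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
    # clamp to the bunch window: min(max(500, 650), 600) = 600
    nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot), nJobsInBunchMax)
    # default queue limit when no central config is set: 600 * 4 = 2400
    nQueueLimit = nJobsInBunch * nBunch
    return tmpRemainingSlot, nJobsInBunch, nQueueLimit  # (650, 600, 2400)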