def registerDatasetSubscription(self,datasetName,location,activity=None,ignoreUnknown=False):
    """Register a subscription of a dataset to a location through DQ2.

    A subscription that already exists is treated as success. An unknown
    dataset is treated as success only when ``ignoreUnknown`` is true.

    :param datasetName: dataset to subscribe
    :param location: location handed to DQ2 as the subscription target
    :param activity: forwarded to DQ2 as the ``activity`` keyword
    :param ignoreUnknown: do not fail when DQ2 reports the dataset as unknown
    :return: ``(self.SC_SUCCEEDED, True)`` on success, otherwise
        ``(errCode, message)`` with ``errCode`` taken from ``self.checkError``
    """
    methodName = 'registerDatasetSubscription'
    methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    # the exception details are captured inside the handlers: once an except
    # clause has finished, sys.exc_info() no longer describes the error on
    # Python 3, so reading it afterwards would yield (None, None)
    errInfo = None
    try:
        # get DQ2 API
        dq2 = DQ2()
        # call
        dq2.registerDatasetSubscription(datasetName,location,activity=activity)
    except DQSubscriptionExistsException:
        # the subscription is already there
        pass
    except DQUnknownDatasetException:
        if not ignoreUnknown:
            errInfo = sys.exc_info()[:2]
    except:
        errInfo = sys.exc_info()[:2]
    if errInfo is not None:
        errtype,errvalue = errInfo
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
        tmpLog.error(errMsg)
        return errCode,'{0} : {1}'.format(methodName,errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED,True
def doAction(self):
    """Undo stale preassignments, preassign tasks and reassign their jobs.

    The work is skipped when the lock cannot be obtained. Any Exception is
    logged with its traceback; the method returns ``self.SC_SUCCEEDED`` in
    every case.
    """
    try:
        # get logger
        mainLog = MsgWrapper(logger)
        mainLog.debug('start')
        # lock
        if not self._get_lock():
            mainLog.debug('locked by another process. Skipped')
            return self.SC_SUCCEEDED
        mainLog.debug('got lock')
        # undo preassigned tasks
        self.undo_preassign()
        # preassign tasks to sites
        preassign_result = self.do_preassign()
        # NOTE: the lock is not released here (the release call is disabled in the original)
        # to-reassign map
        pending_map = preassign_result['to_reassign']
        if pending_map:
            # give the brokerage time to pick up the preassigned tasks first
            mainLog.debug(
                'wait {0}s before reassigning jobs'.format(reassign_jobs_wait_time))
            time.sleep(reassign_jobs_wait_time)
            # reassign jobs of preassigned tasks
            self.reassign_jobs(pending_map)
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        mainLog.error('failed with {0} {1} ; {2}'.format(
            exc_type, exc_value, traceback.format_exc()))
    # return
    mainLog.debug('done')
    return self.SC_SUCCEEDED
def doActionForReassgin(self,gTmpLog):
    """Reassign tasks: set origin metadata and subscribe their datasets to the T1.

    For every task returned by ``getTasksToReassign_JEDI`` the output and log
    datasets get their ``origin`` metadata set and a subscription registered.
    The task is switched back to its old status only when every dataset was
    handled; a task that fails at any step is left untouched.

    :param gTmpLog: logger used for the summary line about the task count
    """
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    tasksToReassign = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(tasksToReassign)))
    for taskSpec in tasksToReassign:
        taskID = taskSpec.jediTaskID
        # NOTE(review): the tag has no closing '>' - kept as in the original
        tmpLog = MsgWrapper(logger,'<jediTaskID={0}'.format(taskID))
        tmpLog.debug('start to reassign')
        # DDM backend
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # update cloudtasks
        cloudStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskID,taskSpec.cloud,'assigned',True)
        if cloudStat != 'SUCCEEDED':
            tmpLog.error('failed to update CloudTasks')
            continue
        # get datasets
        dsStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskID,['output','log'])
        if dsStat != True:
            tmpLog.error('failed to get datasets')
            continue
        # check cloud
        if not siteMapper.checkCloud(taskSpec.cloud):
            tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
            continue
        # get T1
        t1Site = siteMapper.getSite(siteMapper.getCloud(taskSpec.cloud)['dest'])
        # loop over all datasets; the else branch runs only when no dataset failed
        for datasetSpec in datasetSpecList:
            dsName = datasetSpec.datasetName
            tmpLog.debug('dataset={0}'.format(dsName))
            # get location
            location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
            # set origin metadata
            tmpLog.debug('setting metadata origin={0}'.format(location))
            metaStat = ddmIF.setDatasetMetadata(dsName,'origin',location)
            if metaStat != True:
                tmpLog.error("failed to set origin")
                break
            # make subscription
            tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                   ddmBackEnd))
            subStat = ddmIF.registerDatasetSubscription(dsName,location,
                                                        activity='Production',ignoreUnknown=True,
                                                        backEnd=ddmBackEnd)
            if subStat != True:
                tmpLog.error("failed to make subscription")
                break
        else:
            # succeeded: activate task
            taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskID})
            tmpLog.debug('finished to reassign')
def doCleanDataLocality(self):
    """Delete dataset-locality records older than 24 hours.

    The cleanup only runs when the process lock (requested with
    ``timeLimit=1440``) is obtained. Any Exception is logged with its
    traceback and not propagated. Returns None.
    """
    tmpLog = MsgWrapper(logger, ' #ATM #KV doCleanDataLocality')
    tmpLog.debug('start')
    try:
        # lock
        lock_args = {
            'vo': self.vo,
            'prodSourceLabel': 'default',
            'cloud': None,
            'workqueue_id': None,
            'resource_name': None,
            'component': 'AtlasDataLocalityUpdaterWatchDog.doCleanDataLocality',
            'pid': self.pid,
            'timeLimit': 1440,
        }
        if not self.taskBufferIF.lockProcess_JEDI(**lock_args):
            tmpLog.debug('locked by another process. Skipped')
            return
        tmpLog.debug('got lock')
        # lifetime of records
        record_lifetime_hours = 24
        # everything older than this timestamp is removed
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=record_lifetime_hours)
        n_deleted = self.taskBufferIF.deleteOutdatedDatasetLocality_JEDI(cutoff)
        tmpLog.info('cleaned up {0} records'.format(n_deleted))
        # done
        tmpLog.debug('done')
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1} {2}'.format(exc_type, exc_value, traceback.format_exc()))
def freezeDataset(self,datasetName,ignoreUnknown=False):
    """Freeze a dataset through DQ2.

    A dataset that is already frozen is treated as success. An unknown
    dataset is treated as success only when ``ignoreUnknown`` is true.

    :param datasetName: dataset to freeze
    :param ignoreUnknown: do not fail when DQ2 reports the dataset as unknown
    :return: ``(self.SC_SUCCEEDED, True)`` on success, otherwise
        ``(errCode, message)`` with ``errCode`` taken from ``self.checkError``
    """
    methodName = 'freezeDataset'
    methodName = '{0} datasetName={1}'.format(methodName,datasetName)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    # the exception details are captured inside the handlers: once an except
    # clause has finished, sys.exc_info() no longer describes the error on
    # Python 3, so reading it afterwards would yield (None, None)
    errInfo = None
    try:
        # get DQ2 API
        dq2 = DQ2()
        # freeze
        dq2.freezeDataset(datasetName)
    except DQFrozenDatasetException:
        # already frozen
        pass
    except DQUnknownDatasetException:
        if not ignoreUnknown:
            errInfo = sys.exc_info()[:2]
    except:
        errInfo = sys.exc_info()[:2]
    if errInfo is None:
        tmpLog.info('done')
        return self.SC_SUCCEEDED,True
    errtype,errvalue = errInfo
    errCode = self.checkError(errtype)
    errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
    tmpLog.error(errMsg)
    return errCode,'{0} : {1}'.format(methodName,errMsg)
def doAction(self):
    """Run the watchdog actions in a fixed order.

    Order: priority boost, reassign, throttled, high-priority pending (once
    per (minPriority, timeout) pair) and scout job data. Any Exception raised
    by an action is logged with its traceback and stops the remaining
    actions; the method still returns ``self.SC_SUCCEEDED``.
    """
    # the logger is created before the try block so that the handler and the
    # final debug line can always use it
    tmpLog = MsgWrapper(logger)
    try:
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority, timeoutVal in [
            (950, 10),
            (900, 30),
        ]:
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
    except Exception:
        # Exception instead of a bare except: SystemExit and
        # KeyboardInterrupt must still be able to stop the process
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(
            errtype.__name__, errvalue, traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    """Run the watchdog actions in a fixed order, ending with the jumbo watchdog.

    Order: priority boost, reassign, throttled, high-priority pending (once
    per (minPriority, timeout) pair), scout job data, throttling of jobs in
    paused tasks, and finally ``JumboWatchDog.run()``. Any Exception raised
    by an action is logged with its traceback and stops the remaining
    actions; the method still returns ``self.SC_SUCCEEDED``.
    """
    # the logger is created before the try block so that the handler and the
    # final debug line can always use it
    tmpLog = MsgWrapper(logger)
    try:
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority,timeoutVal in [(950,10),
                                       (900,30),
                                       ]:
            self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
        # action to throttle jobs in paused tasks
        self.doActionToThrottleJobInPausedTasks(tmpLog)
        # action for jumbo
        jumbo = JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed')
        jumbo.run()
    except Exception:
        # Exception instead of a bare except: SystemExit and
        # KeyboardInterrupt must still be able to stop the process
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                      traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    """Run the watchdog actions in a fixed order.

    Order: priority boost, reassign, throttled, high-priority pending (once
    per (minPriority, timeout) pair) and scout job data. Any Exception raised
    by an action is logged with its traceback and stops the remaining
    actions; the method still returns ``self.SC_SUCCEEDED``.
    """
    # the logger is created before the try block so that the handler and the
    # final debug line can always use it
    tmpLog = MsgWrapper(logger)
    try:
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for minPriority,timeoutVal in [(950,10),
                                       (900,30),
                                       ]:
            self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
    except Exception:
        # Exception instead of a bare except: SystemExit and
        # KeyboardInterrupt must still be able to stop the process
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(errtype.__name__,errvalue,
                                                      traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def getDatasetMetaData(self,datasetName):
    """Fetch the DQ2 metadata of a dataset with its state turned into a string.

    Every attribute listed by ``dq2.listMetaDataAttributes()`` is requested.
    The ``state`` entry is replaced by ``'closed'`` (closed or frozen),
    ``'open'`` or ``'unknown'``.

    :param datasetName: dataset to look up
    :return: ``(self.SC_SUCCEEDED, metadata)`` on success, otherwise
        ``(errCode, message)`` with ``errCode`` taken from ``self.checkError``
    """
    # make logger
    methodName = '{0} datasetName={1}'.format('getDatasetMetaData',datasetName)
    tmpLog = MsgWrapper(logger,methodName)
    try:
        # get DQ2 API
        dq2 = DQ2()
        # get all metadata attributes
        metaData = dq2.getMetaDataAttribute(datasetName,dq2.listMetaDataAttributes())
        # change dataset state to string
        rawState = metaData['state']
        if rawState in [DatasetState.CLOSED,DatasetState.FROZEN]:
            stateName = 'closed'
        elif rawState == DatasetState.OPEN:
            stateName = 'open'
        else:
            stateName = 'unknown'
        metaData['state'] = stateName
        tmpLog.debug(str(metaData))
        return self.SC_SUCCEEDED,metaData
    except:
        excType,excValue = sys.exc_info()[:2]
        errMsg = 'failed with {0} {1}'.format(excType.__name__,excValue)
        tmpLog.error(errMsg)
        errCode = self.checkError(excType)
        return errCode,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
def runImpl(self):
    """Worker loop: take task IDs in chunks of 100 and run the brokerage on them.

    For every chunk the input chunks of each task are collected, the broker
    implementation for ``self.vo`` / ``self.prodSourceLabel`` is looked up
    and ``doBrokerage`` is called. The loop returns when the shared list is
    empty. Errors in one chunk are logged and the loop moves on.
    """
    chunkSize = 100
    while True:
        try:
            # get a part of list
            taskList = self.taskList.get(chunkSize)
            totalTasks,idxTasks = self.taskList.stat()
            # no more items
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # make logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(idxTasks,totalTasks,taskList))
            status = Interaction.SC_SUCCEEDED
            # get TaskSpecs
            inputsToAssign = []
            for taskItem in taskList:
                taskInputs = self.taskBufferIF.getTasksToBeProcessed_JEDI(None,None,None,None,None,
                                                                          simTasks=[taskItem],
                                                                          readMinFiles=True)
                if taskInputs == None:
                    # failed
                    tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(taskItem))
                    status = Interaction.SC_FAILED
                    break
                inputsToAssign.extend(taskInputs)
            # get impl
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('getting Impl')
                try:
                    impl = self.implFactory.getImpl(self.vo,self.prodSourceLabel)
                    if impl == None:
                        # task broker is undefined
                        tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo,self.prodSourceLabel))
                        status = Interaction.SC_FAILED
                except:
                    excType,excValue = sys.exc_info()[:2]
                    tmpLog.error('getImpl failed with {0}:{1}'.format(excType.__name__,excValue))
                    status = Interaction.SC_FAILED
            # brokerage
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('brokerage with {0} for {1} tasks '.format(impl.__class__.__name__,len(inputsToAssign)))
                try:
                    status = impl.doBrokerage(inputsToAssign,self.vo,
                                              self.prodSourceLabel,self.workQueue)
                except:
                    excType,excValue = sys.exc_info()[:2]
                    tmpLog.error('doBrokerage failed with {0}:{1}'.format(excType.__name__,excValue))
                    status = Interaction.SC_FAILED
            # report the outcome of this chunk
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('done')
            else:
                tmpLog.error('failed')
        except:
            excType,excValue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,excType.__name__,excValue))
def doCheck(self,taskSpecList):
    """Ask PanDA where the given tasks were assigned and build the result map.

    For a task using the WORLD cloud the map entry is a dict with the
    nucleus and one destination record per output/log dataset (distributed
    datasets are skipped). For any other task the entry is the name
    reported by PanDA. Tasks for which PanDA reports ``'NULL'``, ``''`` or
    None are left out.

    :param taskSpecList: task specs to check; ``jediTaskID``, ``vo`` and
        ``useWorldCloud()`` are used
    :return: ``(self.SC_SUCCEEDED, retMap)``, or ``(self.SC_FAILED, {})``
        when the PanDA query fails
    """
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retTmpError = self.SC_FAILED,{}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    # items() works on both Python 2 and 3, iteritems() only on Python 2
    for tmpTaskID,tmpCoreName in cloudsInPanda.items():
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
        if tmpCoreName not in ['NULL','',None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get nucleus
                nucleus = siteSpec.pandasite
                # get output/log datasets
                # NOTE(review): the returned status is not checked - confirm
                # what the dataset list looks like on failure
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                # get destinations
                retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                for datasetSpec in tmpDatasetSpecs:
                    # skip distributed datasets
                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                        continue
                    # get token
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                    # use default endpoint
                    if token is None:
                        token = siteSpec.ddm
                    # add original token
                    if datasetSpec.storageToken not in ['',None]:
                        token += '/{0}'.format(datasetSpec.storageToken)
                    retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                          'token':'dst:{0}'.format(token),
                                                          'destination':tmpCoreName})
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,retMap
def doCheck(self,taskSpecList):
    """Ask PanDA where the given tasks were assigned and build the result map.

    For a task using the WORLD cloud the map entry is a dict with the
    nucleus and one destination record per output/log dataset (distributed
    datasets are skipped). For any other task the entry is the name
    reported by PanDA. Tasks for which PanDA reports ``'NULL'``, ``''`` or
    None are left out.

    :param taskSpecList: task specs to check; ``jediTaskID``, ``vo`` and
        ``useWorldCloud()`` are used
    :return: ``(self.SC_SUCCEEDED, retMap)``, or ``(self.SC_FAILED, {})``
        when the PanDA query fails
    """
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retTmpError = self.SC_FAILED,{}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    # items() works on both Python 2 and 3, iteritems() only on Python 2
    for tmpTaskID,tmpCoreName in cloudsInPanda.items():
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
        if tmpCoreName not in ['NULL','',None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get nucleus
                nucleus = siteSpec.pandasite
                # get output/log datasets
                # NOTE(review): the returned status is not checked - confirm
                # what the dataset list looks like on failure
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                # get destinations
                retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                for datasetSpec in tmpDatasetSpecs:
                    # skip distributed datasets
                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                        continue
                    # get token
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                    # use default endpoint
                    if token is None:
                        token = siteSpec.ddm
                    # add original token
                    if datasetSpec.storageToken not in ['',None]:
                        token += '/{0}'.format(datasetSpec.storageToken)
                    retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                          'token':'dst:{0}'.format(token),
                                                          'destination':tmpCoreName})
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,retMap
def runImpl(self):
    """Worker loop: take task IDs in chunks of 100 and check their assignment.

    The task specs of a chunk are loaded, the implementation for
    ``self.vo`` / ``self.prodSourceLabel`` is asked to ``doCheck`` them and
    the resulting map is passed to ``setCloudToTasks_JEDI``. The loop
    returns when the shared list is empty. Errors in one chunk are logged
    and the loop moves on.
    """
    chunkSize = 100
    while True:
        try:
            # get a part of list
            taskList = self.taskList.get(chunkSize)
            totalTasks,idxTasks = self.taskList.stat()
            # no more items
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # make logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskCheckerThread {0}/{1} for jediTaskID={2}'.format(idxTasks,totalTasks,taskList))
            status = Interaction.SC_SUCCEEDED
            # get TaskSpecs
            loadedSpecs = []
            for jediTaskID in taskList:
                gotIt,taskSpec = self.taskBufferIF.getTaskWithID_JEDI(jediTaskID,False)
                if gotIt and taskSpec != None:
                    loadedSpecs.append(taskSpec)
                else:
                    tmpLog.error('failed to get taskSpec for jediTaskID={0}'.format(jediTaskID))
            # nothing to check in this chunk
            if not loadedSpecs:
                continue
            # get impl
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('getting Impl')
                try:
                    impl = self.implFactory.getImpl(self.vo,self.prodSourceLabel)
                    if impl == None:
                        # task brokerage is undefined
                        tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(self.vo,self.prodSourceLabel))
                        status = Interaction.SC_FAILED
                except:
                    excType,excValue = sys.exc_info()[:2]
                    tmpLog.error('getImpl failed with {0}:{1}'.format(excType.__name__,excValue))
                    status = Interaction.SC_FAILED
            # check
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('brokerage with {0}'.format(impl.__class__.__name__))
                try:
                    status,taskCloudMap = impl.doCheck(loadedSpecs)
                except:
                    excType,excValue = sys.exc_info()[:2]
                    tmpLog.error('doCheck failed with {0}:{1}'.format(excType.__name__,excValue))
                    status = Interaction.SC_FAILED
            # update
            if status == Interaction.SC_SUCCEEDED:
                updateRet = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
                tmpLog.info('done with {0} for {1}'.format(updateRet,str(taskCloudMap)))
            else:
                tmpLog.error('failed to check assignment')
        except:
            excType,excValue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,excType.__name__,excValue))
def doCheck(self, taskSpecList):
    """Ask PanDA where the given tasks were assigned and build the result map.

    For a task using the WORLD cloud the map entry is a list with one
    destination record per output/log dataset. For any other task the entry
    is the name reported by PanDA. Tasks for which PanDA reports ``"NULL"``,
    ``""`` or None are left out.

    :param taskSpecList: task specs to check; ``jediTaskID``, ``vo`` and
        ``useWorldCloud()`` are used
    :return: ``(self.SC_SUCCEEDED, retMap)``, or ``(self.SC_FAILED, {})``
        when the PanDA query fails
    """
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug("start doCheck")
    # return for failure
    retTmpError = self.SC_FAILED, {}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug("check with panda")
    tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error("failed to see clouds")
        return retTmpError
    # make return map
    retMap = {}
    # items() works on both Python 2 and 3, iteritems() only on Python 2
    for tmpTaskID, tmpCoreName in cloudsInPanda.items():
        tmpLog.debug("jediTaskID={0} -> {1}".format(tmpTaskID, tmpCoreName))
        if tmpCoreName not in ["NULL", "", None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get output/log datasets
                # NOTE(review): the returned status is not checked - confirm
                # what the dataset list looks like on failure
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                    tmpTaskID, ["output", "log"]
                )
                # get destinations
                retMap[tmpTaskID] = []
                for datasetSpec in tmpDatasetSpecs:
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm, datasetSpec.storageToken)
                    # use default endpoint
                    if token is None:
                        token = siteSpec.ddm
                    retMap[tmpTaskID].append(
                        {
                            "datasetID": datasetSpec.datasetID,
                            "token": "dst:{0}".format(token),
                            "destination": tmpCoreName,
                        }
                    )
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug("ret {0}".format(str(retMap)))
    # return
    tmpLog.debug("done")
    return self.SC_SUCCEEDED, retMap
def start(self):
    """Main loop: run the plugin pre-action and action for every vo/label pair.

    After initialising the base classes the method loops forever. In each
    cycle the implementation for every (vo, prodSourceLabel) combination is
    looked up with ``subType=self.subStr``; when one exists its
    ``pre_action`` and ``doAction`` are called. The cycle length is
    ``self.period`` when set, otherwise ``jedi_config.watchdog.loopCycle``.
    Any Exception raised in a cycle is logged and the loop goes on.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # vo/prodSourceLabel specific action
                    impl = self.getImpl(vo, prodSourceLabel, subType=self.subStr)
                    if impl is not None:
                        plugin_name = impl.__class__.__name__
                        tmpLog.info(
                            'pre-action for vo={} label={} cls={}'.format(
                                vo, prodSourceLabel, plugin_name))
                        impl.pre_action(tmpLog, vo, prodSourceLabel, self.pid)
                        tmpLog.info(
                            'do action for vo={} label={} cls={}'.format(
                                vo, prodSourceLabel, plugin_name))
                        tmpStat = impl.doAction()
                        if tmpStat != Interaction.SC_SUCCEEDED:
                            tmpLog.error(
                                'failed to run special action for vo={} label={} cls={}'
                                .format(vo, prodSourceLabel, plugin_name))
                        else:
                            tmpLog.info(
                                'done for vo={} label={} cls={}'.format(
                                    vo, prodSourceLabel, plugin_name))
            tmpLog.info('done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.watchdog.loopCycle if self.period is None else self.period
        timeDelta = datetime.datetime.utcnow() - startTime
        # total_seconds() includes the days part of the elapsed time;
        # timedelta.seconds alone wraps around every 24 hours
        sleepPeriod = loopCycle - int(timeDelta.total_seconds())
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep(max_val=loopCycle)
def start(self):
    """Main loop: run the pending task commands for every vo/label pair.

    In each cycle the tasks returned by ``getTasksToExecCommand_JEDI`` are
    put into a locked list and processed by ``TaskCommandoThread`` workers;
    the cycle waits for the workers before moving on. The cycle length is
    ``jedi_config.tcommando.loopCycle``. Any Exception raised in a cycle is
    logged and the loop goes on.
    """
    # start base classes
    JediKnight.start(self)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to exec command
                    tmpList = self.taskBufferIF.getTasksToExecCommand_JEDI(
                        vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        tmpLog.error(
                            'failed to get the task list for vo={0} label={1}'
                            .format(vo, prodSourceLabel))
                    else:
                        tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # make workers
                        # NOTE(review): the worker count comes from the
                        # taskrefine section while the cycle uses tcommando -
                        # confirm this is intended
                        nWorker = jedi_config.taskrefine.nWorkers
                        for iWorker in range(nWorker):
                            thr = TaskCommandoThread(
                                taskList, threadPool, self.taskBufferIF,
                                self.ddmIF, self.pid)
                            thr.start()
                        # join
                        threadPool.join()
            tmpLog.debug('done')
        except Exception:
            # Exception instead of a bare except: SystemExit and
            # KeyboardInterrupt must still be able to stop the loop
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        # sleep if needed
        loopCycle = jedi_config.tcommando.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # total_seconds() includes the days part of the elapsed time;
        # timedelta.seconds alone wraps around every 24 hours
        sleepPeriod = loopCycle - int(timeDelta.total_seconds())
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def runImpl(self):
    """Worker loop: take tasks in chunks of 10 and post-process them.

    For each task the post-processor for its vo/prodSourceLabel is
    instantiated and ``doPostProcess`` is called. A missing implementation
    or an exception in ``doPostProcess`` marks the task as ``broken``.
    ``doFinalProcedure`` is then called whenever an implementation exists.
    The loop returns when the shared list is empty. Errors in one chunk are
    logged and the loop moves on.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more items
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for taskSpec in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
                tmpLog.info('start')
                tmpStat = Interaction.SC_SUCCEEDED
                # get impl
                impl = self.implFactory.instantiateImpl(taskSpec.vo,taskSpec.prodSourceLabel,None,
                                                        self.taskBufferIF,self.ddmIF)
                if impl is None:
                    # post processor is undefined
                    tmpLog.error('post-processor is undefined for vo={0} sourceLabel={1}'.format(taskSpec.vo,taskSpec.prodSourceLabel))
                    tmpStat = Interaction.SC_FATAL
                # execute
                if tmpStat == Interaction.SC_SUCCEEDED:
                    tmpLog.info('post-process with {0}'.format(impl.__class__.__name__))
                    try:
                        impl.doPostProcess(taskSpec,tmpLog)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('doPostProcess failed with {0}:{1}'.format(errtype.__name__,errvalue))
                        tmpStat = Interaction.SC_FATAL
                # done
                if tmpStat == Interaction.SC_FATAL:
                    # task is broken
                    tmpErrStr = 'post-process failed'
                    tmpLog.error(tmpErrStr)
                    taskSpec.status = 'broken'
                    taskSpec.setErrDiag(tmpErrStr)
                    taskSpec.lockedBy = None
                    self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
                elif tmpStat == Interaction.SC_FAILED:
                    tmpErrStr = 'post processing failed'
                    taskSpec.setOnHold()
                    taskSpec.setErrDiag(tmpErrStr,True)
                    taskSpec.lockedBy = None
                    self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID})
                    tmpLog.info('set task_status={0} since {1}'.format(taskSpec.status,taskSpec.errorDialog))
                    continue
                # final procedure; without an implementation there is nothing
                # to call, and trying would only log a misleading AttributeError
                if impl is not None:
                    try:
                        impl.doFinalProcedure(taskSpec,tmpLog)
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        tmpLog.error('doFinalProcedure failed with {0}:{1}'.format(errtype.__name__,errvalue))
                # done
                tmpLog.info('done')
        except:
            errtype,errvalue = sys.exc_info()[:2]
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
def start(self):
    """Main loop: refine the pending tasks for every vo/label pair.

    In each cycle the tasks returned by ``getTasksToRefine_JEDI`` are put
    into a locked list and processed by ``TaskRefinerThread`` workers; the
    cycle waits for the workers before moving on. The cycle length is
    ``jedi_config.taskrefine.loopCycle``. Any Exception raised in a cycle is
    logged and the loop goes on.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to refine
                    tmpList = self.taskBufferIF.getTasksToRefine_JEDI(vo,prodSourceLabel)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get the list of tasks to refine')
                    else:
                        tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # get work queue mapper
                        workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                        # make workers
                        nWorker = jedi_config.taskrefine.nWorkers
                        for iWorker in range(nWorker):
                            thr = TaskRefinerThread(taskList,threadPool,
                                                    self.taskBufferIF,
                                                    self.ddmIF,
                                                    self,workQueueMapper)
                            thr.start()
                        # join
                        threadPool.join()
        except Exception:
            # Exception instead of a bare except: SystemExit and
            # KeyboardInterrupt must still be able to stop the loop
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
        # sleep if needed
        loopCycle = jedi_config.taskrefine.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # total_seconds() includes the days part of the elapsed time;
        # timedelta.seconds alone wraps around every 24 hours
        sleepPeriod = loopCycle - int(timeDelta.total_seconds())
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def doForWaitingJobs(self):
    """Activate or kill jobs that wait for a lib.tgz file.

    For every lib.tgz entry returned by ``getLibForWaitingRunJob_JEDI``:
    a failed build leads to ``updateJobs`` on the build job, a finished
    build leads to ``setGUIDs`` and an ``Activator`` run on the lib dataset,
    and any other status is only logged.
    """
    tmpLog = MsgWrapper(logger, 'doForWaitingJobs label=user')
    # check every 60 min
    checkInterval = 60
    # get lib.tgz for waiting jobs
    libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo, self.prodSourceLabel, checkInterval)
    tmpLog.debug('got {0} lib.tgz files'.format(len(libList)))
    # NOTE(review): the user="******" templates below never reference
    # positional argument 0, so the user name is not written to the log -
    # kept exactly as in the original
    for prodUserName,datasetName,tmpFileSpec in libList:
        tmpLog = MsgWrapper(logger,'< #ATM #KV doForWaitingJobs jediTaskID={0} label=user >'.format(tmpFileSpec.jediTaskID))
        tmpLog.debug('start')
        # check status of lib.tgz
        libStatus = tmpFileSpec.status
        if libStatus == 'failed':
            # get buildJob
            buildJob = self.taskBufferIF.peekJobs([tmpFileSpec.PandaID],
                                                  fromDefined=False,
                                                  fromActive=False,
                                                  fromWaiting=False)[0]
            if buildJob is None:
                # PandaJobSpec not found
                tmpLog.error(' cannot find PandaJobSpec for user="******" with PandaID={1}'.format(prodUserName,
                                                                                              tmpFileSpec.PandaID))
            else:
                # kill
                self.taskBufferIF.updateJobs([buildJob],False)
                tmpLog.debug(' action=killed_downstream_jobs for user="******" with libDS={1}'.format(prodUserName,datasetName))
            continue
        if libStatus != 'finished':
            # lib.tgz is not ready
            tmpLog.debug(' keep waiting for user="******" libDS={1}'.format(prodUserName,datasetName))
            continue
        # lib.tgz is finished: set metadata
        guidEntry = {'guid':tmpFileSpec.GUID,
                     'lfn':tmpFileSpec.lfn,
                     'checksum':tmpFileSpec.checksum,
                     'fsize':tmpFileSpec.fsize,
                     'scope':tmpFileSpec.scope,
                     }
        self.taskBufferIF.setGUIDs([guidEntry])
        # get lib dataset
        libDataset = self.taskBufferIF.queryDatasetWithMap({'name':datasetName})
        if libDataset is None:
            # datasetSpec not found
            tmpLog.error(' cannot find datasetSpec for user="******" with libDS={1}'.format(prodUserName,datasetName))
        else:
            # activate jobs and wait for the activator to finish
            activator = Activator(self.taskBufferIF,libDataset)
            activator.start()
            activator.join()
            tmpLog.debug(' action=activated_downstream_jobs for user="******" with libDS={1}'.format(prodUserName,datasetName))
def doForPreStaging(self):
    """Throttle users with too much prestaging and release the others.

    Runs only when the process lock for ``self.cronActions['forPrestage']``
    is obtained. The tasks of users whose dispatch-dataset size exceeds the
    limit are throttled for 120 minutes unless already throttled. Users
    left in the throttled map afterwards get their tasks released. Any
    exception is logged with its traceback and not propagated.
    """
    try:
        tmpLog = MsgWrapper(logger,'doForPreStaging')
        # lock
        flagLocked = self.taskBufferIF.lockProcess_JEDI(self.vo,self.prodSourceLabel,
                                                        self.cronActions['forPrestage'],
                                                        0,self.pid,timeLimit=5)
        if not flagLocked:
            return
        tmpLog.debug('start')
        # get throttled users
        thrUserTasks = self.taskBufferIF.getThrottledUsersTasks_JEDI(self.vo,self.prodSourceLabel)
        # get dispatch datasets
        dispUserTasks = self.taskBufferIF.getDispatchDatasetsPerUser(self.vo,self.prodSourceLabel,True,True)
        # max size of prestaging requests in MB
        maxPrestaging = self.taskBufferIF.getConfigValue('anal_watchdog', 'PRESTAGE_LIMIT', 'jedi', 'atlas')
        if maxPrestaging is None:
            maxPrestaging = 1
        maxPrestaging *= 1024*1024
        # throttle interval
        thrInterval = 120
        # loop over all users; items() works on both Python 2 and 3 and is
        # what the release loop below already uses
        for userName,userDict in dispUserTasks.items():
            tmpLog.debug('{0} {1} GB'.format(userName, userDict['size']/1024))
            # too large
            if userDict['size'] > maxPrestaging:
                tmpLog.debug('{0} has too large prestaging {1}>{2} GB'.format(userName,
                                                                              userDict['size']/1024,
                                                                              maxPrestaging/1024))
                # throttle tasks
                for taskID in userDict['tasks']:
                    if userName not in thrUserTasks or taskID not in thrUserTasks[userName]:
                        tmpLog.debug('thottle jediTaskID={0}'.format(taskID))
                        errDiag = 'throttled for {0} min due to too large prestaging from TAPE'.format(thrInterval)
                        self.taskBufferIF.throttleTask_JEDI(taskID,thrInterval,errDiag)
                # remove the user from the list so that it is not released below
                if userName in thrUserTasks:
                    del thrUserTasks[userName]
        # release users
        for userName,taskIDs in thrUserTasks.items():
            tmpLog.debug('{0} release throttled tasks'.format(userName))
            # unthrottle tasks
            for taskID in taskIDs:
                tmpLog.debug('unthottle jediTaskID={0}'.format(taskID))
                self.taskBufferIF.releaseThrottledTask_JEDI(taskID)
        tmpLog.debug('done')
    except:
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1} {2}'.format(errtype,errvalue,traceback.format_exc()))
def doAction(self):
    """Run the priority-boost action followed by the reassign action.

    Any Exception raised by an action is logged and stops the remaining
    action; the method still returns ``self.SC_SUCCEEDED``.
    """
    # the logger is created before the try block so that the handler and the
    # final debug line can always use it
    tmpLog = MsgWrapper(logger)
    try:
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
    except Exception:
        # Exception instead of a bare except: SystemExit and
        # KeyboardInterrupt must still be able to stop the process
        errtype,errvalue = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1}'.format(errtype,errvalue))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    """Run ``do_for_data_locality`` and report success.

    Any Exception is logged with its traceback; the method returns
    ``self.SC_SUCCEEDED`` in every case.
    """
    try:
        # get logger
        mainLog = MsgWrapper(logger)
        mainLog.debug('start')
        # make tasks pending under certain conditions
        self.do_for_data_locality()
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        mainLog.error('failed with {0} {1} ; {2}'.format(
            exc_type, exc_value, traceback.format_exc()))
    # return
    mainLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    """Clean up and then update the data-locality records.

    Any Exception is logged; the method returns ``self.SC_SUCCEEDED`` in
    every case.
    """
    try:
        # get logger
        mainLog = MsgWrapper(logger)
        mainLog.debug('start')
        # first drop outdated records, then refresh the rest
        self.doCleanDataLocality()
        self.doUpdateDataLocality()
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        mainLog.error('failed with {0} {1}'.format(exc_type, exc_value))
    # return
    mainLog.debug('done')
    return self.SC_SUCCEEDED
def checkDatasetConsistency(self,location,datasetName):
    """Ask DQ2 to check the consistency of a dataset at a location.

    :param location: location passed to DQ2
    :param datasetName: dataset to check
    :return: ``(self.SC_SUCCEEDED, result)`` with the value returned by DQ2
        on success, otherwise ``(errCode, message)`` with ``errCode`` taken
        from ``self.checkError``
    """
    # make logger
    methodName = 'checkDatasetConsistency'
    methodName = '{0} datasetName={1} location={2}'.format(methodName,datasetName,location)
    tmpLog = MsgWrapper(logger,methodName)
    try:
        # get DQ2 API
        dq2 = DQ2()
        # check
        tmpRet = dq2.checkDatasetConsistency(location,datasetName)
        tmpLog.debug(str(tmpRet))
        # report success as a (status, value) pair like the failure path
        # does; previously the success path fell through and returned None
        return self.SC_SUCCEEDED,tmpRet
    except:
        errtype,errvalue = sys.exc_info()[:2]
        errMsg = 'failed with {0} {1}'.format(errtype.__name__,errvalue)
        tmpLog.error(errMsg)
        errCode = self.checkError(errtype)
        return errCode,'{0}.{1} {2}'.format(self.__class__.__name__,methodName,errMsg)
def start(self):
    """Main loop: rescue picked files, reactivate pending tasks, run plugins.

    In each cycle and for every (vo, prodSourceLabel) combination the
    method calls ``rescuePickedFiles_JEDI``, ``reactivatePendingTasks_JEDI``
    and, when an implementation exists, its ``doAction``. The cycle length
    is ``jedi_config.watchdog.loopCycle``. Any Exception raised in a cycle
    is logged and the loop goes on.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # rescue picked files
                    tmpLog.info('rescue tasks with picked files for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.rescuePickedFiles_JEDI(vo,prodSourceLabel,
                                                                      jedi_config.watchdog.waitForPicked)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to rescue')
                    else:
                        tmpLog.info('rescued {0} tasks'.format(tmpRet))
                    # reactivate pending tasks
                    tmpLog.info('reactivate pending tasks for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.reactivatePendingTasks_JEDI(vo,prodSourceLabel,
                                                                           jedi_config.watchdog.waitForPending,
                                                                           jedi_config.watchdog.timeoutForPending)
                    if tmpRet is None:
                        # failed
                        tmpLog.error('failed to reactivate')
                    else:
                        tmpLog.info('reactivated {0} tasks'.format(tmpRet))
                    # vo/prodSourceLabel specific action
                    impl = self.getImpl(vo,prodSourceLabel)
                    if impl is not None:
                        tmpLog.info('special action for vo={0} label={1} with {2}'.format(vo,prodSourceLabel,impl.__class__.__name__))
                        tmpStat = impl.doAction()
                        if tmpStat != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to run special acction for vo={0} label={1}'.format(vo,prodSourceLabel))
                        else:
                            tmpLog.info('done for vo={0} label={1}'.format(vo,prodSourceLabel))
            tmpLog.info('done')
        except Exception:
            # Exception instead of a bare except: SystemExit and
            # KeyboardInterrupt must still be able to stop the loop
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
        # sleep if needed
        loopCycle = jedi_config.watchdog.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # total_seconds() includes the days part of the elapsed time;
        # timedelta.seconds alone wraps around every 24 hours
        sleepPeriod = loopCycle - int(timeDelta.total_seconds())
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep()
def finger(self,userName):
    """Query infoClient().finger for a user.

    The given name is first passed through parse_dn (DN cleanup).

    Returns:
        (self.SC_SUCCEEDED, finger result) on success, or
        (code from self.checkError, error message) when anything raised.
    """
    methodName = 'finger userName={0}'.format(userName)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    try:
        # cleanup DN, then run the lookup
        cleanedName = parse_dn(userName)
        fingerResult = infoClient().finger(cleanedName)
    except:
        excType,excValue = sys.exc_info()[:2]
        errCode = self.checkError(excType)
        errMsg = '{0} {1}'.format(excType.__name__,excValue)
        tmpLog.error(errMsg)
        return errCode,'{0}:{1}'.format(methodName,errMsg)
    else:
        tmpLog.info('done')
        return self.SC_SUCCEEDED,fingerResult
def setDatasetOwner(self,datasetName,userName):
    """Set the 'owner' metadata attribute of a dataset through DQ2.

    The user name is passed through parse_dn (DN cleanup) before being stored.

    Returns:
        (self.SC_SUCCEEDED, True) on success, or
        (code from self.checkError, error message) when anything raised.
    """
    methodName = 'setDatasetOwner datasetName={0} userName={1}'.format(datasetName,userName)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        ownerName = parse_dn(userName)
        # get DQ2 API and set the attribute
        client = DQ2()
        client.setMetaDataAttribute(datasetName,'owner',ownerName)
    except:
        excType,excValue = sys.exc_info()[:2]
        errCode = self.checkError(excType)
        errMsg = '{0} {1}'.format(excType.__name__,excValue)
        tmpLog.error(errMsg)
        return errCode,'{0} : {1}'.format(methodName,errMsg)
    else:
        tmpLog.info('done')
        return self.SC_SUCCEEDED,True
def setDatasetMetadata(self,datasetName,metadataName,metadaValue):
    """Set one metadata attribute of a dataset through DQ2.

    An unknown dataset (DQUnknownDatasetException) is ignored and reported
    as success.

    Returns:
        (self.SC_SUCCEEDED, True) on success or unknown dataset, or
        (code from self.checkError, error message) on any other failure.
    """
    methodName = 'setDatasetMetadata datasetName={0} metadataName={1} metadaValue={2}'.format(datasetName,
                                                                                                metadataName,
                                                                                                metadaValue)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    try:
        # get DQ2 API and set the attribute
        client = DQ2()
        client.setMetaDataAttribute(datasetName,metadataName,metadaValue)
    except DQUnknownDatasetException:
        # unknown dataset is not treated as an error
        pass
    except:
        excType,excValue = sys.exc_info()[:2]
        errCode = self.checkError(excType)
        errMsg = '{0} {1}'.format(excType.__name__,excValue)
        tmpLog.error(errMsg)
        return errCode,'{0} : {1}'.format(methodName,errMsg)
    tmpLog.info('done')
    return self.SC_SUCCEEDED,True
def registerDatasetLocation(self,datasetName,location,lifetime=None,owner=None):
    """Register a dataset replica location in DQ2 and set the replica owner.

    Args:
        datasetName: dataset to register.
        location: location passed to DQ2.
        lifetime: forwarded to DQ2.registerDatasetLocation.
        owner: passed through parse_dn and stored as the replica 'owner'
            attribute. NOTE(review): parse_dn is also called when owner is
            None (the default) -- confirm it accepts None.

    Returns:
        (self.SC_SUCCEEDED, True) on success, or
        (code from self.checkError, error message) when anything raised.
    """
    methodName = 'registerDatasetLocation datasetName={0} location={1}'.format(datasetName,location)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    try:
        # cleanup DN
        ownerName = parse_dn(owner)
        # get DQ2 API
        client = DQ2()
        # register the location, then tag the replica with its owner
        client.registerDatasetLocation(datasetName,location,lifetime=lifetime)
        client.setReplicaMetaDataAttribute(datasetName,location,'owner',ownerName)
    except:
        excType,excValue = sys.exc_info()[:2]
        errCode = self.checkError(excType)
        errMsg = '{0} {1}'.format(excType.__name__,excValue)
        tmpLog.error(errMsg)
        return errCode,'{0} : {1}'.format(methodName,errMsg)
    else:
        tmpLog.info('done')
        return self.SC_SUCCEEDED,True
def doAction(self):
    """Run the watchdog steps in a fixed order.

    Steps: waiting jobs, pre-staging throttle, priority massage, redo of
    stalled analysis jobs. An exception in any step is logged and stops the
    remaining steps; the method returns self.SC_SUCCEEDED in all cases.
    """
    # names of the step methods, in execution order
    # (doForThrottleWAN, the WAN data access throttle, is disabled)
    stepNames = ('doForWaitingJobs',      # handle waiting jobs
                 'doForPreStaging',       # throttle tasks if so many prestaging requests
                 'doForPriorityMassage',  # priority massage
                 'doForRedoStalledJobs',  # redo stalled analysis jobs
                 )
    try:
        # get logger
        origTmpLog = MsgWrapper(logger)
        origTmpLog.debug('start')
        for stepName in stepNames:
            # resolve each method right before calling it
            getattr(self,stepName)()
    except Exception:
        excInfo = sys.exc_info()
        origTmpLog.error('failed with {0} {1}'.format(excInfo[0],excInfo[1]))
    # return
    origTmpLog.debug('done')
    return self.SC_SUCCEEDED
def doAction(self):
    """Run the production watchdog actions in sequence.

    Actions: priority boost, reassign, throttled, then high-priority pending
    for each (minPriority, timeoutVal) pair. Any exception is logged and ends
    the sequence; the method returns self.SC_SUCCEEDED in all cases.
    """
    # (minPriority, timeoutVal) pairs passed to doActionForHighPrioPending
    highPrioPendingParams = [(950,10),
                             (900,30),
                             ]
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassgin(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for prioAndTimeout in highPrioPendingParams:
            minPriority,timeoutVal = prioAndTimeout
            self.doActionForHighPrioPending(tmpLog,minPriority,timeoutVal)
    except:
        excInfo = sys.exc_info()
        tmpLog.error('failed with {0} {1}'.format(excInfo[0],excInfo[1]))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def doUpdateDataLocality(self):
    """Update data locality for the datasets from self.get_datasets_list().

    Takes the process lock through taskBufferIF.lockProcess_JEDI; when the
    lock is not obtained the update is skipped. Otherwise four
    DataLocalityUpdaterThread workers sharing one ThreadPool are started and
    joined. Exceptions are logged with a traceback and not propagated.
    Returns None.
    """
    tmpLog = MsgWrapper(logger, ' #ATM #KV doUpdateDataLocality')
    tmpLog.debug('start')
    try:
        # lock
        lock_args = dict(vo=self.vo,
                         prodSourceLabel='default',
                         cloud=None,
                         workqueue_id=None,
                         resource_name=None,
                         component='AtlasDataLocalityUpdaterWatchDog.doUpdateDataLocality',
                         pid=self.pid,
                         timeLimit=240)
        if not self.taskBufferIF.lockProcess_JEDI(**lock_args):
            tmpLog.debug('locked by another process. Skipped')
            return
        tmpLog.debug('got lock')
        # get list of datasets
        datasets_list = self.get_datasets_list()
        tmpLog.debug('got {0} datasets to update'.format(len(datasets_list)))
        # make thread pool
        thread_pool = ThreadPool()
        # make workers; all of them share the same dataset list and pool
        n_workers = 4
        worker_index = 0
        while worker_index < n_workers:
            worker = DataLocalityUpdaterThread(taskDsList=datasets_list,
                                               threadPool=thread_pool,
                                               taskbufferIF=self.taskBufferIF,
                                               ddmIF=self.ddmIF,
                                               pid=self.pid,
                                               loggerObj=tmpLog)
            worker.start()
            worker_index += 1
        tmpLog.debug('started {0} updater workers'.format(n_workers))
        # join
        thread_pool.join()
        # done
        tmpLog.debug('done')
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        tmpLog.error('failed with {0} {1} {2}'.format(exc_type, exc_value, traceback.format_exc()))
def start(self):
    """Run the post-processor main loop; this method never returns.

    Each cycle, for every (vo, prodSourceLabel) pair, it prepares tasks to be
    finished, fetches the tasks to be finished and processes them with
    jedi_config.postprocessor.nWorkers PostProcessorThread workers. Failures
    inside a cycle are logged and the loop continues. Cycles start at least
    60 seconds apart.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self,self.taskBufferIF,self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.info('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # prepare tasks to be finished
                    tmpLog.info('preparing tasks to be finished for vo={0} label={1}'.format(vo,prodSourceLabel))
                    tmpRet = self.taskBufferIF.prepareTasksToBeFinished_JEDI(vo,prodSourceLabel,
                                                                             jedi_config.postprocessor.nTasks,
                                                                             pid=self.pid)
                    if tmpRet is None:
                        # failed; the fetch below is still attempted
                        tmpLog.error('failed to prepare tasks')
                    # get tasks to be finished
                    tmpLog.info('getting tasks to be finished')
                    tmpList = self.taskBufferIF.getTasksToBeFinished_JEDI(vo,prodSourceLabel,self.pid,
                                                                          jedi_config.postprocessor.nTasks)
                    if tmpList is None:
                        # failed
                        tmpLog.error('failed to get tasks to be finished')
                    else:
                        tmpLog.info('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # make workers
                        nWorker = jedi_config.postprocessor.nWorkers
                        for iWorker in range(nWorker):
                            thr = PostProcessorThread(taskList,threadPool,
                                                      self.taskBufferIF,
                                                      self.ddmIF,
                                                      self)
                            thr.start()
                        # join
                        threadPool.join()
            tmpLog.info('done')
        except Exception:
            # Exception (not a bare except) so that SystemExit/KeyboardInterrupt
            # can still terminate this endless loop
            errtype,errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(self.__class__.__name__,errtype.__name__,errvalue))
        # sleep if needed
        loopCycle = 60
        timeDelta = datetime.datetime.utcnow() - startTime
        # timedelta.seconds alone drops whole days, so include them explicitly
        elapsedSeconds = timeDelta.days * 86400 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsedSeconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
def doAction(self):
    """Run the production watchdog actions in sequence.

    Actions: priority boost, reassign, throttled, high-priority pending for
    each (minPriority, timeoutVal) pair, scout job data, throttling of jobs
    in paused tasks, and finally the JumboWatchDog. Any exception is logged
    with a traceback and ends the sequence; the method returns
    self.SC_SUCCEEDED in all cases.
    """
    # (minPriority, timeoutVal) pairs passed to doActionForHighPrioPending
    high_prio_pending_params = [(950, 10),
                                (900, 30),
                                ]
    try:
        # get logger
        tmpLog = MsgWrapper(logger)
        tmpLog.debug('start')
        # action for priority boost
        self.doActionForPriorityBoost(tmpLog)
        # action for reassign
        self.doActionForReassign(tmpLog)
        # action for throttled
        self.doActionForThrottled(tmpLog)
        # action for high prio pending
        for prio_and_timeout in high_prio_pending_params:
            minPriority, timeoutVal = prio_and_timeout
            self.doActionForHighPrioPending(tmpLog, minPriority, timeoutVal)
        # action to set scout job data w/o scouts
        self.doActionToSetScoutJobData(tmpLog)
        # action to throttle jobs in paused tasks
        self.doActionToThrottleJobInPausedTasks(tmpLog)
        # action for jumbo
        JumboWatchDog(self.taskBufferIF, self.ddmIF, tmpLog, 'atlas', 'managed').run()
    except Exception:
        exc_type, exc_value = sys.exc_info()[:2]
        tmpLog.error('failed with {0}:{1} {2}'.format(exc_type.__name__, exc_value, traceback.format_exc()))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def deleteDataset(self,datasetName,emptyOnly,ignoreUnknown=False):
    """Erase a dataset through DQ2, optionally only when it has no files.

    Args:
        datasetName: dataset to erase.
        emptyOnly: when true, the dataset is erased only if
            DQ2.getNumberOfFiles reports 0 files; otherwise it is kept.
        ignoreUnknown: when true, DQUnknownDatasetException is treated as
            success (the returned message is then whatever was set before the
            exception, possibly an empty string).

    Returns:
        (self.SC_SUCCEEDED, message) on success, or
        (code from self.checkError, error message) on failure.
    """
    methodName = 'deleteDataset'
    methodName = '{0} datasetName={1}'.format(methodName,datasetName)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    retStr = ''
    nFiles = -1
    # (type, value) of the exception to report, or None when the call is OK.
    # It is captured inside the handlers: once an except block has exited,
    # sys.exc_info() no longer returns the handled exception on Python 3.
    errInfo = None
    try:
        # get DQ2 API
        dq2 = DQ2()
        # get the number of files
        if emptyOnly:
            nFiles = dq2.getNumberOfFiles(datasetName)
        # erase
        if not emptyOnly or nFiles == 0:
            dq2.eraseDataset(datasetName)
            retStr = 'deleted {0}'.format(datasetName)
        else:
            retStr = 'keep {0} where {1} files are available'.format(datasetName,nFiles)
    except DQUnknownDatasetException:
        if not ignoreUnknown:
            errInfo = sys.exc_info()[:2]
    except:
        # any other failure is converted to an error code through checkError
        errInfo = sys.exc_info()[:2]
    if errInfo is None:
        tmpLog.info('done')
        return self.SC_SUCCEEDED,retStr
    errtype,errvalue = errInfo
    errCode = self.checkError(errtype)
    errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
    tmpLog.error(errMsg)
    return errCode,'{0} : {1}'.format(methodName,errMsg)
def start(self):
    """Run the task-refiner main loop; this method never returns.

    Each cycle, for every (vo, prodSourceLabel) pair, it fetches the tasks to
    refine and processes them with jedi_config.taskrefine.nWorkers
    TaskRefinerThread workers. Failures inside a cycle are logged with a
    traceback and the loop continues with the next cycle.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)
    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start')
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # get the list of tasks to refine
                    tmpList = self.taskBufferIF.getTasksToRefine_JEDI(
                        vo, prodSourceLabel)
                    if tmpList is None:
                        # failed
                        tmpLog.error(
                            'failed to get the list of tasks to refine')
                    else:
                        tmpLog.debug('got {0} tasks'.format(len(tmpList)))
                        # put to a locked list
                        taskList = ListWithLock(tmpList)
                        # make thread pool
                        threadPool = ThreadPool()
                        # get work queue mapper
                        workQueueMapper = self.taskBufferIF.getWorkQueueMap()
                        # make workers
                        nWorker = jedi_config.taskrefine.nWorkers
                        for iWorker in range(nWorker):
                            thr = TaskRefinerThread(
                                taskList, threadPool, self.taskBufferIF,
                                self.ddmIF, self, workQueueMapper)
                            thr.start()
                        # join
                        threadPool.join()
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
            tmpLog.error('Traceback: {0}'.format(traceback.format_exc()))
        # sleep if needed
        loopCycle = jedi_config.taskrefine.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # timedelta.seconds alone drops whole days, so include them explicitly
        elapsedSeconds = timeDelta.days * 86400 + timeDelta.seconds
        sleepPeriod = loopCycle - elapsedSeconds
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep(max_val=loopCycle)
def doCheck(self,taskSpecList):
    """Look up the cloud assigned in Panda for each task.

    Tasks are matched to Panda through their reqID. Tasks with an undefined
    reqID, or with a reqID already used by an earlier task in the list, are
    logged as errors and left out of the query.

    Args:
        taskSpecList: task specs; reqID and jediTaskID are read from each.

    Returns:
        (self.SC_SUCCEEDED, {jediTaskID: cloud}) containing only the tasks for
        which Panda returned a cloud other than 'NULL', '' or None, or
        (self.SC_FAILED, {}) when PandaClient.seeCloudTask reports a non-zero
        status.
    """
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retTmpError = self.SC_FAILED,{}
    # get list of reqIDs which are mapped to taskID in Panda
    reqIdTaskIdMap = {}
    for taskSpec in taskSpecList:
        if taskSpec.reqID is not None:
            if taskSpec.reqID in reqIdTaskIdMap:
                tmpLog.error('reqID={0} is dubplicated in jediTaskID={1},{2}'.format(taskSpec.reqID,
                                                                                     taskSpec.jediTaskID,
                                                                                     reqIdTaskIdMap[taskSpec.reqID]))
            else:
                reqIdTaskIdMap[taskSpec.reqID] = taskSpec.jediTaskID
                tmpLog.debug('jediTaskID={0} has reqID={1}'.format(taskSpec.jediTaskID,taskSpec.reqID))
        else:
            tmpLog.error('jediTaskID={0} has undefined reqID'.format(taskSpec.jediTaskID))
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(list(reqIdTaskIdMap))
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpReqID,tmpCloud in cloudsInPanda.items():
        if tmpCloud not in ['NULL','',None]:
            tmpLog.debug('reqID={0} jediTaskID={1} -> {2}'.format(tmpReqID,reqIdTaskIdMap[tmpReqID],tmpCloud))
            # NOTE: a file availability check through self.findMissingFiles
            # used to sit here but was disabled
            retMap[reqIdTaskIdMap[tmpReqID]] = tmpCloud
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,retMap
def expandContainer(self,containerName):
    """Expand a container (or name pattern) into a sorted list of dataset names.

    self.listDatasets resolves the given name to real names. Every real name
    ending with '/' is treated as a container and expanded through
    self.listDatasetsInContainer; any other name is taken as a dataset itself.

    Returns:
        (self.SC_SUCCEEDED, sorted list of unique dataset names) on success,
        the (status, output) pair of the failing helper when listDatasets or
        listDatasetsInContainer does not succeed, or
        (code from self.checkError, error message) when anything raised.
    """
    methodName = 'expandContainer'
    methodName = '{0} contName={1}'.format(methodName,containerName)
    tmpLog = MsgWrapper(logger,methodName)
    tmpLog.info('start')
    try:
        # a set keeps the de-duplication linear instead of scanning a list
        # for every name
        dsSet = set()
        # get real names
        tmpS,tmpRealNameList = self.listDatasets(containerName)
        if tmpS != self.SC_SUCCEEDED:
            tmpLog.error('failed to get real names')
            return tmpS,tmpRealNameList
        # loop over all names
        for tmpRealName in tmpRealNameList:
            # container
            if tmpRealName.endswith('/'):
                # get contents
                tmpS,tmpO = self.listDatasetsInContainer(tmpRealName)
                if tmpS != self.SC_SUCCEEDED:
                    tmpLog.error('failed to get datasets in {0}'.format(tmpRealName))
                    return tmpS,tmpO
            else:
                tmpO = [tmpRealName]
            # collect dataset names
            dsSet.update(tmpO)
        dsList = sorted(dsSet)
        # return
        tmpLog.info('got {0}'.format(str(dsList)))
        return self.SC_SUCCEEDED,dsList
    except:
        # any failure is converted to an error code through checkError
        errtype,errvalue = sys.exc_info()[:2]
        errCode = self.checkError(errtype)
        errMsg = '{0} {1}'.format(errtype.__name__,errvalue)
        tmpLog.error('failed with {0}'.format(errMsg))
        return errCode,'{0} : {1}'.format(methodName,errMsg)
def runImpl(self):
    """Worker loop that assigns a nucleus to each task taken from self.inputList.

    For every (taskSpec, inputChunk) pair the nucleus is either the one
    already set on the task (when it is a known nucleus) or chosen by
    filtering siteMapper.nuclei on status, endpoint availability/space, input
    data locality and ability to run jobs, followed by a weighted random
    choice. The selected nucleus is stored with setCloudToTasks_JEDI and the
    task RW is added to self.prioRW for all priorities not higher than the
    task's priority.

    The method returns (None) when self.inputList.get(1) yields an empty
    list. An exception is logged together with the last jediTaskID and the
    loop then fetches the next batch.
    """
    # cutoff for disk (original comment says TB; value is 5 * 1024 -- TODO confirm units)
    diskThreshold = 5 * 1024
    # dataset type to ignore file availability check
    datasetTypeToSkipCheck = ['log']
    thrInputSize = 1024*1024*1024
    thrInputNum = 100
    thrInputSizeFrac = 0.1
    thrInputNumFrac = 0.1
    cutOffRW = 50
    negWeightTape = 0.001
    # main
    lastJediTaskID = None
    siteMapper = self.taskBufferIF.getSiteMapper()
    while True:
        try:
            taskInputList = self.inputList.get(1)
            # no more datasets
            if len(taskInputList) == 0:
                self.logger.debug('{0} terminating after processing {1} tasks since no more inputs '.format(self.__class__.__name__,
                                                                                                            self.numTasks))
                return
            # loop over all tasks
            for taskSpec,inputChunk in taskInputList:
                lastJediTaskID = taskSpec.jediTaskID
                # make logger
                tmpLog = MsgWrapper(self.logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID),monToken='{0}'.format(taskSpec.jediTaskID))
                tmpLog.debug('start')
                # get nuclei
                nucleusList = siteMapper.nuclei
                if taskSpec.nucleus in nucleusList:
                    candidateNucleus = taskSpec.nucleus
                    # the RW table update below needs taskRW on this path too;
                    # it used to be set only in the brokerage branch, leaving it
                    # unbound (or stale from the previous task) here
                    taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                else:
                    tmpLog.debug('got {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # check status
                    newNucleusList = {}
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleusSpec.state in ['ACTIVE']:
                            tmpLog.debug(' skip nucleus={0} due to status={1} criteria=-status'.format(tmpNucleus,
                                                                                                       tmpNucleusSpec.state))
                        else:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed status check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # check endpoint
                    newNucleusList = {}
                    # NOTE(review): tmpStat is not checked here
                    tmpStat,tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                                  ['output','log'])
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        toSkip = False
                        for tmpDatasetSpec in tmpDatasetSpecList:
                            # ignore distributed datasets
                            if DataServiceUtils.getDistributedDestination(tmpDatasetSpec.storageToken) is not None:
                                continue
                            # get endpoint with the pattern
                            tmpEP = tmpNucleusSpec.getAssoicatedEndpoint(tmpDatasetSpec.storageToken)
                            if tmpEP is None:
                                tmpLog.debug(' skip nucleus={0} since no endpoint with {1} criteria=-match'.format(tmpNucleus,
                                                                                                                   tmpDatasetSpec.storageToken))
                                toSkip = True
                                break
                            # (a check of the endpoint state used to sit here but was disabled)
                            # check space
                            tmpSpaceSize = tmpEP['space_free'] + tmpEP['space_expired']
                            if tmpSpaceSize < diskThreshold:
                                # NOTE(review): the message says "at endpoint {3}" but
                                # receives tmpEP['state'] -- confirm whether the
                                # endpoint name was intended
                                tmpLog.debug(' skip nucleus={0} since disk shortage ({1}<{2}) at endpoint {3} criteria=-space'.format(tmpNucleus,
                                                                                                                                      tmpSpaceSize,
                                                                                                                                      diskThreshold,
                                                                                                                                      tmpEP['state']))
                                toSkip = True
                                break
                        if not toSkip:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed endpoint check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # data locality
                    toSkip = False
                    availableData = {}
                    for datasetSpec in inputChunk.getDatasets():
                        # only for real datasets
                        if datasetSpec.isPseudo():
                            continue
                        # ignore DBR
                        if DataServiceUtils.isDBR(datasetSpec.datasetName):
                            continue
                        # skip locality check
                        if DataServiceUtils.getDatasetType(datasetSpec.datasetName) in datasetTypeToSkipCheck:
                            continue
                        # get nuclei where data is available
                        tmpSt,tmpRet = AtlasBrokerUtils.getNucleiWithData(siteMapper,self.ddmIF,
                                                                          datasetSpec.datasetName,
                                                                          nucleusList.keys())
                        if tmpSt != Interaction.SC_SUCCEEDED:
                            tmpLog.error('failed to get nuclei where data is available, since {0}'.format(tmpRet))
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            toSkip = True
                            break
                        # sum the per-nucleus values over all input datasets
                        for tmpNucleus,tmpVals in tmpRet.iteritems():
                            if not tmpNucleus in availableData:
                                availableData[tmpNucleus] = tmpVals
                            else:
                                availableData[tmpNucleus] = dict((k,v+tmpVals[k]) for (k,v) in availableData[tmpNucleus].iteritems())
                    if toSkip:
                        continue
                    if availableData != {}:
                        newNucleusList = {}
                        # skip if no data
                        for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                            if availableData[tmpNucleus]['tot_size'] > thrInputSize and \
                                    availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] * thrInputSizeFrac:
                                tmpLog.debug(' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format(tmpNucleus,
                                                                                                                                       availableData[tmpNucleus]['ava_size_any'],
                                                                                                                                       availableData[tmpNucleus]['tot_size'],
                                                                                                                                       thrInputSizeFrac))
                            elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \
                                    availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac:
                                tmpLog.debug(' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format(tmpNucleus,
                                                                                                                                         availableData[tmpNucleus]['ava_num_any'],
                                                                                                                                         availableData[tmpNucleus]['tot_num'],
                                                                                                                                         thrInputNumFrac))
                            else:
                                newNucleusList[tmpNucleus] = tmpNucleusSpec
                        nucleusList = newNucleusList
                        tmpLog.debug('{0} candidates passed data check'.format(len(nucleusList)))
                        if nucleusList == {}:
                            tmpLog.error('no candidates')
                            taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                            self.sendLogMessage(tmpLog)
                            continue
                    ######################################
                    # ability to execute jobs
                    newNucleusList = {}
                    # get all panda sites
                    tmpSiteList = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        tmpSiteList += tmpNucleusSpec.allPandaSites
                    tmpSiteList = list(set(tmpSiteList))
                    tmpLog.debug('===== start for job check')
                    jobBroker = AtlasProdJobBroker(self.ddmIF,self.taskBufferIF)
                    tmpSt,tmpRet = jobBroker.doBrokerage(taskSpec,taskSpec.cloud,inputChunk,None,True,
                                                         tmpSiteList,tmpLog)
                    tmpLog.debug('===== done for job check')
                    if tmpSt != Interaction.SC_SUCCEEDED:
                        tmpLog.debug('failed to get sites where jobs can run. Use any nuclei where input is available')
                        # use any nuclei where input is available if no sites can run jobs
                        tmpRet = tmpSiteList
                    okNuclei = set()
                    for tmpSite in tmpRet:
                        siteSpec = siteMapper.getSite(tmpSite)
                        okNuclei.add(siteSpec.pandasite)
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if tmpNucleus in okNuclei:
                            newNucleusList[tmpNucleus] = tmpNucleusSpec
                        else:
                            tmpLog.debug(' skip nucleus={0} due to missing ability to run jobs criteria=-job'.format(tmpNucleus))
                    nucleusList = newNucleusList
                    tmpLog.debug('{0} candidates passed job check'.format(len(nucleusList)))
                    if nucleusList == {}:
                        tmpLog.error('no candidates')
                        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
                        self.sendLogMessage(tmpLog)
                        continue
                    ######################################
                    # RW
                    taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI(taskSpec.jediTaskID)
                    ######################################
                    # weight
                    self.prioRW.acquire()
                    try:
                        nucleusRW = self.prioRW[taskSpec.currentPriority]
                    finally:
                        # release even when the priority lookup raises
                        self.prioRW.release()
                    totalWeight = 0
                    nucleusweights = []
                    for tmpNucleus,tmpNucleusSpec in nucleusList.iteritems():
                        if not tmpNucleus in nucleusRW:
                            nucleusRW[tmpNucleus] = 0
                        wStr = '1'
                        # with RW
                        if tmpNucleus in nucleusRW and nucleusRW[tmpNucleus] >= cutOffRW:
                            weight = 1 / float(nucleusRW[tmpNucleus])
                            wStr += '/({0}=RW)'.format(nucleusRW[tmpNucleus])
                        else:
                            weight = 1
                            wStr += '/(1 : RW={0}<{1})'.format(nucleusRW[tmpNucleus],cutOffRW)
                        # with data
                        if availableData != {}:
                            weight *= float(availableData[tmpNucleus]['ava_size_any'])
                            weight /= float(availableData[tmpNucleus]['tot_size'])
                            wStr += '*({0}=available input size on DISK/TAPE)'.format(availableData[tmpNucleus]['ava_size_any'])
                            wStr += '/({0}=total input size)'.format(availableData[tmpNucleus]['tot_size'])
                            # negative weight for tape
                            if availableData[tmpNucleus]['ava_size_any'] > availableData[tmpNucleus]['ava_size_disk']:
                                weight *= negWeightTape
                                wStr += '*({0}=weight for TAPE)'.format(negWeightTape)
                        tmpLog.debug(' use nucleus={0} weight={1} {2} criteria=+use'.format(tmpNucleus,weight,wStr))
                        totalWeight += weight
                        nucleusweights.append((tmpNucleus,weight))
                    tmpLog.debug('final {0} candidates'.format(len(nucleusList)))
                    ######################################
                    # final selection: weighted random choice
                    tgtWeight = random.uniform(0,totalWeight)
                    candidateNucleus = None
                    for tmpNucleus,weight in nucleusweights:
                        tgtWeight -= weight
                        if tgtWeight <= 0:
                            candidateNucleus = tmpNucleus
                            break
                    if candidateNucleus is None:
                        # nothing selected in the loop; fall back to the last candidate
                        candidateNucleus = nucleusweights[-1][0]
                ######################################
                # update
                nucleusSpec = nucleusList[candidateNucleus]
                # get output/log datasets
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                           ['output','log'])
                # get destinations
                retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,tmpDatasetSpecs)}
                tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap)
                tmpLog.info(' set nucleus={0} with {1} criteria=+set'.format(candidateNucleus,tmpRet))
                # update RW table
                self.prioRW.acquire()
                try:
                    for prio,rwMap in self.prioRW.iteritems():
                        if prio > taskSpec.currentPriority:
                            continue
                        if candidateNucleus in rwMap:
                            rwMap[candidateNucleus] += taskRW
                        else:
                            rwMap[candidateNucleus] = taskRW
                finally:
                    # release even when the update raises, so that other
                    # users of prioRW are not blocked
                    self.prioRW.release()
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            errMsg = '{0}.runImpl() failed with {1} {2} '.format(self.__class__.__name__,errtype.__name__,errvalue)
            errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID)
            errMsg += traceback.format_exc()
            logger.error(errMsg)
def findMissingFiles(self,jediTaskID,cloudName):
    """Flag input files of a task that are not available in the given cloud.

    For every master input dataset of the task, the files available at the
    cloud's source site and its 't2' sites are collected, and every file of
    the dataset whose LFN is not among them is recorded through
    taskBufferIF.setMissingFiles_JEDI.

    Args:
        jediTaskID: task whose input datasets are checked.
        cloudName: cloud in which the data must be available.

    Returns:
        self.SC_SUCCEEDED when all master datasets were processed, or
        self.SC_FAILED as soon as any lookup or update fails, including when
        a dataset is not available in the cloud at all.
    """
    tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(jediTaskID))
    tmpLog.debug('start findMissingFiles')
    # return for failure
    retError = self.SC_FAILED
    # get datasets
    tmpSt,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(jediTaskID,['input'],True)
    if not tmpSt:
        tmpLog.error('failed to get the list of datasets')
        return retError
    # loop over all datasets
    for datasetSpec in datasetSpecList:
        # check only master dataset
        if not datasetSpec.isMaster():
            continue
        tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
        # get ddmIF
        ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
        if ddmIF is None:
            tmpLog.error('failed to get DDM I/F for vo={0}'.format(datasetSpec.vo))
            return retError
        # get the list of sites where data is available
        tmpSt,tmpRet = AtlasBrokerUtils.getSitesWithData(self.siteMapper,ddmIF,
                                                         datasetSpec.datasetName)
        if tmpSt != self.SC_SUCCEEDED:
            tmpLog.error('failed to get the list of sites where {0} is available, since {1}'.format(datasetSpec.datasetName,
                                                                                                    tmpRet))
            return retError
        dataSiteMap = tmpRet
        # data is unavailable in cloud
        if cloudName not in dataSiteMap:
            tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(datasetSpec.datasetName,cloudName,str(dataSiteMap)))
            return retError
        # mapping between sites and storage endpoints
        checkedSites = [self.siteMapper.getCloud(cloudName)['source']]+dataSiteMap[cloudName]['t2']
        siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(checkedSites,self.siteMapper)
        # get available files per site/endpoint
        tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                               siteStorageEP,
                                               self.siteMapper,
                                               ngGroup=[1],
                                               checkLFC=True)
        if tmpAvFileMap is None:
            tmpLog.error('failed to get available file list for {0}'.format(datasetSpec.datasetName))
            return retError
        # collect the LFNs available at any site/storage type once, so that
        # each file needs a single set lookup instead of a nested scan
        availableLFNs = set()
        for availableFilesMap in tmpAvFileMap.values():
            for availableFiles in availableFilesMap.values():
                for availableFile in availableFiles:
                    availableLFNs.add(availableFile.lfn)
        # check availability
        missingFiles = []
        for fileSpec in datasetSpec.Files:
            # missing
            if fileSpec.lfn not in availableLFNs:
                missingFiles.append(fileSpec.fileID)
                tmpLog.debug('{0} missing'.format(fileSpec.lfn))
        # update contents
        if missingFiles != []:
            tmpSt = self.taskBufferIF.setMissingFiles_JEDI(jediTaskID,datasetSpec.datasetID,missingFiles)
            if not tmpSt:
                tmpLog.error('failed to set missing files in {0}'.format(datasetSpec.datasetName))
                return retError
    tmpLog.debug('done findMissingFiles')
    return self.SC_SUCCEEDED
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap): # make logger tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='<jediTaskID={0} {1}>'.format( taskSpec.jediTaskID, datetime.datetime.utcnow().isoformat('/'))) tmpLog.debug('start') # return for failure retFatal = self.SC_FATAL, inputChunk retTmpError = self.SC_FAILED, inputChunk # get primary site candidates sitePreAssigned = False excludeList = [] includeList = None scanSiteList = [] # get list of site access siteAccessList = self.taskBufferIF.listSiteAccess( None, taskSpec.userName) siteAccessMap = {} for tmpSiteName, tmpAccess in siteAccessList: siteAccessMap[tmpSiteName] = tmpAccess # site limitation if taskSpec.useLimitedSites(): if 'excludedSite' in taskParamMap: excludeList = taskParamMap['excludedSite'] # str to list for task retry try: if type(excludeList) != types.ListType: excludeList = excludeList.split(',') except: pass if 'includedSite' in taskParamMap: includeList = taskParamMap['includedSite'] # str to list for task retry if includeList == '': includeList = None try: if type(includeList) != types.ListType: includeList = includeList.split(',') except: pass # loop over all sites for siteName, tmpSiteSpec in self.siteMapper.siteSpecList.iteritems(): if tmpSiteSpec.type == 'analysis': scanSiteList.append(siteName) # preassigned if not taskSpec.site in ['', None]: # site is pre-assigned tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site)) sitePreAssigned = True if not taskSpec.site in scanSiteList: scanSiteList.append(taskSpec.site) tmpLog.debug('initial {0} candidates'.format(len(scanSiteList))) # allowed remote access protocol allowedRemoteProtocol = 'fax' # MP if taskSpec.coreCount != None and taskSpec.coreCount > 1: # use MCORE only useMP = 'only' elif taskSpec.coreCount == 0: # use MCORE and normal useMP = 'any' else: # not use MCORE useMP = 'unuse' ###################################### # selection for status newScanSiteList = [] for 
tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check site status skipFlag = False if tmpSiteSpec.status in ['offline']: skipFlag = True elif tmpSiteSpec.status in ['brokeroff', 'test']: if not sitePreAssigned: skipFlag = True elif tmpSiteName != taskSpec.site: skipFlag = True if not skipFlag: newScanSiteList.append(tmpSiteName) else: tmpLog.debug( ' skip site=%s due to status=%s criteria=-status' % (tmpSiteName, tmpSiteSpec.status)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed site status check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for MP if not sitePreAssigned: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if useMP == 'any' or (useMP == 'only' and tmpSiteSpec.coreCount > 1) or \ (useMP =='unuse' and tmpSiteSpec.coreCount in [0,1,None]): newScanSiteList.append(tmpSiteName) else: tmpLog.debug(' skip site=%s due to core mismatch cores_site=%s <> cores_task=%s criteria=-cpucore' % \ (tmpSiteName,tmpSiteSpec.coreCount,taskSpec.coreCount)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for useMP={1}'.format( len(scanSiteList), useMP)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for release if taskSpec.transHome != None: if taskSpec.transHome.startswith('ROOT'): # hack until x86_64-slc6-gcc47-opt is published in installedsw if taskSpec.architecture == 'x86_64-slc6-gcc47-opt': tmpCmtConfig = 'x86_64-slc6-gcc46-opt' else: tmpCmtConfig = taskSpec.architecture siteListWithSW = self.taskBufferIF.checkSitesWithRelease( 
scanSiteList, cmtConfig=tmpCmtConfig, onlyCmtConfig=True) elif 'AthAnalysis' in taskSpec.transHome or re.search( 'Ath[a-zA-Z]+Base', taskSpec.transHome) != None: # AthAnalysis siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, cmtConfig=taskSpec.architecture, onlyCmtConfig=True) else: # remove AnalysisTransforms- transHome = re.sub('^[^-]+-*', '', taskSpec.transHome) transHome = re.sub('_', '-', transHome) if re.search('rel_\d+(\n|$)',taskSpec.transHome) == None and taskSpec.transHome != 'AnalysisTransforms' and \ re.search('\d{4}-\d{2}-\d{2}T\d{4}$',taskSpec.transHome) == None : # cache is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, caches=transHome, cmtConfig=taskSpec.architecture) elif transHome == '' and taskSpec.transUses != None: # remove Atlas- transUses = taskSpec.transUses.split('-')[-1] # release is checked siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, releases=transUses, cmtConfig=taskSpec.architecture) else: # nightlies siteListWithSW = self.taskBufferIF.checkSitesWithRelease( scanSiteList, releases='CVMFS') newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # release check is disabled or release is available if tmpSiteSpec.releases == ['ANY']: newScanSiteList.append(tmpSiteName) elif tmpSiteName in siteListWithSW: newScanSiteList.append(tmpSiteName) else: # release is unavailable tmpLog.debug(' skip site=%s due to missing rel/cache %s:%s:%s criteria=-cache' % \ (tmpSiteName,taskSpec.transUses,taskSpec.transHome,taskSpec.architecture)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed for SW {1}:{2}:{3}'.format( len(scanSiteList), taskSpec.transUses, taskSpec.transHome, taskSpec.architecture)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError 
###################################### # selection for memory minRamCount = inputChunk.getMaxRamCount() minRamCount = JediCoreUtils.compensateRamCount(minRamCount) if not minRamCount in [0, None]: newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # site max memory requirement if not tmpSiteSpec.maxrss in [0, None]: site_maxmemory = tmpSiteSpec.maxrss else: site_maxmemory = tmpSiteSpec.maxmemory if not site_maxmemory in [ 0, None ] and minRamCount != 0 and minRamCount > site_maxmemory: tmpLog.debug( ' skip site={0} due to site RAM shortage. site_maxmemory={1} < job_minramcount={2} criteria=-lowmemory' .format(tmpSiteName, site_maxmemory, minRamCount)) continue # site min memory requirement if not tmpSiteSpec.minrss in [0, None]: site_minmemory = tmpSiteSpec.minrss else: site_minmemory = tmpSiteSpec.minmemory if not site_minmemory in [ 0, None ] and minRamCount != 0 and minRamCount < site_minmemory: tmpLog.debug( ' skip site={0} due to job RAM shortage. 
site_minmemory={1} > job_minramcount={2} criteria=-highmemory' .format(tmpSiteName, site_minmemory, minRamCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format( len(scanSiteList), minRamCount, taskSpec.ramUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for scratch disk tmpMaxAtomSize = inputChunk.getMaxAtomSize() tmpEffAtomSize = inputChunk.getMaxAtomSize(effectiveSize=True) tmpOutDiskSize = taskSpec.getOutDiskSize() tmpWorkDiskSize = taskSpec.getWorkDiskSize() minDiskCountS = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize + tmpMaxAtomSize minDiskCountS = minDiskCountS / 1024 / 1024 # size for direct IO sites if taskSpec.useLocalIO(): minDiskCountR = minDiskCountS else: minDiskCountR = tmpOutDiskSize * tmpEffAtomSize + tmpWorkDiskSize minDiskCountR = minDiskCountR / 1024 / 1024 tmpLog.debug( 'maxAtomSize={0} effectiveAtomSize={1} outDiskCount={2} workDiskSize={3}' .format(tmpMaxAtomSize, tmpEffAtomSize, tmpOutDiskSize, tmpWorkDiskSize)) tmpLog.debug('minDiskCountScratch={0} minDiskCountRemote={1}'.format( minDiskCountS, minDiskCountR)) newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxwdir != 0: if tmpSiteSpec.isDirectIO(): minDiskCount = minDiskCountR else: minDiskCount = minDiskCountS if minDiskCount > tmpSiteSpec.maxwdir: tmpLog.debug( ' skip site={0} due to small scratch disk={1} < {2} criteria=-disk' .format(tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed scratch disk check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') 
taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for available space in SE newScanSiteList = [] for tmpSiteName in scanSiteList: # check endpoint tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) tmpEndPoint = tmpSiteSpec.ddm_endpoints.getEndPoint( tmpSiteSpec.ddm) if tmpEndPoint is not None: # free space must be >= 200GB diskThreshold = 200 tmpSpaceSize = 0 if tmpEndPoint['space_expired'] is not None: tmpSpaceSize += tmpEndPoint['space_expired'] if tmpEndPoint['space_free'] is not None: tmpSpaceSize += tmpEndPoint['space_free'] if tmpSpaceSize < diskThreshold: tmpLog.debug( ' skip site={0} due to disk shortage in SE {1} < {2}GB criteria=-disk' .format(tmpSiteName, tmpSpaceSize, diskThreshold)) continue # check if blacklisted if tmpEndPoint['blacklisted'] == 'Y': tmpLog.debug( ' skip site={0} since {1} is blacklisted in DDM criteria=-blacklist' .format(tmpSiteName, tmpSiteSpec.ddm)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed SE space check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for walltime minWalltime = taskSpec.walltime if not minWalltime in [0, None] and minWalltime > 0: minWalltime *= tmpEffAtomSize newScanSiteList = [] for tmpSiteName in scanSiteList: tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # check at the site if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime: tmpLog.debug( ' skip site={0} due to short site walltime={1}(site upper limit) < {2} criteria=-shortwalltime' .format(tmpSiteName, tmpSiteSpec.maxtime, minWalltime)) continue if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime: tmpLog.debug( ' skip 
site={0} due to short job walltime={1}(site lower limit) > {2} criteria=-longwalltime' .format(tmpSiteName, tmpSiteSpec.mintime, minWalltime)) continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format( len(scanSiteList), minWalltime, taskSpec.walltimeUnit)) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for nPilot nWNmap = self.taskBufferIF.getCurrentSiteData() newScanSiteList = [] for tmpSiteName in scanSiteList: # check at the site nPilot = 0 if nWNmap.has_key(tmpSiteName): nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName][ 'updateJob'] if nPilot == 0 and not taskSpec.prodSourceLabel in ['test']: tmpLog.debug( ' skip site=%s due to no pilot criteria=-nopilot' % tmpSiteName) if not self.testMode: continue newScanSiteList.append(tmpSiteName) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed pilot activity check'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # check inclusion and exclusion newScanSiteList = [] sitesForANY = [] for tmpSiteName in scanSiteList: autoSite = False # check exclusion if AtlasBrokerUtils.isMatched(tmpSiteName, excludeList): tmpLog.debug( ' skip site={0} excluded criteria=-excluded'.format( tmpSiteName)) continue # check inclusion if includeList != None and not AtlasBrokerUtils.isMatched( tmpSiteName, includeList): if 'AUTO' in includeList: autoSite = True else: tmpLog.debug( ' skip site={0} not included criteria=-notincluded'. 
format(tmpSiteName)) continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # limited access if tmpSiteSpec.accesscontrol == 'grouplist': if not siteAccessMap.has_key(tmpSiteSpec.sitename) or \ siteAccessMap[tmpSiteSpec.sitename] != 'approved': tmpLog.debug( ' skip site={0} limited access criteria=-limitedaccess' .format(tmpSiteName)) continue # check cloud if not taskSpec.cloud in [None, '', 'any', tmpSiteSpec.cloud]: tmpLog.debug( ' skip site={0} cloud mismatch criteria=-cloudmismatch'. format(tmpSiteName)) continue if autoSite: sitesForANY.append(tmpSiteName) else: newScanSiteList.append(tmpSiteName) # use AUTO sites if no sites are included if newScanSiteList == []: newScanSiteList = sitesForANY else: for tmpSiteName in sitesForANY: tmpLog.debug( ' skip site={0} not included criteria=-notincluded'. format(tmpSiteName)) scanSiteList = newScanSiteList tmpLog.debug('{0} candidates passed inclusion/exclusion/cloud'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # selection for data availability hasDDS = False dataWeight = {} remoteSourceList = {} if inputChunk.getDatasets() != []: oldScanSiteList = copy.copy(scanSiteList) for datasetSpec in inputChunk.getDatasets(): datasetName = datasetSpec.datasetName if not self.dataSiteMap.has_key(datasetName): # get the list of sites where data is available tmpLog.debug( 'getting the list of sites where {0} is available'. 
format(datasetName)) tmpSt, tmpRet = AtlasBrokerUtils.getAnalSitesWithData( scanSiteList, self.siteMapper, self.ddmIF, datasetName) if tmpSt in [ Interaction.JEDITemporaryError, Interaction.JEDITimeoutError ]: tmpLog.error( 'temporary failed to get the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError if tmpSt == Interaction.JEDIFatalError: tmpLog.error( 'fatal error when getting the list of sites where data is available, since %s' % tmpRet) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # append self.dataSiteMap[datasetName] = tmpRet if datasetName.startswith('ddo'): tmpLog.debug(' {0} sites'.format(len(tmpRet))) else: tmpLog.debug(' {0} sites : {1}'.format( len(tmpRet), str(tmpRet))) # check if distributed if tmpRet != {}: isDistributed = True for tmpMap in tmpRet.values(): for tmpVal in tmpMap.values(): if tmpVal['state'] == 'complete': isDistributed = False break if not isDistributed: break if isDistributed: # check if really distributed isDistributed = self.ddmIF.isDistributedDataset( datasetName) if isDistributed: hasDDS = True datasetSpec.setDistributed() tmpLog.debug(' {0} is distributed'.format( datasetName)) # check if the data is available at somewhere if self.dataSiteMap[datasetName] == {}: tmpLog.error( '{0} is unavailable at any site'.format(datasetName)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal # get the list of sites where data is available scanSiteList = None scanSiteListOnDisk = None normFactor = 0 for datasetName, tmpDataSite in self.dataSiteMap.iteritems(): normFactor += 1 # get sites where replica is available tmpSiteList = AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=True) tmpDiskSiteList = 
AtlasBrokerUtils.getAnalSitesWithDataDisk( tmpDataSite, includeTape=False) # get sites which can remotely access source sites if inputChunk.isMerging: # disable remote access for merging tmpSatelliteSites = {} elif (not sitePreAssigned) or ( sitePreAssigned and not taskSpec.site in tmpSiteList): tmpSatelliteSites = AtlasBrokerUtils.getSatelliteSites( tmpDiskSiteList, self.taskBufferIF, self.siteMapper, nSites=50, protocol=allowedRemoteProtocol) else: tmpSatelliteSites = {} # make weight map for local for tmpSiteName in tmpSiteList: if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = 0 # give more weight to disk if tmpSiteName in tmpDiskSiteList: dataWeight[tmpSiteName] += 1 else: dataWeight[tmpSiteName] += 0.001 # make weight map for remote for tmpSiteName, tmpWeightSrcMap in tmpSatelliteSites.iteritems( ): # skip since local data is available if tmpSiteName in tmpSiteList: continue tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) # negative weight for remote access wRemote = 50.0 if not tmpSiteSpec.wansinklimit in [0, None]: wRemote /= float(tmpSiteSpec.wansinklimit) # sum weight if not dataWeight.has_key(tmpSiteName): dataWeight[tmpSiteName] = float( tmpWeightSrcMap['weight']) / wRemote else: dataWeight[tmpSiteName] += float( tmpWeightSrcMap['weight']) / wRemote # make remote source list if not remoteSourceList.has_key(tmpSiteName): remoteSourceList[tmpSiteName] = {} remoteSourceList[tmpSiteName][ datasetName] = tmpWeightSrcMap['source'] # first list if scanSiteList == None: scanSiteList = [] for tmpSiteName in tmpSiteList + tmpSatelliteSites.keys(): if not tmpSiteName in oldScanSiteList: continue if not tmpSiteName in scanSiteList: scanSiteList.append(tmpSiteName) scanSiteListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys( ): if not tmpSiteName in oldScanSiteList: continue scanSiteListOnDisk.add(tmpSiteName) continue # pickup sites which have all data newScanList = [] for tmpSiteName in tmpSiteList + 
tmpSatelliteSites.keys(): if tmpSiteName in scanSiteList and not tmpSiteName in newScanList: newScanList.append(tmpSiteName) scanSiteList = newScanList tmpLog.debug('{0} is available at {1} sites'.format( datasetName, len(scanSiteList))) # pickup sites which have all data on DISK newScanListOnDisk = set() for tmpSiteName in tmpDiskSiteList + tmpSatelliteSites.keys(): if tmpSiteName in scanSiteListOnDisk: newScanListOnDisk.add(tmpSiteName) scanSiteListOnDisk = newScanListOnDisk tmpLog.debug('{0} is available at {1} sites on DISK'.format( datasetName, len(scanSiteListOnDisk))) # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: scanSiteList = [] tmpLog.debug( 'data is unavailable locally or remotely at preassigned site {0}' .format(taskSpec.site)) elif len(scanSiteListOnDisk) > 0: # use only disk sites scanSiteList = list(scanSiteListOnDisk) tmpLog.debug('{0} candidates have input data'.format( len(scanSiteList))) if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # sites already used by task tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI( taskSpec.jediTaskID) if not tmpSt: tmpLog.error('failed to get sites which already used by task') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError ###################################### # calculate weight fqans = taskSpec.makeFQANs() """ tmpDm1,tmpDm2,tmpPriorityOffset,tmpSerNum,tmpWeight = self.taskBufferIF.getPrioParameters([],taskSpec.userName,fqans, taskSpec.workingGroup,True) currentPriority = PrioUtil.calculatePriority(tmpPriorityOffset,tmpSerNum,tmpWeight) currentPriority -= 500 tmpLog.debug('currentPriority={0}'.format(currentPriority)) """ tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI( taskSpec.vo, 
taskSpec.prodSourceLabel) if not tmpSt: tmpLog.error('failed to get job statistics with priority') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # check for preassigned if sitePreAssigned and not taskSpec.site in scanSiteList: tmpLog.debug("preassigned site {0} did not pass all tests".format( taskSpec.site)) tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retFatal ###################################### # final procedure tmpLog.debug('final {0} candidates'.format(len(scanSiteList))) weightMap = {} candidateSpecList = [] timeWindowForFC = 6 preSiteCandidateSpec = None failureCounts = self.taskBufferIF.getFailureCountsForTask_JEDI( taskSpec.jediTaskID, timeWindowForFC) problematicSites = set() for tmpSiteName in scanSiteList: # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None) nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None) nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'activated',None,None) + \ AtlasBrokerUtils.getNumJobs(jobStatPrioMap,tmpSiteName,'throttled',None,None) nStarting = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'starting', None, None) nFailed = 0 nClosed = 0 nFinished = 0 if tmpSiteName in failureCounts: if 'failed' in failureCounts[tmpSiteName]: nFailed = failureCounts[tmpSiteName]['failed'] if 'closed' in failureCounts[tmpSiteName]: nClosed = failureCounts[tmpSiteName]['closed'] if 'finished' in failureCounts[tmpSiteName]: nFinished = failureCounts[tmpSiteName]['finished'] # problematic sites if nFailed + nClosed > 2 * nFinished: problematicSites.add(tmpSiteName) # calculate weight weight = float(nRunning + 1) / float(nActivated + nAssigned + nStarting + 1) nThrottled = 
0 if remoteSourceList.has_key(tmpSiteName): nThrottled = AtlasBrokerUtils.getNumJobs( jobStatPrioMap, tmpSiteName, 'throttled', None, None) weight /= float(nThrottled + 1) # noramize weights by taking data availability into account tmpDataWeight = 1 if dataWeight.has_key(tmpSiteName): weight = weight * dataWeight[tmpSiteName] tmpDataWeight = dataWeight[tmpSiteName] # make candidate siteCandidateSpec = SiteCandidate(tmpSiteName) # preassigned if sitePreAssigned and tmpSiteName == taskSpec.site: preSiteCandidateSpec = siteCandidateSpec # set weight siteCandidateSpec.weight = weight tmpStr = ' site={0} nRun={1} nDef={2} nAct={3} nStart={4} '.format( tmpSiteName, nRunning, nAssigned, nActivated, nStarting) tmpStr += 'nFailed={0} nClosed={1} nFinished={2} nTr={3} dataW={4} W={5}'.format( nFailed, nClosed, nFinished, nThrottled, tmpDataWeight, weight) tmpLog.debug(tmpStr) # append if tmpSiteName in sitesUsedByTask: candidateSpecList.append(siteCandidateSpec) else: if not weightMap.has_key(weight): weightMap[weight] = [] weightMap[weight].append(siteCandidateSpec) # sort candidates by weights weightList = weightMap.keys() weightList.sort() weightList.reverse() for weightVal in weightList: sitesWithWeight = weightMap[weightVal] random.shuffle(sitesWithWeight) candidateSpecList += sitesWithWeight # limit the number of sites. 
use all sites for distributed datasets if not hasDDS: maxNumSites = 10 # remove problematic sites candidateSpecList = AtlasBrokerUtils.skipProblematicSites( candidateSpecList, problematicSites, sitesUsedByTask, preSiteCandidateSpec, maxNumSites, timeWindowForFC, tmpLog) # append preassigned if sitePreAssigned and preSiteCandidateSpec != None and not preSiteCandidateSpec in candidateSpecList: candidateSpecList.append(preSiteCandidateSpec) # collect site names scanSiteList = [] for siteCandidateSpec in candidateSpecList: scanSiteList.append(siteCandidateSpec.siteName) # get list of available files availableFileMap = {} for datasetSpec in inputChunk.getDatasets(): try: # get list of site to be scanned fileScanSiteList = [] for tmpSiteName in scanSiteList: fileScanSiteList.append(tmpSiteName) if remoteSourceList.has_key( tmpSiteName ) and remoteSourceList[tmpSiteName].has_key( datasetSpec.datasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ datasetSpec.datasetName]: if not tmpRemoteSite in fileScanSiteList: fileScanSiteList.append(tmpRemoteSite) # mapping between sites and storage endpoints siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap( fileScanSiteList, self.siteMapper) # disable file lookup for merge jobs if inputChunk.isMerging: checkCompleteness = False else: checkCompleteness = True # get available files per site/endpoint tmpAvFileMap = self.ddmIF.getAvailableFiles( datasetSpec, siteStorageEP, self.siteMapper, ngGroup=[2], checkCompleteness=checkCompleteness) if tmpAvFileMap == None: raise Interaction.JEDITemporaryError, 'ddmIF.getAvailableFiles failed' availableFileMap[datasetSpec.datasetName] = tmpAvFileMap except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error('failed to get available files with %s %s' % (errtype.__name__, errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # append candidates newScanSiteList = [] for siteCandidateSpec in 
candidateSpecList: tmpSiteName = siteCandidateSpec.siteName # preassigned if sitePreAssigned and tmpSiteName != taskSpec.site: tmpLog.debug( ' skip site={0} non pre-assigned site criteria=-nonpreassigned' .format(tmpSiteName)) continue # set available files if inputChunk.getDatasets() == []: isAvailable = True else: isAvailable = False for tmpDatasetName, availableFiles in availableFileMap.iteritems(): tmpDatasetSpec = inputChunk.getDatasetWithName(tmpDatasetName) # check remote files if remoteSourceList.has_key(tmpSiteName) and remoteSourceList[ tmpSiteName].has_key(tmpDatasetName): for tmpRemoteSite in remoteSourceList[tmpSiteName][ tmpDatasetName]: if availableFiles.has_key(tmpRemoteSite) and \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpRemoteSite]['localdisk']): # use only remote disk files siteCandidateSpec.remoteFiles += availableFiles[ tmpRemoteSite]['localdisk'] # set remote site and access protocol siteCandidateSpec.remoteProtocol = allowedRemoteProtocol siteCandidateSpec.remoteSource = tmpRemoteSite isAvailable = True break # local files if availableFiles.has_key(tmpSiteName): if len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localdisk']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['cache']) or \ len(tmpDatasetSpec.Files) <= len(availableFiles[tmpSiteName]['localtape']) or \ (tmpDatasetSpec.isDistributed() and len(availableFiles[tmpSiteName]['all']) > 0): siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['localdisk'] # add cached files to local list since cached files go to pending when reassigned siteCandidateSpec.localDiskFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.localTapeFiles += availableFiles[ tmpSiteName]['localtape'] siteCandidateSpec.cacheFiles += availableFiles[ tmpSiteName]['cache'] siteCandidateSpec.remoteFiles += availableFiles[ tmpSiteName]['remote'] siteCandidateSpec.addAvailableFiles( availableFiles[tmpSiteName]['all']) isAvailable = True else: tmpMsg = '{0} 
is incompete at {1} : nFiles={2} nLocal={3} nCached={4} nTape={5}' tmpLog.debug( tmpMsg.format( tmpDatasetName, tmpSiteName, len(tmpDatasetSpec.Files), len(availableFiles[tmpSiteName]['localdisk']), len(availableFiles[tmpSiteName]['cache']), len(availableFiles[tmpSiteName]['localtape']), )) if not isAvailable: break # append if not isAvailable: tmpLog.debug( ' skip site={0} file unavailable criteria=-fileunavailable' .format(siteCandidateSpec.siteName)) continue inputChunk.addSiteCandidate(siteCandidateSpec) newScanSiteList.append(siteCandidateSpec.siteName) tmpLog.debug( ' use site={0} with weight={1} nLocalDisk={2} nLocalTaps={3} nCache={4} nRemote={5} criteria=+use' .format( siteCandidateSpec.siteName, siteCandidateSpec.weight, len(siteCandidateSpec.localDiskFiles), len(siteCandidateSpec.localTapeFiles), len(siteCandidateSpec.cacheFiles), len(siteCandidateSpec.remoteFiles), )) scanSiteList = newScanSiteList if scanSiteList == []: tmpLog.error('no candidates') taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) # send info to logger self.sendLogMessage(tmpLog) return retTmpError # send info to logger self.sendLogMessage(tmpLog) # return tmpLog.debug('done') return self.SC_SUCCEEDED, inputChunk
def doActionForReassgin(self,gTmpLog):
    """Re-subscribe output/log datasets of tasks that are to be reassigned.

    For every task returned by ``getTasksToReassign_JEDI`` this method:

    * fetches the task's ``output`` and ``log`` datasets,
    * for a legacy-cloud task, marks the cloud task as ``assigned`` and
      checks that the cloud exists; for a WORLD-cloud task, either sends the
      task back to ``assigning`` (when no nucleus is set) or re-applies the
      nucleus to the task,
    * registers a subscription for each dataset that has no distributed
      destination, and
    * when every subscription succeeded, restores the task status and
      updates the task.

    A problem with one task is logged and the loop continues with the next
    task; nothing is propagated to the caller.

    :param gTmpLog: logger used for the summary line (``debug`` is called).
    :return: None
    """
    # get DDM I/F
    ddmIF = self.ddmIF.getInterface(self.vo)
    # get site mapper
    siteMapper = self.taskBufferIF.getSiteMapper()
    # get tasks to get reassigned
    taskList = self.taskBufferIF.getTasksToReassign_JEDI(self.vo,self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(taskList)))
    for taskSpec in taskList:
        tmpLog = MsgWrapper(logger,'<jediTaskID={0}>'.format(taskSpec.jediTaskID))
        tmpLog.debug('start to reassign')
        # DDM backend; in this method it is only reported in a log message
        ddmBackEnd = taskSpec.getDdmBackEnd()
        # get datasets
        tmpStat,datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                  ['output','log'])
        # anything that does not compare equal to True is treated as a
        # failure, so a plain truthiness test would not be equivalent
        if tmpStat != True:
            tmpLog.error('failed to get datasets')
            continue
        # update DB and work out the destination site
        if not taskSpec.useWorldCloud():
            # update cloudtasks
            tmpStat = self.taskBufferIF.setCloudTaskByUser('jedi',taskSpec.jediTaskID,
                                                           taskSpec.cloud,'assigned',True)
            if tmpStat != 'SUCCEEDED':
                tmpLog.error('failed to update CloudTasks')
                continue
            # check cloud
            if not siteMapper.checkCloud(taskSpec.cloud):
                tmpLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                continue
            # get T1
            t1SiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        else:
            # re-run task brokerage
            if taskSpec.nucleus in [None,'']:
                taskSpec.status = 'assigning'
                taskSpec.oldStatus = None
                taskSpec.setToRegisterDatasets()
                self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                                  setOldModTime=True)
                tmpLog.debug('set task_status={0} to trigger task brokerage again'.format(taskSpec.status))
                continue
            # get nucleus
            nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
            if nucleusSpec is None:
                tmpLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                continue
            # set nucleus (the return value was never inspected)
            retMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec,datasetSpecList)}
            self.taskBufferIF.setCloudToTasks_JEDI(retMap)
            # get a site of the nucleus
            t1SiteName = nucleusSpec.getOnePandaSite()
        t1Site = siteMapper.getSite(t1SiteName)
        # loop over all datasets
        isOK = True
        for datasetSpec in datasetSpecList:
            tmpLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                tmpLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                continue
            # get location
            location = siteMapper.getDdmEndpoint(t1Site.sitename,datasetSpec.storageToken)
            # make subscription
            try:
                tmpLog.debug('registering subscription to {0} with backend={1}'.format(location,
                                                                                      ddmBackEnd))
                tmpStat = ddmIF.registerDatasetSubscription(datasetSpec.datasetName,location,
                                                            'Production Output',
                                                            asynchronous=True)
                if tmpStat != True:
                    tmpLog.error("failed to make subscription")
                    isOK = False
                    break
            except Exception as exc:
                # Exception rather than a bare except, so that SystemExit and
                # KeyboardInterrupt are not swallowed here
                tmpLog.warning('failed to make subscription with {0}:{1}'.format(type(exc).__name__,exc))
                isOK = False
                break
        # succeeded
        if isOK:
            # activate task
            if taskSpec.oldStatus in ['assigning','exhausted',None]:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec,{'jediTaskID':taskSpec.jediTaskID},
                                              setOldModTime=True)
            tmpLog.debug('finished to reassign')
def runImpl(self):
    """Worker loop that feeds batches of tasks to the task broker.

    Repeatedly takes up to 100 items from ``self.taskList``; returns as soon
    as an empty batch comes back.  For each batch the input chunks are read
    through ``self.taskBufferIF``, the broker implementation is obtained
    from ``self.implFactory`` and its ``doBrokerage`` is invoked.  Any
    failure along the way is logged and the loop proceeds with the next
    batch; an unexpected exception is reported through the module logger.

    :return: None
    """
    batchSize = 100
    while True:
        try:
            # take the next slice of the shared list
            batch = self.taskList.get(batchSize)
            nTotal, nSeen = self.taskList.stat()
            # nothing left to do
            if len(batch) == 0:
                self.logger.debug('{0} terminating since no more items'.format(
                    self.__class__.__name__))
                return
            # per-batch logger
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info('start TaskBrokerThread {0}/{1} for jediTaskID={2}'.format(
                nSeen, nTotal, batch))
            status = Interaction.SC_SUCCEEDED
            # read the input chunks of every task in the batch
            toAssign = []
            for taskItem in batch:
                fetched = self.taskBufferIF.getTasksToBeProcessed_JEDI(
                    None, None, None, None, None,
                    simTasks=[taskItem],
                    readMinFiles=True)
                if fetched is None:
                    # reading failed: give up on the whole batch
                    tmpLog.error('failed to get the input chunks for jediTaskID={0}'.format(taskItem))
                    status = Interaction.SC_FAILED
                    break
                toAssign.extend(fetched)
            if status == Interaction.SC_SUCCEEDED:
                # look up the broker implementation
                tmpLog.info('getting Impl')
                try:
                    impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                    if impl is None:
                        # no task broker is defined
                        tmpLog.error('task broker is undefined for vo={0} sourceLabel={1}'.format(
                            self.vo, self.prodSourceLabel))
                        status = Interaction.SC_FAILED
                except Exception as exc:
                    tmpLog.error('getImpl failed with {0}:{1}'.format(type(exc).__name__, exc))
                    status = Interaction.SC_FAILED
                # run the brokerage only when the implementation was found
                if status == Interaction.SC_SUCCEEDED:
                    tmpLog.info('brokerage with {0} for {1} tasks '.format(
                        impl.__class__.__name__, len(toAssign)))
                    try:
                        status = impl.doBrokerage(toAssign, self.vo,
                                                  self.prodSourceLabel,
                                                  self.workQueue,
                                                  self.resource_name)
                    except Exception as exc:
                        tmpLog.error('doBrokerage failed with {0}:{1}'.format(type(exc).__name__, exc))
                        status = Interaction.SC_FAILED
            # report the outcome of the batch
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('done')
            else:
                tmpLog.error('failed')
        except Exception as exc:
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                self.__class__.__name__, type(exc).__name__, exc))
def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue, resource_name):
    """Run task brokerage for a list of tasks.

    Tasks that use the WORLD cloud are collected and processed by
    ``AtlasProdTaskBrokerThread`` workers.  For all other tasks a ``JobSpec``
    is built per task and submitted in bunches through
    ``PandaClient.runTaskAssignment``; the job metadata is the
    ``;``-separated string
    ``processingType;rwValues;expRWs;prioMap;fullRWs;tt2Map``.

    :param inputList: iterable of ``(jediTaskID, items)`` where ``items`` is
        an iterable of ``(taskSpec, cloudName, inputChunk)``.
    :param vo: virtual organisation passed to the RW calculations and used
        to obtain the DDM interface.
    :param prodSourceLabel: source label passed to the RW calculations.
    :param workQueue: work queue; ``queue_name`` is logged and the object is
        passed to the RW calculations and to the worker threads.
    :param resource_name: only reported in the debug log here.
    :return: ``self.SC_FAILED`` when an RW calculation returns ``None``,
        ``self.SC_SUCCEEDED`` otherwise.
    """
    # list with a lock
    inputListWorld = ListWithLock([])
    # variables for submission
    maxBunchTask = 100
    nWorkers = 4
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return for failure
    retTmpError = self.SC_FAILED
    tmpLog.debug(
        'vo={0} label={1} queue={2} resource_name={3} nTasks={4}'.format(
            vo, prodSourceLabel, workQueue.queue_name, resource_name,
            len(inputList)))
    # loop over all tasks
    # NOTE(review): allRwMap is keyed by priority only and is filled by both
    # the legacy-cloud and the WORLD calculations below, so a priority that
    # was already filled for a legacy task is not recomputed for WORLD.
    # Confirm that this sharing is intended.
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for _, tmpInputList in inputList:
        for taskSpec, _, inputChunk in tmpInputList:
            # collect tasks for WORLD
            if taskSpec.useWorldCloud():
                inputListWorld.append((taskSpec, inputChunk))
                continue
            # make JobSpec to be submitted for TaskAssigner
            # (WORLD tasks never reach this point, so the former
            # destinationSE branch for WORLD was unreachable and is gone)
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            jobSpec.maxDiskCount = (
                taskSpec.getOutDiskSize() +
                taskSpec.getWorkDiskSize()) // 1024 // 1024
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    jobSpec.addFile(fileSpec.convertToJobFileSpec(datasetSpec))
            # use secondary dataset name as prodDBlock
            if not setProdDBlock and prodDBlock is not None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority
            if jobSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, jobSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error(
                        'failed to calculate RW with prio={0}'.format(
                            jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW is None:
                tmpLog.error(
                    'failed to calculate RW for jediTaskID={0}'.format(
                        jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # for old clouds
    if jobSpecList:
        # get fullRWs
        fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
            vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full RW')
            return retTmpError
        # set metadata
        for jobSpec in jobSpecList:
            rwValues = allRwMap[jobSpec.currentPriority]
            jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
                jobSpec.metadata, str(rwValues), str(expRWs), str(prioMap),
                str(fullRWs), str(tt2Map))
        tmpLog.debug('run task assigner for {0} tasks'.format(
            len(jobSpecList)))
        # submit in bunches of at most maxBunchTask jobs
        for nBunchTask in range(0, len(jobSpecList), maxBunchTask):
            jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
            strIDs = 'jediTaskID=' + ','.join(
                '{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch)
            tmpLog.debug(strIDs)
            # run task brokerage
            stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
            tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
    # for WORLD
    if len(inputListWorld) > 0:
        # thread pool
        threadPool = ThreadPool()
        # get full RW for WORLD
        fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
            vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full WORLD RW')
            return retTmpError
        # get RW per priority
        for taskSpec, inputChunk in inputListWorld:
            if taskSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, taskSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error(
                        'failed to calculate RW with prio={0}'.format(
                            taskSpec.currentPriority))
                    return retTmpError
                allRwMap[taskSpec.currentPriority] = tmpRW
        # live counter for RWs
        liveCounter = MapWithLock(allRwMap)
        # make workers
        ddmIF = self.ddmIF.getInterface(vo)
        for _ in range(nWorkers):
            AtlasProdTaskBrokerThread(inputListWorld, threadPool,
                                      self.taskBufferIF, ddmIF, fullRWs,
                                      liveCounter, workQueue).start()
        threadPool.join(60 * 10)
    # return
    tmpLog.debug('doBrokerage done')
    return self.SC_SUCCEEDED
def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
    """Build one ``JobSpec`` per task and submit them to the task assigner.

    Each job carries the task's priorities, processing type, working group,
    disk requirement and input files.  Its metadata is the ``;``-separated
    string ``processingType;rwValues;expRWs;prioMap;fullRWs;tt2Map``.  Jobs
    are submitted through ``PandaClient.runTaskAssignment`` in bunches of at
    most 100.

    :param inputList: iterable of ``(jediTaskID, items)`` where ``items`` is
        an iterable of ``(taskSpec, cloudName, inputChunk)``.
    :param vo: virtual organisation passed to the RW calculations.
    :param prodSourceLabel: source label passed to the RW calculations.
    :param workQueue: work queue; ``queue_name`` is logged and the object is
        passed to the per-priority RW calculation.
    :return: ``self.SC_FAILED`` when an RW calculation returns ``None``,
        ``self.SC_SUCCEEDED`` otherwise.
    """
    # variables for submission
    maxBunchTask = 100
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return for failure
    retTmpError = self.SC_FAILED
    tmpLog.debug('vo={0} label={1} queue={2}'.format(
        vo, prodSourceLabel, workQueue.queue_name))
    # loop over all tasks
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for _, tmpInputList in inputList:
        for taskSpec, _, inputChunk in tmpInputList:
            # make JobSpec to be submitted for TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            # floor division keeps an integral value under both Python 2
            # and Python 3 division rules
            jobSpec.maxDiskCount = (
                taskSpec.getOutDiskSize() +
                taskSpec.getWorkDiskSize()) // 1024 // 1024
            if taskSpec.useWorldCloud():
                # use destinationSE to trigger task brokerage in WORLD cloud
                jobSpec.destinationSE = taskSpec.cloud
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    jobSpec.addFile(fileSpec.convertToJobFileSpec(datasetSpec))
            # use secondary dataset name as prodDBlock
            if not setProdDBlock and prodDBlock is not None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority (computed once per distinct priority)
            if jobSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, jobSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error(
                        'failed to calculate RW with prio={0}'.format(
                            jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW is None:
                tmpLog.error(
                    'failed to calculate RW for jediTaskID={0}'.format(
                        jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # get fullRWs
    fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(
        vo, prodSourceLabel, None, None)
    if fullRWs is None:
        tmpLog.error('failed to calculate full RW')
        return retTmpError
    # set metadata
    for jobSpec in jobSpecList:
        rwValues = allRwMap[jobSpec.currentPriority]
        jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
            jobSpec.metadata, str(rwValues), str(expRWs), str(prioMap),
            str(fullRWs), str(tt2Map))
    tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
    # submit in bunches of at most maxBunchTask jobs
    for nBunchTask in range(0, len(jobSpecList), maxBunchTask):
        jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
        strIDs = 'jediTaskID=' + ','.join(
            '{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch)
        tmpLog.debug(strIDs)
        # run task brokerage
        stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
        tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED
def findMissingFiles(self, jediTaskID, cloudName):
    """Record which files of the master input datasets are not available.

    For every master dataset of the task the available files are obtained
    from the DDM interface for the source site of ``cloudName`` plus the
    ``'t2'`` sites listed for that cloud.  Files whose LFN is not among
    them are passed to ``self.taskBufferIF.setMissingFiles_JEDI``.

    :param jediTaskID: task identifier used to look up the input datasets
    :param cloudName: key into the site map returned by
        ``AtlasBrokerUtils.getSitesWithData``
    :return: ``self.SC_SUCCEEDED`` on success, ``self.SC_FAILED`` when any
        lookup or the update fails
    """
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(jediTaskID))
    tmpLog.debug('start findMissingFiles')
    # return code for failure
    retError = self.SC_FAILED
    # get datasets
    tmpSt, datasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
        jediTaskID, ['input'], True)
    if not tmpSt:
        tmpLog.error('failed to get the list of datasets')
        return retError
    # loop over all datasets
    for datasetSpec in datasetSpecList:
        # check only master dataset
        if not datasetSpec.isMaster():
            continue
        tmpLog.debug('checking {0}'.format(datasetSpec.datasetName))
        # get ddmIF
        ddmIF = self.ddmIF.getInterface(datasetSpec.vo)
        if ddmIF is None:
            tmpLog.error('failed to get DDM I/F for vo={0}'.format(
                datasetSpec.vo))
            return retError
        # get the list of sites where data is available
        tmpSt, tmpRet = AtlasBrokerUtils.getSitesWithData(
            self.siteMapper, ddmIF, datasetSpec.datasetName)
        if tmpSt != self.SC_SUCCEEDED:
            tmpLog.error(
                'failed to get the list of sites where {0} is available, since {1}'
                .format(datasetSpec.datasetName, tmpRet))
            return retError
        dataSiteMap = tmpRet
        # data is unavailable in cloud
        if cloudName not in dataSiteMap:
            tmpLog.error('{0} is unavailable in cloud={1} map={2}'.format(
                datasetSpec.datasetName, cloudName, str(dataSiteMap)))
            return retError
        # mapping between sites and storage endpoints
        checkedSites = [self.siteMapper.getCloud(cloudName)['source']
                        ] + dataSiteMap[cloudName]['t2']
        siteStorageEP = AtlasBrokerUtils.getSiteStorageEndpointMap(
            checkedSites, self.siteMapper)
        # get available files per site/endpoint
        tmpAvFileMap = ddmIF.getAvailableFiles(datasetSpec,
                                               siteStorageEP,
                                               self.siteMapper,
                                               ngGroup=[1],
                                               checkLFC=True)
        if tmpAvFileMap is None:
            tmpLog.error(
                'failed to get available file list for {0}'.format(
                    datasetSpec.datasetName))
            return retError
        # collect the LFNs available at any site/storage type once, so that
        # each file needs a single set lookup instead of a nested scan
        availableLFNs = set()
        for availableFilesMap in tmpAvFileMap.values():
            for availableFiles in availableFilesMap.values():
                for availableFile in availableFiles:
                    availableLFNs.add(availableFile.lfn)
        # check availability
        missingFiles = []
        for fileSpec in datasetSpec.Files:
            if fileSpec.lfn not in availableLFNs:
                missingFiles.append(fileSpec.fileID)
                tmpLog.debug('{0} missing'.format(fileSpec.lfn))
        # update contents
        if missingFiles:
            tmpSt = self.taskBufferIF.setMissingFiles_JEDI(
                jediTaskID, datasetSpec.datasetID, missingFiles)
            if not tmpSt:
                tmpLog.error('failed to set missing files in {0}'.format(
                    datasetSpec.datasetName))
                return retError
    tmpLog.debug('done findMissingFiles')
    return self.SC_SUCCEEDED
def runImpl(self):
    """Process task commands taken from ``self.taskList`` until it is empty.

    Commands are fetched ten at a time as ``(jediTaskID, commandMap)``
    pairs, where ``commandMap`` provides ``'command'``, ``'comment'`` and
    ``'oldStatus'``.

    * ``kill`` / ``finish`` / ``reassign``: active jobs are killed (or
      waited for when the comment contains ``soft finish``); once no job
      is left the task record is updated.
    * ``retry`` / ``incexec``: for ``incexec`` the task parameters are
      first rewritten from the JSON in the comment, then
      ``retryTask_JEDI`` is called.

    Any exception raised while handling a batch is logged and the loop
    goes on with the next batch.  The method returns None when
    ``self.taskList.get`` yields nothing.
    """
    while True:
        try:
            # get a part of list
            nTasks = 10
            taskList = self.taskList.get(nTasks)
            # no more datasets
            if len(taskList) == 0:
                self.logger.debug('{0} terminating since no more items'.format(self.__class__.__name__))
                return
            # loop over all tasks
            for jediTaskID, commandMap in taskList:
                # make logger
                tmpLog = MsgWrapper(self.logger, ' <jediTaskID={0}>'.format(jediTaskID))
                commandStr = commandMap['command']
                commentStr = commandMap['comment']
                oldStatus = commandMap['oldStatus']
                tmpLog.info('start for {0}'.format(commandStr))
                tmpStat = Interaction.SC_SUCCEEDED
                if commandStr in ['kill', 'finish', 'reassign']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # loop twice to see immediate result
                    for iLoop in range(2):
                        # get active PandaIDs to be killed
                        if commandStr == 'reassign' and commentStr is not None \
                                and 'soft reassign' in commentStr:
                            pandaIDs = self.taskBufferIF.getQueuedPandaIDsWithTask_JEDI(jediTaskID)
                        else:
                            pandaIDs = self.taskBufferIF.getPandaIDsWithTask_JEDI(jediTaskID, True)
                        if pandaIDs is None:
                            tmpLog.error('failed to get PandaIDs for jediTaskID={0}'.format(jediTaskID))
                            tmpStat = Interaction.SC_FAILED
                        # kill jobs or update task
                        if tmpStat == Interaction.SC_SUCCEEDED:
                            if pandaIDs == []:
                                # done since no active jobs
                                tmpMsg = 'completed cleaning jobs'
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpTaskSpec = JediTaskSpec()
                                tmpTaskSpec.jediTaskID = jediTaskID
                                updateTaskStatus = True
                                if commandStr != 'reassign':
                                    # reset oldStatus
                                    # keep oldStatus for task reassignment since it is reset when actually reassigned
                                    tmpTaskSpec.forceUpdate('oldStatus')
                                else:
                                    # extract cloud or site from a
                                    # "<cloud|site>:<value>:<y|...>" comment
                                    if commentStr is not None:
                                        tmpItems = commentStr.split(':')
                                        if tmpItems[0] == 'cloud':
                                            tmpTaskSpec.cloud = tmpItems[1]
                                        else:
                                            tmpTaskSpec.site = tmpItems[1]
                                        tmpMsg = 'set {0}={1}'.format(tmpItems[0], tmpItems[1])
                                        tmpLog.sendMsg(tmpMsg, self.msgType)
                                        tmpLog.info(tmpMsg)
                                        # back to oldStatus if necessary
                                        if tmpItems[2] == 'y':
                                            tmpTaskSpec.status = oldStatus
                                            tmpTaskSpec.forceUpdate('oldStatus')
                                            updateTaskStatus = False
                                if commandStr == 'reassign':
                                    tmpTaskSpec.forceUpdate('errorDialog')
                                if updateTaskStatus:
                                    tmpTaskSpec.status = JediTaskSpec.commandStatusMap()[commandStr]['done']
                                tmpMsg = 'set task.status={0}'.format(tmpTaskSpec.status)
                                tmpLog.sendMsg(tmpMsg, self.msgType)
                                tmpLog.info(tmpMsg)
                                tmpRet = self.taskBufferIF.updateTask_JEDI(tmpTaskSpec, {'jediTaskID': jediTaskID})
                                tmpLog.info('done with {0}'.format(str(tmpRet)))
                                break
                            else:
                                # kill only in the first loop
                                if iLoop > 0:
                                    break
                                # wait or kill jobs; the comment may be None,
                                # in which case jobs are killed
                                if commentStr is not None and 'soft finish' in commentStr:
                                    tmpMsg = "wating {0} jobs for soft finish".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpRet = True
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                                    break
                                else:
                                    tmpMsg = "trying to kill {0} jobs".format(len(pandaIDs))
                                    tmpLog.info(tmpMsg)
                                    tmpLog.sendMsg(tmpMsg, self.msgType)
                                    if commandStr in ['reassign', 'finish']:
                                        # force kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '52', True)
                                    else:
                                        # normal kill
                                        tmpRet = self.taskBufferIF.killJobs(pandaIDs, commentStr, '50', True)
                                    tmpLog.info('done with {0}'.format(str(tmpRet)))
                elif commandStr in ['retry', 'incexec']:
                    tmpMsg = 'executing {0}'.format(commandStr)
                    tmpLog.sendMsg(tmpMsg, self.msgType)
                    # change task params for incexec
                    if commandStr == 'incexec':
                        try:
                            # read task params
                            taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI(jediTaskID)
                            taskParamMap = RefinerUtils.decodeJSON(taskParam)
                            # remove some params
                            for newKey in ['nFiles', 'fixedSandbox']:
                                taskParamMap.pop(newKey, None)
                            # convert new params
                            newParamMap = RefinerUtils.decodeJSON(commentStr)
                            # change params; a None value means "delete"
                            for newKey, newVal in newParamMap.items():
                                if newVal is None:
                                    # delete
                                    taskParamMap.pop(newKey, None)
                                else:
                                    # change
                                    taskParamMap[newKey] = newVal
                            # overwrite sandbox
                            if 'fixedSandbox' in taskParamMap:
                                # noBuild
                                for tmpParam in taskParamMap['jobParameters']:
                                    if tmpParam['type'] == 'constant' and \
                                            re.search('^-a [^ ]+$', tmpParam['value']) is not None:
                                        tmpParam['value'] = '-a {0}'.format(taskParamMap['fixedSandbox'])
                                # build
                                if 'buildSpec' in taskParamMap:
                                    taskParamMap['buildSpec']['archiveName'] = taskParamMap['fixedSandbox']
                                # merge
                                if 'mergeSpec' in taskParamMap:
                                    taskParamMap['mergeSpec']['jobParameters'] = \
                                        re.sub('-a [^ ]+',
                                               '-a {0}'.format(taskParamMap['fixedSandbox']),
                                               taskParamMap['mergeSpec']['jobParameters'])
                            # encode new param
                            strTaskParams = RefinerUtils.encodeJSON(taskParamMap)
                            tmpRet = self.taskBufferIF.updateTaskParams_JEDI(jediTaskID, strTaskParams)
                            if tmpRet != True:
                                tmpLog.error('failed to update task params')
                                continue
                        except Exception as exc:
                            tmpLog.error('failed to change task params with {0}:{1}'.format(
                                type(exc).__name__, exc))
                            continue
                    # retry failed files
                    tmpRet, newTaskStatus = self.taskBufferIF.retryTask_JEDI(jediTaskID, commandStr)
                    if tmpRet == True:
                        tmpMsg = 'set task.status={0}'.format(newTaskStatus)
                        tmpLog.sendMsg(tmpMsg, self.msgType)
                        tmpLog.info(tmpMsg)
                    tmpLog.info('done with {0}'.format(tmpRet))
                else:
                    tmpLog.error('unknown command')
        except Exception as exc:
            # keep the worker alive: log the failure and take the next batch
            errStr = '{0} failed in runImpl() with {1}:{2} '.format(
                self.__class__.__name__, type(exc).__name__, exc)
            errStr += traceback.format_exc()
            logger.error(errStr)
def toBeThrottled(self, vo, prodSourceLabel, cloudName, workQueue, resource_name):
    """Decide whether job submission for a work queue should be skipped.

    Reads the configured limits and the current job statistics, compares
    queued/defined/waiting/running counters against them and either
    returns a "skip" code or lets submission proceed after adjusting
    ``self.maxNumJobs`` (via ``setMaxNumJobs``) and, when a limit was hit
    while higher-priority tasks are waiting, the minimum priority (via
    ``setMinPriority``).

    :param vo: used in log messages and forwarded to the configuration,
        priority and walltime lookups
    :param prodSourceLabel: used in log messages and the walltime lookup
    :param cloudName: used in log messages and forwarded to the lookups
    :param workQueue: work queue object; ``queue_name``, ``throttled`` and
        ``getID()`` are read here
    :param resource_name: resource type name forwarded to the lookups
    :return: ``self.retUnThrottled`` when submission may proceed,
        ``self.retMergeUnThr`` for every SKIP decision, or
        ``self.retTmpError`` when the highest priority of waiting tasks
        cannot be obtained
    """
    # params
    nBunch = 4
    threshold = 2.0
    nJobsInBunchMax = 600
    nJobsInBunchMin = 500
    minTotalWalltime = 50 * 1000 * 1000
    nWaitingLimit = 4
    nWaitingBunchLimit = 2
    nParallel = 2
    nParallelCap = 5
    # make logger
    tmpLog = MsgWrapper(logger)
    workQueueID = workQueue.getID()
    workQueueName = workQueue.queue_name
    # spaces in the queue name are replaced by underscores for the log header
    workQueueName = '_'.join(workQueue.queue_name.split(' '))
    msgHeader = '{0}:{1} cloud={2} queue={3} resource_type={4}:'.format(
        vo, prodSourceLabel, cloudName, workQueueName, resource_name)
    tmpLog.debug('{0} start workQueueID={1}'.format(
        msgHeader, workQueueID))
    # get central configuration values
    config_map = self.__getConfiguration(vo, workQueue.queue_name,
                                         resource_name)
    configQueueLimit = config_map[NQUEUELIMIT]['value']
    configQueueCap = config_map[NQUEUECAP]['value']
    configRunningCap = config_map[NRUNNINGCAP]['value']
    tmpLog.debug(
        msgHeader +
        ' got configuration configQueueLimit={0}, configQueueCap={1}, configRunningCap={2}'
        .format(configQueueLimit, configQueueCap, configRunningCap))
    # check if unthrottled
    if not workQueue.throttled:
        msgBody = "PASS unthrottled since GS_throttled is False"
        tmpLog.info(msgHeader + " " + msgBody)
        return self.retUnThrottled
    # get the jobs statistics for our wq/gs and expand the stats map
    jobstats_map = self.__prepareJobStats(workQueue, resource_name,
                                          config_map)
    nRunning_rt = jobstats_map['nRunning_rt']
    nRunning_gs = jobstats_map['nRunning_gs']
    nRunning_runningcap = jobstats_map['nRunning_runningcap']
    nNotRun_rt = jobstats_map['nNotRun_rt']
    nNotRun_gs = jobstats_map['nNotRun_gs']
    nNotRun_queuelimit = jobstats_map['nNotRun_queuelimit']
    nNotRun_queuecap = jobstats_map['nNotRun_queuecap']
    nDefine_rt = jobstats_map['nDefine_rt']
    nDefine_gs = jobstats_map['nDefine_gs']
    nDefine_queuelimit = jobstats_map['nDefine_queuelimit']
    nDefine_queuecap = jobstats_map['nDefine_queuecap']
    nWaiting_rt = jobstats_map['nWaiting_rt']
    nWaiting_gs = jobstats_map['nWaiting_gs']
    # check if higher prio tasks are waiting
    # (queues listed in non_rt_wqs are looked up without the resource type)
    if workQueue.queue_name in non_rt_wqs:
        # find highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
            'managed', cloudName, workQueue)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
            vo, workQueue, 'managed', cloudName)
    else:
        # find highest priority of currently defined jobs
        tmpStat, highestPrioJobStat = self.taskBufferIF.getHighestPrioJobStat_JEDI(
            'managed', cloudName, workQueue, resource_name)
        # the highest priority of waiting tasks
        highestPrioWaiting = self.taskBufferIF.checkWaitingTaskPrio_JEDI(
            vo, workQueue, 'managed', cloudName, resource_name)
    highestPrioInPandaDB = highestPrioJobStat['highestPrio']
    nNotRunHighestPrio = highestPrioJobStat['nNotRun']
    if highestPrioWaiting is None:
        msgBody = 'failed to get the highest priority of waiting tasks'
        tmpLog.error("{0} {1}".format(msgHeader, msgBody))
        return self.retTmpError
    # high priority tasks are waiting
    highPrioQueued = False
    if highestPrioWaiting > highestPrioInPandaDB \
            or (highestPrioWaiting == highestPrioInPandaDB and nNotRunHighestPrio < nJobsInBunchMin):
        highPrioQueued = True
    tmpLog.debug(
        "{0} highestPrio waiting:{1} inPanda:{2} numNotRun:{3} -> highPrioQueued={4}"
        .format(msgHeader, highestPrioWaiting, highestPrioInPandaDB,
                nNotRunHighestPrio, highPrioQueued))
    # set maximum number of jobs to be submitted
    if workQueue.queue_name in non_rt_wqs:
        tmpRemainingSlot = int(nRunning_gs * threshold - nNotRun_gs)
    else:
        tmpRemainingSlot = int(nRunning_rt * threshold - nNotRun_rt)
    # use the lower limit to avoid creating too many _sub/_dis datasets
    nJobsInBunch = min(max(nJobsInBunchMin, tmpRemainingSlot),
                       nJobsInBunchMax)
    # a configured queue limit takes precedence over the computed default
    if configQueueLimit is not None:
        nQueueLimit = configQueueLimit
    else:
        nQueueLimit = nJobsInBunch * nBunch
    # use nPrestage for reprocessing
    if workQueue.queue_name in ['Heavy Ion', 'Reprocessing default']:
        # reset nJobsInBunch
        if nQueueLimit > (nNotRun_queuelimit + nDefine_queuelimit):
            tmpRemainingSlot = nQueueLimit - (nNotRun_queuelimit +
                                              nDefine_queuelimit)
            if tmpRemainingSlot > nJobsInBunch:
                nJobsInBunch = min(tmpRemainingSlot, nJobsInBunchMax)
    # get cap
    # set number of jobs to be submitted
    # NOTE(review): "/" is integer division only under Python 2 - confirm
    # that setMaxNumJobs accepts a float if this runs on Python 3
    if configQueueCap is None:
        self.setMaxNumJobs(nJobsInBunch / nParallel)
    else:
        self.setMaxNumJobs(configQueueCap / nParallelCap)
    # get total walltime
    totWalltime = self.taskBufferIF.getTotalWallTime_JEDI(
        vo, prodSourceLabel, workQueue, resource_name, cloudName)
    # log the current situation and limits
    tmpLog.info("{0} nQueueLimit={1} nRunCap={2} nQueueCap={3}".format(
        msgHeader, nQueueLimit, configRunningCap, configQueueCap))
    tmpLog.info(
        "{0} at global share level: nQueued={1} nDefine={2} nRunning={3}".
        format(msgHeader, nNotRun_gs + nDefine_gs, nDefine_gs,
               nRunning_gs))
    tmpLog.info(
        "{0} at resource type level: nQueued_rt={1} nDefine_rt={2} nRunning_rt={3} totWalltime={4}"
        .format(msgHeader, nNotRun_rt + nDefine_rt, nDefine_rt,
                nRunning_rt, totWalltime))
    # check number of jobs when high priority jobs are not waiting. test jobs are sent without throttling
    # Only the first matching branch of the chain below applies.  Except for
    # the running-cap branch, a match sets limitPriority and skips only when
    # no high priority tasks are waiting; otherwise execution falls through
    # to the priority-limited PASS at the end.
    limitPriority = False
    if workQueue.queue_name not in non_rt_wqs \
            and nRunning_rt == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit \
            and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name in non_rt_wqs \
            and nRunning_gs == 0 and (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # pilot is not running or DDM has a problem
            # (the message reports totWalltime although this branch does
            # not test it)
            msgBody = "SKIP no running and enough nQueued_queuelimit({0})>{1} totWalltime({2})>{3} ".format(
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name not in non_rt_wqs and nRunning_rt != 0 \
            and float(nNotRun_rt + nDefine_rt) / float(nRunning_rt) > threshold and \
            (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit and (totWalltime is None or totWalltime > minTotalWalltime):
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_rt({0})/nRunning_rt({1})>{2} & nQueued_queuelimit({3})>{4} totWalltime({5})>{6}".format(
                nNotRun_rt + nDefine_rt, nRunning_rt, threshold,
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit,
                totWalltime, minTotalWalltime)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif workQueue.queue_name in non_rt_wqs and nRunning_gs != 0 \
            and float(nNotRun_gs + nDefine_gs) / float(nRunning_gs) > threshold and \
            (nNotRun_queuelimit + nDefine_queuelimit) > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # enough jobs in Panda
            msgBody = "SKIP nQueued_gs({0})/nRunning_gs({1})>{2} & nQueued_queuelimit({3})>{4}".format(
                nNotRun_gs + nDefine_gs, nRunning_gs, threshold,
                nNotRun_queuelimit + nDefine_queuelimit, nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif nDefine_queuelimit > nQueueLimit:
        limitPriority = True
        if not highPrioQueued:
            # brokerage is stuck
            msgBody = "SKIP too many nDefined_queuelimit({0})>{1}".format(
                nDefine_queuelimit, nQueueLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif nWaiting_rt > max(nRunning_rt * nWaitingLimit,
                           nJobsInBunch * nWaitingBunchLimit):
        limitPriority = True
        if not highPrioQueued:
            # too many waiting
            msgBody = "SKIP too many nWaiting_rt({0})>max(nRunning_rt({1})x{2},{3}x{4})".format(
                nWaiting_rt, nRunning_rt, nWaitingLimit, nJobsInBunch,
                nWaitingBunchLimit)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    elif configRunningCap and nRunning_runningcap > configRunningCap:
        # cap on running
        # (unlike the other branches this one skips unconditionally, even
        # when high priority tasks are waiting)
        msgBody = "SKIP nRunning_runningcap({0})>nRunningCap({1})".format(
            nRunning_runningcap, configRunningCap)
        tmpLog.warning('{0} {1}'.format(msgHeader, msgBody))
        tmpLog.sendMsg('{0} {1}'.format(msgHeader, msgBody),
                       self.msgType,
                       msgLevel='warning',
                       escapeChar=True)
        return self.retMergeUnThr
    elif configQueueCap and nNotRun_queuecap + nDefine_queuecap > configQueueCap:
        limitPriority = True
        if not highPrioQueued:
            # cap on queued
            msgBody = "SKIP nQueued_queuecap({0})>nQueueCap({1})".format(
                nNotRun_queuecap + nDefine_queuecap, configQueueCap)
            tmpLog.warning("{0} {1}".format(msgHeader, msgBody))
            tmpLog.sendMsg("{0} {1}".format(msgHeader, msgBody),
                           self.msgType,
                           msgLevel='warning',
                           escapeChar=True)
            return self.retMergeUnThr
    # get jobs from prodDB
    limitPriorityValue = None
    if limitPriority:
        # a limit was hit but high priority tasks are waiting: submit only
        # tasks at or above the highest waiting priority
        limitPriorityValue = highestPrioWaiting
        self.setMinPriority(limitPriorityValue)
    else:
        # not enough jobs are queued
        if (nNotRun_queuelimit + nDefine_queuelimit < nQueueLimit * 0.9) \
                or (workQueue.queue_name in non_rt_wqs and nNotRun_gs + nDefine_gs < nRunning_gs) \
                or (workQueue.queue_name not in non_rt_wqs and nNotRun_rt + nDefine_rt < nRunning_rt):
            tmpLog.debug(msgHeader + " not enough jobs queued")
            if not workQueue.queue_name in non_rt_wqs:
                self.notEnoughJobsQueued()
            self.setMaxNumJobs(max(self.maxNumJobs, nQueueLimit / 20))
    msgBody = "PASS - priority limit={0} maxNumJobs={1}".format(
        limitPriorityValue, self.maxNumJobs)
    tmpLog.info(msgHeader + " " + msgBody)
    return self.retUnThrottled
def doBrokerage(self,inputList,vo,prodSourceLabel,workQueue):
    """Broker tasks, handling old-cloud and WORLD tasks separately.

    Tasks for which ``taskSpec.useWorldCloud()`` is true are collected and
    processed by four ``AtlasProdTaskBrokerThread`` workers, which are
    waited for with ``threadPool.join(60*10)``.  All other tasks are turned
    into JobSpecs, annotated with RW values in ``jobSpec.metadata`` and
    passed to ``PandaClient.runTaskAssignment`` in bunches of at most 100.

    :param inputList: sized iterable of
        ``(jediTaskID, [(taskSpec, cloudName, inputChunk), ...])``
    :param vo: forwarded to the RW calculations and used to get the DDM I/F
    :param prodSourceLabel: forwarded to the RW calculations
    :param workQueue: its ``queue_name`` is logged; the object itself is
        forwarded to the per-priority RW calculations
    :return: ``self.SC_SUCCEEDED``, or ``self.SC_FAILED`` as soon as one of
        the RW lookups returns None
    """
    # list with a lock
    inputListWorld = ListWithLock([])
    # maximum number of tasks given to the task assigner in one call
    maxBunchTask = 100
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return code for failure
    retTmpError = self.SC_FAILED
    tmpLog.debug('vo={0} label={1} queue={2} nTasks={3}'.format(vo,prodSourceLabel,
                                                                workQueue.queue_name,
                                                                len(inputList)))
    # loop over all tasks
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for tmpJediTaskID,tmpInputList in inputList:
        for taskSpec,cloudName,inputChunk in tmpInputList:
            # collect tasks for WORLD
            if taskSpec.useWorldCloud():
                inputListWorld.append((taskSpec,inputChunk))
                continue
            # make JobSpec to be submitted for TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024
            # WORLD tasks never get here (see the "continue" above), so no
            # destinationSE needs to be set for them
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                # remember the last dataset name as a fallback prodDBlock
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                    jobSpec.addFile(tmpInFileSpec)
            # use secondary dataset name as prodDBlock
            if not setProdDBlock and prodDBlock is not None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority, only once per distinct priority
            if jobSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                   jobSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW is None:
                tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # for old clouds
    if jobSpecList:
        # get fullRWs
        fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full RW')
            return retTmpError
        # set metadata; the map dumps are identical for all jobs, so build once
        expRWsStr = str(expRWs)
        prioMapStr = str(prioMap)
        fullRWsStr = str(fullRWs)
        tt2MapStr = str(tt2Map)
        for jobSpec in jobSpecList:
            rwValues = allRwMap[jobSpec.currentPriority]
            jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata,
                                                      str(rwValues),expRWsStr,
                                                      prioMapStr,fullRWsStr,
                                                      tt2MapStr)
        tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
        nBunchTask = 0
        while nBunchTask < len(jobSpecList):
            # get a bunch
            jobsBunch = jobSpecList[nBunchTask:nBunchTask+maxBunchTask]
            strIDs = 'jediTaskID=' + ','.join(
                '{0}'.format(tmpJobSpec.taskID) for tmpJobSpec in jobsBunch)
            tmpLog.debug(strIDs)
            # increment index
            nBunchTask += maxBunchTask
            # run task brokerage
            stS,outSs = PandaClient.runTaskAssignment(jobsBunch)
            tmpLog.debug('{0}:{1}'.format(stS,str(outSs)))
    # for WORLD
    if len(inputListWorld) > 0:
        # thread pool
        threadPool = ThreadPool()
        # get full RW for WORLD
        fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full WORLD RW')
            return retTmpError
        # get RW per priority
        # NOTE(review): allRwMap may already hold old-cloud RW values for a
        # priority shared with a WORLD task, in which case the WORLD RW is
        # not calculated for it - confirm this is intended
        for taskSpec,inputChunk in inputListWorld:
            if taskSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                        taskSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority))
                    return retTmpError
                allRwMap[taskSpec.currentPriority] = tmpRW
        # live counter for RWs
        liveCounter = MapWithLock(allRwMap)
        # make workers
        ddmIF = self.ddmIF.getInterface(vo)
        for iWorker in range(4):
            thr = AtlasProdTaskBrokerThread(inputListWorld,threadPool,
                                            self.taskBufferIF,ddmIF,
                                            fullRWs,liveCounter)
            thr.start()
        # wait for the workers with a timeout of 60*10
        threadPool.join(60*10)
    # return
    tmpLog.debug('doBrokerage done')
    return self.SC_SUCCEEDED
def runImpl(self): # cutoff for disk in TB diskThreshold = self.taskBufferIF.getConfigValue( self.msgType, 'DISK_THRESHOLD_{0}'.format(self.workQueue.queue_name), 'jedi', 'atlas') if diskThreshold is None: diskThreshold = 100 * 1024 # dataset type to ignore file availability check datasetTypeToSkipCheck = ['log'] # thresholds for data availability check thrInputSize = self.taskBufferIF.getConfigValue( self.msgType, 'INPUT_SIZE_THRESHOLD', 'jedi', 'atlas') if thrInputSize is None: thrInputSize = 1 thrInputSize *= 1024 * 1024 * 1024 thrInputNum = self.taskBufferIF.getConfigValue(self.msgType, 'INPUT_NUM_THRESHOLD', 'jedi', 'atlas') if thrInputNum is None: thrInputNum = 100 thrInputSizeFrac = self.taskBufferIF.getConfigValue( self.msgType, 'INPUT_SIZE_FRACTION', 'jedi', 'atlas') if thrInputSizeFrac is None: thrInputSizeFrac = 10 thrInputSizeFrac = float(thrInputSizeFrac) / 100 thrInputNumFrac = self.taskBufferIF.getConfigValue( self.msgType, 'INPUT_NUM_FRACTION', 'jedi', 'atlas') if thrInputNumFrac is None: thrInputNumFrac = 10 thrInputNumFrac = float(thrInputNumFrac) / 100 cutOffRW = 50 negWeightTape = 0.001 minIoIntensityWithLD = self.taskBufferIF.getConfigValue( self.msgType, 'MIN_IO_INTENSITY_WITH_LOCAL_DATA', 'jedi', 'atlas') if minIoIntensityWithLD is None: minIoIntensityWithLD = 200 minInputSizeWithLD = self.taskBufferIF.getConfigValue( self.msgType, 'MIN_INPUT_SIZE_WITH_LOCAL_DATA', 'jedi', 'atlas') if minInputSizeWithLD is None: minInputSizeWithLD = 10000 maxTaskPrioWithLD = self.taskBufferIF.getConfigValue( self.msgType, 'MAX_TASK_PRIO_WITH_LOCAL_DATA', 'jedi', 'atlas') if maxTaskPrioWithLD is None: maxTaskPrioWithLD = 800 # main lastJediTaskID = None siteMapper = self.taskBufferIF.getSiteMapper() while True: try: taskInputList = self.inputList.get(1) # no more datasets if len(taskInputList) == 0: self.logger.debug( '{0} terminating after processing {1} tasks since no more inputs ' .format(self.__class__.__name__, self.numTasks)) return # loop over all tasks 
for taskSpec, inputChunk in taskInputList: lastJediTaskID = taskSpec.jediTaskID # make logger tmpLog = MsgWrapper( self.logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID), monToken='jediTaskID={0}'.format(taskSpec.jediTaskID)) tmpLog.debug('start') tmpLog.info( 'thrInputSize:{0} thrInputNum:{1} thrInputSizeFrac:{2} thrInputNumFrac;{3}' .format(thrInputSize, thrInputNum, thrInputSizeFrac, thrInputNumFrac)) # read task parameters try: taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI( taskSpec.jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except Exception: tmpLog.error('failed to read task params') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue # RW taskRW = self.taskBufferIF.calculateTaskWorldRW_JEDI( taskSpec.jediTaskID) # get nuclei nucleusList = siteMapper.nuclei if taskSpec.nucleus in siteMapper.nuclei: candidateNucleus = taskSpec.nucleus elif taskSpec.nucleus in siteMapper.satellites: nucleusList = siteMapper.satellites candidateNucleus = taskSpec.nucleus else: tmpLog.info('got {0} candidates'.format( len(nucleusList))) ###################################### # check status newNucleusList = {} for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleusSpec.state not in ['ACTIVE']: tmpLog.info( ' skip nucleus={0} due to status={1} criteria=-status' .format(tmpNucleus, tmpNucleusSpec.state)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info( '{0} candidates passed status check'.format( len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check status of transfer backlog t1Weight = taskSpec.getT1Weight() if t1Weight < 0: tmpLog.info( 'skip transfer backlog check due to negative T1Weight' ) else: newNucleusList = {} backlogged_nuclei = self.taskBufferIF.getBackloggedNuclei( ) for 
tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleus in backlogged_nuclei: tmpLog.info( ' skip nucleus={0} due to long transfer backlog criteria=-transfer_backlog' .format(tmpNucleus)) else: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info( '{0} candidates passed transfer backlog check'. format(len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # check endpoint fractionFreeSpace = {} newNucleusList = {} tmpStat, tmpDatasetSpecList = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( taskSpec.jediTaskID, ['output', 'log']) for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): toSkip = False for tmpDatasetSpec in tmpDatasetSpecList: # ignore distributed datasets if DataServiceUtils.getDistributedDestination( tmpDatasetSpec.storageToken ) is not None: continue # get endpoint with the pattern tmpEP = tmpNucleusSpec.getAssociatedEndpoint( tmpDatasetSpec.storageToken) if tmpEP is None: tmpLog.info( ' skip nucleus={0} since no endpoint with {1} criteria=-match' .format(tmpNucleus, tmpDatasetSpec.storageToken)) toSkip = True break # check state """ if tmpEP['state'] not in ['ACTIVE']: tmpLog.info(' skip nucleus={0} since endpoint {1} is in {2} criteria=-epstatus'.format(tmpNucleus, tmpEP['ddm_endpoint_name'], tmpEP['state'])) toSkip = True break """ # check space tmpSpaceSize = tmpEP['space_free'] + tmpEP[ 'space_expired'] tmpSpaceToUse = 0 if tmpNucleus in self.fullRW: # 0.25GB per cpuTime/corePower/day tmpSpaceToUse = long( self.fullRW[tmpNucleus] / 10 / 24 / 3600 * 0.25) if tmpSpaceSize - tmpSpaceToUse < diskThreshold: tmpLog.info( ' skip nucleus={0} since disk shortage (free {1} GB - reserved {2} GB < thr {3} GB) at endpoint {4} criteria=-space' .format(tmpNucleus, tmpSpaceSize, tmpSpaceToUse, diskThreshold, tmpEP['ddm_endpoint_name'])) toSkip = True break # 
keep fraction of free space if tmpNucleus not in fractionFreeSpace: fractionFreeSpace[tmpNucleus] = { 'total': 0, 'free': 0 } try: tmpOld = float(fractionFreeSpace[tmpNucleus]['free']) / \ float(fractionFreeSpace[tmpNucleus]['total']) except Exception: tmpOld = None try: tmpNew = float(tmpSpaceSize - tmpSpaceToUse) / float( tmpEP['space_total']) except Exception: tmpNew = None if tmpNew is not None and (tmpOld is None or tmpNew < tmpOld): fractionFreeSpace[tmpNucleus] = { 'total': tmpEP['space_total'], 'free': tmpSpaceSize - tmpSpaceToUse } if not toSkip: newNucleusList[tmpNucleus] = tmpNucleusSpec nucleusList = newNucleusList tmpLog.info( '{0} candidates passed endpoint check {1} TB'. format(len(nucleusList), diskThreshold / 1024)) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # ability to execute jobs newNucleusList = {} # get all panda sites tmpSiteList = [] for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): tmpSiteList += tmpNucleusSpec.allPandaSites tmpSiteList = list(set(tmpSiteList)) tmpLog.debug('===== start for job check') jobBroker = AtlasProdJobBroker(self.ddmIF, self.taskBufferIF) tmpSt, tmpRet = jobBroker.doBrokerage( taskSpec, taskSpec.cloud, inputChunk, None, True, tmpSiteList, tmpLog) tmpLog.debug('===== done for job check') if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error('no sites can run jobs') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue okNuclei = set() for tmpSite in tmpRet: siteSpec = siteMapper.getSite(tmpSite) okNuclei.add(siteSpec.pandasite) for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleus in okNuclei: newNucleusList[tmpNucleus] = tmpNucleusSpec else: tmpLog.info( ' skip nucleus={0} due to missing ability to run jobs criteria=-job' .format(tmpNucleus)) nucleusList = newNucleusList tmpLog.info('{0} candidates passed 
job check'.format( len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) continue ###################################### # data locality toSkip = False availableData = {} for datasetSpec in inputChunk.getDatasets(): # only for real datasets if datasetSpec.isPseudo(): continue # ignore DBR if DataServiceUtils.isDBR(datasetSpec.datasetName): continue # skip locality check if DataServiceUtils.getDatasetType( datasetSpec.datasetName ) in datasetTypeToSkipCheck: continue # primary only if taskParamMap.get( 'taskBrokerOnMaster' ) is True and not datasetSpec.isMaster(): continue # use deep scan for primary dataset unless data carousel if datasetSpec.isMaster( ) and not taskSpec.inputPreStaging(): deepScan = True else: deepScan = False # get nuclei where data is available tmpSt, tmpRet = AtlasBrokerUtils.getNucleiWithData( siteMapper, self.ddmIF, datasetSpec.datasetName, list(nucleusList.keys()), deepScan) if tmpSt != Interaction.SC_SUCCEEDED: tmpLog.error( 'failed to get nuclei where data is available, since {0}' .format(tmpRet)) taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) toSkip = True break # sum for tmpNucleus, tmpVals in iteritems(tmpRet): if tmpNucleus not in availableData: availableData[tmpNucleus] = tmpVals else: availableData[tmpNucleus] = dict( (k, v + tmpVals[k]) for (k, v) in iteritems( availableData[tmpNucleus])) if toSkip: continue if availableData != {}: newNucleusList = {} # skip if no data skipMsgList = [] for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if taskSpec.inputPreStaging( ) and availableData[tmpNucleus][ 'ava_num_any'] > 0: # use incomplete replicas for data carousel since the completeness is guaranteed newNucleusList[tmpNucleus] = tmpNucleusSpec elif availableData[tmpNucleus]['tot_size'] > thrInputSize and \ availableData[tmpNucleus]['ava_size_any'] < availableData[tmpNucleus]['tot_size'] 
* thrInputSizeFrac: tmpMsg = ' skip nucleus={0} due to insufficient input size {1}B < {2}*{3} criteria=-insize'.format( tmpNucleus, availableData[tmpNucleus] ['ava_size_any'], availableData[tmpNucleus]['tot_size'], thrInputSizeFrac) skipMsgList.append(tmpMsg) elif availableData[tmpNucleus]['tot_num'] > thrInputNum and \ availableData[tmpNucleus]['ava_num_any'] < availableData[tmpNucleus]['tot_num'] * thrInputNumFrac: tmpMsg = ' skip nucleus={0} due to short number of input files {1} < {2}*{3} criteria=-innum'.format( tmpNucleus, availableData[tmpNucleus] ['ava_num_any'], availableData[tmpNucleus]['tot_num'], thrInputNumFrac) skipMsgList.append(tmpMsg) else: newNucleusList[tmpNucleus] = tmpNucleusSpec totInputSize = list(availableData.values( ))[0]['tot_size'] / 1024 / 1024 / 1024 data_locality_check_str = ( '(ioIntensity ({0}) is None or less than {1} kBPerS ' 'and input size ({2} GB) is less than {3}) ' 'or task.currentPriority ({4}) is higher than or equal to {5}' ).format(taskSpec.ioIntensity, minIoIntensityWithLD, int(totInputSize), minInputSizeWithLD, taskSpec.currentPriority, maxTaskPrioWithLD) if len(newNucleusList) > 0: nucleusList = newNucleusList for tmpMsg in skipMsgList: tmpLog.info(tmpMsg) elif ((taskSpec.ioIntensity is None or taskSpec.ioIntensity <= minIoIntensityWithLD) and totInputSize <= minInputSizeWithLD) \ or taskSpec.currentPriority >= maxTaskPrioWithLD: availableData = {} tmpLog.info( ' disable data locality check since no nucleus has input data, {}' .format(data_locality_check_str)) else: # no candidate + unavoidable data locality check nucleusList = newNucleusList for tmpMsg in skipMsgList: tmpLog.info(tmpMsg) tmpLog.info( ' the following conditions required to disable data locality check: {}' .format(data_locality_check_str)) tmpLog.info( '{0} candidates passed data check'.format( len(nucleusList))) if nucleusList == {}: tmpLog.error('no candidates') taskSpec.setErrDiag( tmpLog.uploadLog(taskSpec.jediTaskID)) self.sendLogMessage(tmpLog) 
continue ###################################### # weight self.prioRW.acquire() nucleusRW = self.prioRW[taskSpec.currentPriority] self.prioRW.release() totalWeight = 0 nucleusweights = [] for tmpNucleus, tmpNucleusSpec in iteritems( nucleusList): if tmpNucleus not in nucleusRW: nucleusRW[tmpNucleus] = 0 wStr = '1' # with RW if tmpNucleus in nucleusRW and nucleusRW[ tmpNucleus] >= cutOffRW: weight = 1 / float(nucleusRW[tmpNucleus]) wStr += '/( RW={0} )'.format( nucleusRW[tmpNucleus]) else: weight = 1 wStr += '/(1 : RW={0}<{1})'.format( nucleusRW[tmpNucleus], cutOffRW) # with data if availableData != {}: if availableData[tmpNucleus]['tot_size'] > 0: weight *= float(availableData[tmpNucleus] ['ava_size_any']) weight /= float( availableData[tmpNucleus]['tot_size']) wStr += '* ( available_input_size_DISKTAPE={0} )'.format( availableData[tmpNucleus] ['ava_size_any']) wStr += '/ ( total_input_size={0} )'.format( availableData[tmpNucleus]['tot_size']) # negative weight for tape if availableData[tmpNucleus][ 'ava_size_any'] > availableData[ tmpNucleus]['ava_size_disk']: weight *= negWeightTape wStr += '*( weight_TAPE={0} )'.format( negWeightTape) # fraction of free space if tmpNucleus in fractionFreeSpace: try: tmpFrac = float(fractionFreeSpace[tmpNucleus]['free']) / \ float(fractionFreeSpace[tmpNucleus]['total']) weight *= tmpFrac wStr += '*( free_space={0} )/( total_space={1} )'.format( fractionFreeSpace[tmpNucleus]['free'], fractionFreeSpace[tmpNucleus]['total']) except Exception: pass tmpLog.info( ' use nucleus={0} weight={1} {2} criteria=+use' .format(tmpNucleus, weight, wStr)) totalWeight += weight nucleusweights.append((tmpNucleus, weight)) tmpLog.info('final {0} candidates'.format( len(nucleusList))) ###################################### # final selection tgtWeight = random.uniform(0, totalWeight) candidateNucleus = None for tmpNucleus, weight in nucleusweights: tgtWeight -= weight if tgtWeight <= 0: candidateNucleus = tmpNucleus break if candidateNucleus is None: 
candidateNucleus = nucleusweights[-1][0] ###################################### # update nucleusSpec = nucleusList[candidateNucleus] # get output/log datasets tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( taskSpec.jediTaskID, ['output', 'log']) # get destinations retMap = { taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus( nucleusSpec, tmpDatasetSpecs) } tmpRet = self.taskBufferIF.setCloudToTasks_JEDI(retMap) tmpLog.info( ' set nucleus={0} with {1} criteria=+set'.format( candidateNucleus, tmpRet)) self.sendLogMessage(tmpLog) if tmpRet: tmpMsg = 'set task_status=ready' tmpLog.sendMsg(tmpMsg, self.msgType) # update RW table self.prioRW.acquire() for prio, rwMap in iteritems(self.prioRW): if prio > taskSpec.currentPriority: continue if candidateNucleus in rwMap: rwMap[candidateNucleus] += taskRW else: rwMap[candidateNucleus] = taskRW self.prioRW.release() except Exception: errtype, errvalue = sys.exc_info()[:2] errMsg = '{0}.runImpl() failed with {1} {2} '.format( self.__class__.__name__, errtype.__name__, errvalue) errMsg += 'lastJediTaskID={0} '.format(lastJediTaskID) errMsg += traceback.format_exc() logger.error(errMsg)
# doSetup(taskSpec, datasetToRegister, pandaJobs)
#
# Purpose: register the output/log datasets (and their containers) of a task in
# DDM before jobs are submitted.
#
# Parameters, as used by the body:
#   taskSpec          - task object; the body reads jediTaskID, prodSourceLabel,
#                       taskType, vo, cloud, campaign, workingGroup, userName and site.
#   datasetToRegister - list of datasetIDs.  For prodSourceLabel 'user' the
#                       datasetIDs of output/log files of jobs with
#                       produceUnMerge() false are appended to this same list
#                       object, so the caller's list is modified.
#   pandaJobs         - iterable of job objects exposing .Files and produceUnMerge().
#
# Returns: self.SC_SUCCEEDED (retOK) on success; self.SC_FATAL (retFatal) when a
# datasetSpec cannot be fetched, when registerNewDataset / registerDatasetLocation /
# addDatasetsToContainer report failure, or when any exception is raised (the
# exception is logged and the uploaded log is stored with taskSpec.setErrDiag).
# retTmpError is assigned but never returned in this body.
#
# Main steps visible in the code:
#   * for each datasetID, load the datasetSpec (pseudo datasets are skipped) and,
#     for the dataset name and the container name, ask ddmIF.listDatasets();
#     names that are not found are registered with registerNewDataset().
#   * lifetime is 365 for type 'trn_log' of a 'managed' task and 14 otherwise when
#     the name starts with 'panda'; None for all other names.
#   * metadata (task_id, optional campaign/transient) is attached only for
#     'managed'/'test' tasks and only to the dataset itself, not the container.
#   * for user tasks or distributed destinations a location rule is registered;
#     user 'output' datasets may additionally get a two-copy rule unless the
#     site's catchall contains 'skip_2nd_copy'.
#   * the dataset is added to its container when missing, then its status is set
#     to 'registered' and written back with updateDataset_JEDI().
#   * if taskSpec.registerEsFiles(): a hidden ES dataset is registered with a
#     'type=DATADISK' rule.
#   * for 'managed'/'test' tasks every destinationDBlock of output/log files is
#     opened and its 'lifetime' metadata is set to None.
#
# NOTE(review): this definition is stored with its line breaks and indentation
# collapsed, so block nesting cannot be read off the text - confirm against a
# correctly indented copy before changing logic.
# NOTE(review): `activity` and `userName` used by the double-copy registration
# are assigned only in some branches (`activity` only when tmpToRegister is
# true) - presumably they can be unbound or carried over from a previous
# iteration; verify once the nesting is known.
def doSetup(self,taskSpec,datasetToRegister,pandaJobs): # make logger tmpLog = MsgWrapper(logger,"< jediTaskID={0} >".format(taskSpec.jediTaskID)) tmpLog.info('start label={0} taskType={1}'.format(taskSpec.prodSourceLabel,taskSpec.taskType)) # returns retFatal = self.SC_FATAL retTmpError = self.SC_FAILED retOK = self.SC_SUCCEEDED try: # get DDM I/F ddmIF = self.ddmIF.getInterface(taskSpec.vo) # register datasets if datasetToRegister != [] or taskSpec.prodSourceLabel in ['user']: # prod vs anal userSetup = False if taskSpec.prodSourceLabel in ['user']: userSetup = True # collect datasetID to register datasets/containers just in case for tmpPandaJob in pandaJobs: if not tmpPandaJob.produceUnMerge(): for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.datasetID not in datasetToRegister: datasetToRegister.append(tmpFileSpec.datasetID) tmpLog.info('datasetToRegister={0}'.format(str(datasetToRegister))) # get site mapper siteMapper = self.taskBufferIF.getSiteMapper() # loop over all datasets avDatasetList = [] cnDatasetMap = {} for datasetID in datasetToRegister: # get output and log datasets tmpLog.info('getting datasetSpec with datasetID={0}'.format(datasetID)) tmpStat,datasetSpec = self.taskBufferIF.getDatasetWithID_JEDI(taskSpec.jediTaskID, datasetID) if not tmpStat: tmpLog.error('failed to get output and log datasets') return retFatal if datasetSpec.isPseudo(): tmpLog.info('skip pseudo dataset') continue # DDM backend ddmBackEnd = taskSpec.getDdmBackEnd() tmpLog.info('checking {0}'.format(datasetSpec.datasetName)) # check if dataset and container are available in DDM for targetName in [datasetSpec.datasetName,datasetSpec.containerName]: if targetName is None: continue if targetName not in avDatasetList: # set lifetime if targetName.startswith('panda'): if datasetSpec.type == 'trn_log' and taskSpec.prodSourceLabel == 'managed': lifetime = 365 else: lifetime = 14 else: lifetime = None # check dataset/container in DDM tmpList = 
ddmIF.listDatasets(targetName) if tmpList == []: # get location location = None locForRule = None if targetName == datasetSpec.datasetName: # dataset if datasetSpec.site in ['',None]: if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: locForRule = datasetSpec.destination elif DataServiceUtils.getDestinationSE(datasetSpec.storageToken) is not None: location = DataServiceUtils.getDestinationSE(datasetSpec.storageToken) elif taskSpec.cloud is not None: # use T1 SE tmpT1Name = siteMapper.getCloud(taskSpec.cloud)['source'] location = siteMapper.getDdmEndpoint(tmpT1Name, datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) else: tmpLog.info('site={0} token={1}'.format(datasetSpec.site, datasetSpec.storageToken)) location = siteMapper.getDdmEndpoint(datasetSpec.site,datasetSpec.storageToken, taskSpec.prodSourceLabel, JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType)) if locForRule is None: locForRule = location # set metadata if taskSpec.prodSourceLabel in ['managed','test'] and targetName == datasetSpec.datasetName: metaData = {} metaData['task_id'] = taskSpec.jediTaskID if taskSpec.campaign not in [None,'']: metaData['campaign'] = taskSpec.campaign if datasetSpec.getTransient() is not None: metaData['transient'] = datasetSpec.getTransient() else: metaData = None # register dataset/container tmpLog.info('registering {0} with location={1} backend={2} lifetime={3} meta={4}'.format(targetName, location, ddmBackEnd, lifetime, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,backEnd=ddmBackEnd,location=location, lifetime=lifetime,metaData=metaData) if not tmpStat: tmpLog.error('failed to register {0}'.format(targetName)) return retFatal # procedures for user if userSetup or DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: # register location tmpToRegister = False if userSetup and targetName == datasetSpec.datasetName and 
datasetSpec.site not in ['',None]: if taskSpec.workingGroup: userName = taskSpec.workingGroup else: userName = taskSpec.userName grouping = None tmpToRegister = True elif DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None: userName = None grouping = 'NONE' tmpToRegister = True if tmpToRegister: activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) tmpLog.info('registering location={} lifetime={} days activity={} grouping={} ' 'owner={}'.format(locForRule, lifetime, activity, grouping, userName)) tmpStat = ddmIF.registerDatasetLocation(targetName,locForRule,owner=userName, lifetime=lifetime,backEnd=ddmBackEnd, activity=activity,grouping=grouping) if not tmpStat: tmpLog.error('failed to register location {0} for {1}'.format(locForRule, targetName)) return retFatal # double copy if userSetup and datasetSpec.type == 'output': if datasetSpec.destination != datasetSpec.site: tmpLog.info('skip making double copy as destination={0} is not site={1}'.format(datasetSpec.destination, datasetSpec.site)) else: second_copy = True try: if taskSpec.site: panda_site = siteMapper.getSite(taskSpec.site) if panda_site.catchall and 'skip_2nd_copy' in panda_site.catchall: tmpLog.info('skip making double copy as specified in {0} catchall'.format(panda_site)) second_copy = False except Exception: second_copy = True if second_copy: locForDouble = '(type=SCRATCHDISK)\\notforextracopy=True' tmpMsg = 'registering double copy ' tmpMsg += 'location="{0}" lifetime={1}days activity={2} for dataset={3}'.format(locForDouble,lifetime, activity,targetName) tmpLog.info(tmpMsg) tmpStat = ddmIF.registerDatasetLocation(targetName,locForDouble,copies=2,owner=userName, lifetime=lifetime,activity=activity, grouping='NONE',weight='freespace', ignore_availability=False) if not tmpStat: tmpLog.error('failed to register double copylocation {0} for {1}'.format(locForDouble, targetName)) return retFatal avDatasetList.append(targetName) else: tmpLog.info('{0} already 
registered'.format(targetName)) # check if dataset is in the container if datasetSpec.containerName is not None and datasetSpec.containerName != datasetSpec.datasetName: # get list of constituent datasets in the container if datasetSpec.containerName not in cnDatasetMap: cnDatasetMap[datasetSpec.containerName] = ddmIF.listDatasetsInContainer(datasetSpec.containerName) # add dataset if datasetSpec.datasetName not in cnDatasetMap[datasetSpec.containerName]: tmpLog.info('adding {0} to {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) tmpStat = ddmIF.addDatasetsToContainer(datasetSpec.containerName,[datasetSpec.datasetName], backEnd=ddmBackEnd) if not tmpStat: tmpLog.error('failed to add {0} to {1}'.format(datasetSpec.datasetName, datasetSpec.containerName)) return retFatal cnDatasetMap[datasetSpec.containerName].append(datasetSpec.datasetName) else: tmpLog.info('{0} already in {1}'.format(datasetSpec.datasetName,datasetSpec.containerName)) # update dataset datasetSpec.status = 'registered' self.taskBufferIF.updateDataset_JEDI(datasetSpec,{'jediTaskID':taskSpec.jediTaskID, 'datasetID':datasetID}) # register ES datasets if taskSpec.registerEsFiles(): targetName = EventServiceUtils.getEsDatasetName(taskSpec.jediTaskID) location = None metaData = {} metaData['task_id'] = taskSpec.jediTaskID metaData['hidden'] = True tmpLog.info('registering ES dataset {0} with location={1} meta={2}'.format(targetName, location, str(metaData))) tmpStat = ddmIF.registerNewDataset(targetName,location=location,metaData=metaData, resurrect=True) if not tmpStat: tmpLog.error('failed to register ES dataset {0}'.format(targetName)) return retFatal # register rule location = 'type=DATADISK' activity = DataServiceUtils.getActivityForOut(taskSpec.prodSourceLabel) grouping = 'NONE' tmpLog.info('registering location={0} activity={1} grouping={2}'.format(location, activity, grouping)) tmpStat = ddmIF.registerDatasetLocation(targetName,location,activity=activity, grouping=grouping) if not 
tmpStat: tmpLog.error('failed to register location {0} with {2} for {1}'.format(location, targetName, activity)) return retFatal # open datasets if taskSpec.prodSourceLabel in ['managed','test']: # get the list of output/log datasets outDatasetList = [] for tmpPandaJob in pandaJobs: for tmpFileSpec in tmpPandaJob.Files: if tmpFileSpec.type in ['output','log']: if tmpFileSpec.destinationDBlock not in outDatasetList: outDatasetList.append(tmpFileSpec.destinationDBlock) # open datasets for outDataset in outDatasetList: tmpLog.info('open {0}'.format(outDataset)) ddmIF.openDataset(outDataset) # unset lifetime ddmIF.setDatasetMetadata(outDataset,'lifetime',None) # return tmpLog.info('done') return retOK except Exception: errtype,errvalue = sys.exc_info()[:2] tmpLog.error('doSetup failed with {0}:{1}'.format(errtype.__name__,errvalue)) taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID)) return retFatal
def runImpl(self):
    """Check the assignment of tasks taken from the shared task list.

    Each round takes up to 100 jediTaskIDs from ``self.taskList``, loads the
    corresponding task specs, obtains the implementation for
    ``self.vo``/``self.prodSourceLabel`` from ``self.implFactory``, lets it
    run ``doCheck`` on the specs and stores the returned map through
    ``setCloudToTasks_JEDI``.  The method returns once the shared list hands
    out no more items.  Any exception raised inside a round is logged and
    the loop continues with the next round.
    """
    batchSize = 100
    while True:
        try:
            # take the next slice of the shared list
            jediTaskIDs = self.taskList.get(batchSize)
            totalTasks, idxTasks = self.taskList.stat()
            if len(jediTaskIDs) == 0:
                # the list is exhausted: this worker is done
                self.logger.debug(
                    '{0} terminating since no more items'.format(
                        self.__class__.__name__))
                return
            tmpLog = MsgWrapper(self.logger)
            tmpLog.info(
                'start TaskCheckerThread {0}/{1} for jediTaskID={2}'.format(
                    idxTasks, totalTasks, jediTaskIDs))
            # resolve the task specs; unresolved IDs are only reported
            specList = []
            for jediTaskID in jediTaskIDs:
                gotSpec, spec = self.taskBufferIF.getTaskWithID_JEDI(
                    jediTaskID, False)
                if not gotSpec or spec is None:
                    tmpLog.error(
                        'failed to get taskSpec for jediTaskID={0}'.format(
                            jediTaskID))
                    continue
                specList.append(spec)
            if not specList:
                # nothing to check in this round
                continue
            status = Interaction.SC_SUCCEEDED
            # look up the implementation for this vo/label
            tmpLog.info('getting Impl')
            try:
                impl = self.implFactory.getImpl(self.vo, self.prodSourceLabel)
                if impl is None:
                    # task brokerage is undefined
                    tmpLog.error(
                        'task broker is undefined for vo={0} sourceLabel={1}'
                        .format(self.vo, self.prodSourceLabel))
                    status = Interaction.SC_FAILED
            except Exception as implErr:
                tmpLog.error('getImpl failed with {0}:{1}'.format(
                    type(implErr).__name__, implErr))
                status = Interaction.SC_FAILED
            # run the check only when an implementation is available
            if status == Interaction.SC_SUCCEEDED:
                tmpLog.info('check with {0}'.format(impl.__class__.__name__))
                try:
                    status, taskCloudMap = impl.doCheck(specList)
                except Exception as checkErr:
                    tmpLog.error('doCheck failed with {0}:{1}'.format(
                        type(checkErr).__name__, checkErr))
                    status = Interaction.SC_FAILED
            if status != Interaction.SC_SUCCEEDED:
                tmpLog.error('failed to check assignment')
                continue
            # store the result of the check
            updateRet = self.taskBufferIF.setCloudToTasks_JEDI(taskCloudMap)
            tmpLog.info('done with {0} for {1}'.format(
                updateRet, str(taskCloudMap)))
        except Exception as roundErr:
            logger.error('{0} failed in runImpl() with {1}:{2}'.format(
                self.__class__.__name__, type(roundErr).__name__, roundErr))
# runImpl()
#
# Purpose: worker loop that feeds the file contents of input datasets into the
# task database.
#
# Each round takes up to 10 (jediTaskID, dsList) pairs from self.taskDsList and
# returns when the list hands out nothing more.  For every task it:
#   * loads the task spec (the task is skipped when that fails) and decodes the
#     task parameters with RefinerUtils.decodeJSON;
#   * when taskSpec.noWaitParent() is set and the parent task is 'running',
#     collects the names of the parent's output/log datasets;
#   * for every dataset in dsList: gets the dataset metadata from DDM (pseudo
#     datasets get {'state': 'closed'}; the state is replaced by 'mutable' when
#     the parent is running or the task runs until the dataset is closed and the
#     dataset is open or produced by the parent), builds the file list (from
#     DDM minus lost files, or dummy entries for seq_number / pseudo / PFN-list
#     datasets) and passes it to taskBufferIF.insertFilesForDataset_JEDI;
#   * finally sets the task to 'tobroken', puts it on hold, updates its status,
#     or only unlocks it with unlockSingleTask_JEDI, depending on the
#     taskBroken / taskOnHold / allUpdated / runningTask flags.
# Defaults visible in the body: nChunksForScout = 10, maxAttempt = 3 when not
# given and auto retry is enabled, seqDefNumRecords = 10000.
# Any exception escaping a round is logged through the module-level logger and
# the loop continues.
#
# NOTE(review): written for Python 2 (dict.has_key, iteritems, bare except,
# integer results expected from `/` when computing nPFN for range()).
# NOTE(review): this definition is stored with its line breaks and indentation
# collapsed, so block nesting cannot be read off the text - confirm against a
# correctly indented copy before changing logic.
# NOTE(review): when decoding the task parameters fails only taskBroken is set;
# taskParamMap is not assigned on that path although later statements read it -
# presumably it is then unbound or left over from the previous task; verify.
def runImpl(self): while True: try: # get a part of list nTasks = 10 taskDsList = self.taskDsList.get(nTasks) # no more datasets if len(taskDsList) == 0: self.logger.debug('%s terminating since no more items' % self.__class__.__name__) return # loop over all tasks for jediTaskID, dsList in taskDsList: allUpdated = True taskBroken = False taskOnHold = False runningTask = False missingMap = {} # make logger tmpLog = MsgWrapper( self.logger, '< jediTaskID={0} >'.format(jediTaskID)) # get task tmpStat, taskSpec = self.taskBufferIF.getTaskWithID_JEDI( jediTaskID, False, True, self.pid, 10) if not tmpStat or taskSpec == None: tmpLog.error( 'failed to get taskSpec for jediTaskID={0}'.format( jediTaskID)) continue try: # get task parameters taskParam = self.taskBufferIF.getTaskParamsWithID_JEDI( jediTaskID) taskParamMap = RefinerUtils.decodeJSON(taskParam) except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( 'task param conversion from json failed with {0}:{1}' .format(errtype.__name__, errvalue)) taskBroken = True # renaming of parameters if taskParamMap.has_key('nEventsPerInputFile'): taskParamMap['nEventsPerFile'] = taskParamMap[ 'nEventsPerInputFile'] # the number of files per job nFilesPerJob = None if taskParamMap.has_key('nFilesPerJob'): nFilesPerJob = taskParamMap['nFilesPerJob'] # the number of chunks used by scout nChunksForScout = 10 # load XML if taskSpec.useLoadXML(): xmlConfig = taskParamMap['loadXML'] else: xmlConfig = None # skip files used by another task if 'skipFilesUsedBy' in taskParamMap: skipFilesUsedBy = taskParamMap['skipFilesUsedBy'] else: skipFilesUsedBy = None # check no wait noWaitParent = False parentOutDatasets = set() if taskSpec.noWaitParent() and not taskSpec.parent_tid in [ None, taskSpec.jediTaskID ]: tmpStat = self.taskBufferIF.checkParentTask_JEDI( taskSpec.parent_tid) if tmpStat == 'running': noWaitParent = True # get output datasets from parent task tmpParentStat, tmpParentOutDatasets = 
self.taskBufferIF.getDatasetsWithJediTaskID_JEDI( taskSpec.parent_tid, ['output', 'log']) # collect dataset names for tmpParentOutDataset in tmpParentOutDatasets: parentOutDatasets.add( tmpParentOutDataset.datasetName) # loop over all datasets nFilesMaster = 0 checkedMaster = False setFrozenTime = True if not taskBroken: ddmIF = self.ddmIF.getInterface(taskSpec.vo) origNumFiles = None if taskParamMap.has_key('nFiles'): origNumFiles = taskParamMap['nFiles'] for datasetSpec in dsList: tmpLog.debug('start loop for {0}(id={1})'.format( datasetSpec.datasetName, datasetSpec.datasetID)) # get dataset metadata tmpLog.debug('get metadata') gotMetadata = False stateUpdateTime = datetime.datetime.utcnow() try: if not datasetSpec.isPseudo(): tmpMetadata = ddmIF.getDatasetMetaData( datasetSpec.datasetName) else: # dummy metadata for pseudo dataset tmpMetadata = {'state': 'closed'} # set mutable when and the dataset is open and parent is running or task is configured to run until the dataset is closed if (noWaitParent or taskSpec.runUntilClosed()) and \ (tmpMetadata['state'] == 'open' \ or datasetSpec.datasetName in parentOutDatasets \ or datasetSpec.datasetName.split(':')[-1] in parentOutDatasets): # dummy metadata when parent is running tmpMetadata = {'state': 'mutable'} gotMetadata = True except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( '{0} failed to get metadata to {1}:{2}'. 
format(self.__class__.__name__, errtype.__name__, errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus( datasetSpec, datasetStatus, tmpLog) else: if not taskSpec.ignoreMissingInDS(): # temporary error taskOnHold = True else: # ignore missing datasetStatus = 'failed' # update dataset status self.updateDatasetStatus( datasetSpec, datasetStatus, tmpLog) taskSpec.setErrDiag( 'failed to get metadata for {0}'.format( datasetSpec.datasetName)) if not taskSpec.ignoreMissingInDS(): allUpdated = False else: # get file list specified in task parameters fileList, includePatt, excludePatt = RefinerUtils.extractFileList( taskParamMap, datasetSpec.datasetName) # get the number of events in metadata if taskParamMap.has_key( 'getNumEventsInMetadata'): getNumEvents = True else: getNumEvents = False # get file list from DDM tmpLog.debug('get files') try: useInFilesWithNewAttemptNr = False skipDuplicate = not datasetSpec.useDuplicatedFiles( ) if not datasetSpec.isPseudo(): if fileList != [] and taskParamMap.has_key('useInFilesInContainer') and \ not datasetSpec.containerName in ['',None]: # read files from container if file list is specified in task parameters tmpDatasetName = datasetSpec.containerName else: tmpDatasetName = datasetSpec.datasetName # use long format for LB longFormat = False if taskSpec.respectLumiblock(): longFormat = True tmpRet = ddmIF.getFilesInDataset( tmpDatasetName, getNumEvents=getNumEvents, skipDuplicate=skipDuplicate, longFormat=longFormat) tmpLog.debug( 'got {0} files in {1}'.format( len(tmpRet), tmpDatasetName)) # remove lost files tmpLostFiles = ddmIF.findLostFiles( tmpDatasetName, tmpRet) if tmpLostFiles != {}: tmpLog.debug( 'found {0} lost files in {1}'. 
format(len(tmpLostFiles), tmpDatasetName)) for tmpListGUID, tmpLostLFN in tmpLostFiles.iteritems( ): tmpLog.debug( 'removed {0}'.format( tmpLostLFN)) del tmpRet[tmpListGUID] else: if datasetSpec.isSeqNumber(): # make dummy files for seq_number if datasetSpec.getNumRecords( ) != None: nPFN = datasetSpec.getNumRecords( ) elif origNumFiles != None: nPFN = origNumFiles if taskParamMap.has_key('nEventsPerJob') and taskParamMap.has_key('nEventsPerFile') \ and taskParamMap['nEventsPerFile'] > taskParamMap['nEventsPerJob']: nPFN = nPFN * taskParamMap[ 'nEventsPerFile'] / taskParamMap[ 'nEventsPerJob'] elif taskParamMap.has_key( 'nEventsPerFile' ) and taskParamMap.has_key( 'nEventsPerRange'): nPFN = nPFN * taskParamMap[ 'nEventsPerFile'] / taskParamMap[ 'nEventsPerRange'] elif 'nEvents' in taskParamMap and 'nEventsPerJob' in taskParamMap: nPFN = taskParamMap[ 'nEvents'] / taskParamMap[ 'nEventsPerJob'] elif 'nEvents' in taskParamMap and 'nEventsPerFile' in taskParamMap \ and 'nFilesPerJob' in taskParamMap: nPFN = taskParamMap[ 'nEvents'] / taskParamMap[ 'nEventsPerFile'] / taskParamMap[ 'nFilesPerJob'] else: # the default number of records for seq_number seqDefNumRecords = 10000 # get nFiles of the master tmpMasterAtt = self.taskBufferIF.getDatasetAttributes_JEDI( datasetSpec.jediTaskID, datasetSpec.masterID, ['nFiles']) # use nFiles of the master as the number of records if it is larger than the default if 'nFiles' in tmpMasterAtt and tmpMasterAtt[ 'nFiles'] > seqDefNumRecords: nPFN = tmpMasterAtt[ 'nFiles'] else: nPFN = seqDefNumRecords # check usedBy if skipFilesUsedBy != None: for tmpJediTaskID in str( skipFilesUsedBy ).split(','): tmpParentAtt = self.taskBufferIF.getDatasetAttributesWithMap_JEDI( tmpJediTaskID, { 'datasetName': datasetSpec. 
datasetName }, ['nFiles']) if 'nFiles' in tmpParentAtt and tmpParentAtt[ 'nFiles']: nPFN += tmpParentAtt[ 'nFiles'] tmpRet = {} # get offset tmpOffset = datasetSpec.getOffset() tmpOffset += 1 for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = { 'lfn': iPFN + tmpOffset, 'scope': None, 'filesize': 0, 'checksum': None, } elif not taskSpec.useListPFN(): # dummy file list for pseudo dataset tmpRet = { str(uuid.uuid4()): { 'lfn': 'pseudo_lfn', 'scope': None, 'filesize': 0, 'checksum': None, } } else: # make dummy file list for PFN list if taskParamMap.has_key('nFiles'): nPFN = taskParamMap['nFiles'] else: nPFN = 1 tmpRet = {} for iPFN in range(nPFN): tmpRet[str(uuid.uuid4())] = { 'lfn': '{0:06d}:{1}'.format( iPFN, taskParamMap['pfnList'] [iPFN].split('/')[-1]), 'scope': None, 'filesize': 0, 'checksum': None, } except: errtype, errvalue = sys.exc_info()[:2] tmpLog.error( 'failed to get files due to {0}:{1} {2}' .format(self.__class__.__name__, errtype.__name__, errvalue)) if errtype == Interaction.JEDIFatalError: # fatal error datasetStatus = 'broken' taskBroken = True # update dataset status self.updateDatasetStatus( datasetSpec, datasetStatus, tmpLog) else: # temporary error taskOnHold = True taskSpec.setErrDiag( 'failed to get files for {0}'.format( datasetSpec.datasetName)) allUpdated = False else: # parameters for master input respectLB = False useRealNumEvents = False if datasetSpec.isMaster(): # respect LB boundaries respectLB = taskSpec.respectLumiblock() # use real number of events useRealNumEvents = taskSpec.useRealNumEvents( ) # the number of events per file nEventsPerFile = None nEventsPerJob = None nEventsPerRange = None tgtNumEventsPerJob = None if (datasetSpec.isMaster() and (taskParamMap.has_key('nEventsPerFile') or useRealNumEvents)) or \ (datasetSpec.isPseudo() and taskParamMap.has_key('nEvents') and not datasetSpec.isSeqNumber()): if taskParamMap.has_key( 'nEventsPerFile'): nEventsPerFile = taskParamMap[ 'nEventsPerFile'] elif datasetSpec.isMaster( ) 
and datasetSpec.isPseudo( ) and taskParamMap.has_key('nEvents'): # use nEvents as nEventsPerFile for pseudo input nEventsPerFile = taskParamMap[ 'nEvents'] if taskParamMap.has_key( 'nEventsPerJob'): nEventsPerJob = taskParamMap[ 'nEventsPerJob'] elif taskParamMap.has_key( 'nEventsPerRange'): nEventsPerRange = taskParamMap[ 'nEventsPerRange'] if 'tgtNumEventsPerJob' in taskParamMap: tgtNumEventsPerJob = taskParamMap[ 'tgtNumEventsPerJob'] # reset nEventsPerJob nEventsPerJob = None # max attempts maxAttempt = None maxFailure = None if datasetSpec.isMaster( ) or datasetSpec.toKeepTrack(): # max attempts if taskSpec.disableAutoRetry(): # disable auto retry maxAttempt = 1 elif taskParamMap.has_key( 'maxAttempt'): maxAttempt = taskParamMap[ 'maxAttempt'] else: # use default value maxAttempt = 3 # max failure if 'maxFailure' in taskParamMap: maxFailure = taskParamMap[ 'maxFailure'] # first event number firstEventNumber = None if datasetSpec.isMaster(): # first event number firstEventNumber = 1 + taskSpec.getFirstEventOffset( ) # nMaxEvents nMaxEvents = None if datasetSpec.isMaster( ) and taskParamMap.has_key('nEvents'): nMaxEvents = taskParamMap['nEvents'] # nMaxFiles nMaxFiles = None if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): nMaxFiles = taskParamMap['nFiles'] else: # calculate for secondary nMaxFiles = datasetSpec.getNumMultByRatio( origNumFiles) # multipled by the number of jobs per file for event-level splitting if nMaxFiles != None and taskParamMap.has_key( 'nEventsPerFile'): if taskParamMap.has_key( 'nEventsPerJob'): if taskParamMap[ 'nEventsPerFile'] > taskParamMap[ 'nEventsPerJob']: nMaxFiles *= float( taskParamMap[ 'nEventsPerFile'] ) / float(taskParamMap[ 'nEventsPerJob']) nMaxFiles = int( math.ceil( nMaxFiles)) elif taskParamMap.has_key( 'nEventsPerRange'): if taskParamMap[ 'nEventsPerFile'] > taskParamMap[ 'nEventsPerRange']: nMaxFiles *= float( taskParamMap[ 'nEventsPerFile'] ) / float(taskParamMap[ 'nEventsPerRange']) nMaxFiles = int( 
math.ceil( nMaxFiles)) # use scout useScout = False if datasetSpec.isMaster( ) and taskSpec.useScout() and ( datasetSpec.status != 'toupdate' or not taskSpec.isPostScout()): useScout = True # use files with new attempt numbers useFilesWithNewAttemptNr = False if not datasetSpec.isPseudo( ) and fileList != [] and taskParamMap.has_key( 'useInFilesWithNewAttemptNr'): useFilesWithNewAttemptNr = True #ramCount ramCount = 0 # feed files to the contents table tmpLog.debug('update contents') retDB, missingFileList, nFilesUnique, diagMap = self.taskBufferIF.insertFilesForDataset_JEDI( datasetSpec, tmpRet, tmpMetadata['state'], stateUpdateTime, nEventsPerFile, nEventsPerJob, maxAttempt, firstEventNumber, nMaxFiles, nMaxEvents, useScout, fileList, useFilesWithNewAttemptNr, nFilesPerJob, nEventsPerRange, nChunksForScout, includePatt, excludePatt, xmlConfig, noWaitParent, taskSpec.parent_tid, self.pid, maxFailure, useRealNumEvents, respectLB, tgtNumEventsPerJob, skipFilesUsedBy, ramCount) if retDB == False: taskSpec.setErrDiag( 'failed to insert files for {0}. 
{1}' .format(datasetSpec.datasetName, diagMap['errMsg'])) allUpdated = False taskBroken = True break elif retDB == None: # the dataset is locked by another or status is not applicable allUpdated = False tmpLog.debug( 'escape since task or dataset is locked' ) break elif missingFileList != []: # files are missing tmpErrStr = '{0} files missing in {1}'.format( len(missingFileList), datasetSpec.datasetName) tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) allUpdated = False taskOnHold = True missingMap[datasetSpec.datasetName] = { 'datasetSpec': datasetSpec, 'missingFiles': missingFileList } else: # reduce the number of files to be read if taskParamMap.has_key('nFiles'): if datasetSpec.isMaster(): taskParamMap[ 'nFiles'] -= nFilesUnique # reduce the number of files for scout if useScout: nChunksForScout = diagMap[ 'nChunksForScout'] # number of master input files if datasetSpec.isMaster(): checkedMaster = True nFilesMaster += nFilesUnique # running task if diagMap['isRunningTask']: runningTask = True # no activated pending input for noWait if noWaitParent and diagMap['nActivatedPending'] == 0 and not (useScout and nChunksForScout == 0) \ and tmpMetadata['state'] != 'closed' and datasetSpec.isMaster(): tmpErrStr = 'insufficient inputs are ready. ' tmpErrStr += diagMap['errMsg'] tmpLog.debug(tmpErrStr) taskSpec.setErrDiag(tmpErrStr) taskOnHold = True setFrozenTime = False break tmpLog.debug('end loop') # no mater input if not taskOnHold and not taskBroken and allUpdated and nFilesMaster == 0 and checkedMaster: tmpErrStr = 'no master input files. 
input dataset is empty' tmpLog.error(tmpErrStr) taskSpec.setErrDiag(tmpErrStr, None) if taskSpec.allowEmptyInput() or noWaitParent: taskOnHold = True else: taskBroken = True # update task status if taskBroken: # task is broken taskSpec.status = 'tobroken' tmpMsg = 'set task.status={0}'.format(taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, taskSpec, pid=self.pid) # change task status unless the task is running if not runningTask: if taskOnHold: # go to pending state if not taskSpec.status in ['broken', 'tobroken']: taskSpec.setOnHold() tmpMsg = 'set task.status={0}'.format( taskSpec.status) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) allRet = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, taskSpec, pid=self.pid, setFrozenTime=setFrozenTime) elif allUpdated: # all OK allRet, newTaskStatus = self.taskBufferIF.updateTaskStatusByContFeeder_JEDI( jediTaskID, getTaskStatus=True, pid=self.pid, useWorldCloud=taskSpec.useWorldCloud()) tmpMsg = 'set task.status={0}'.format( newTaskStatus) tmpLog.info(tmpMsg) tmpLog.sendMsg(tmpMsg, self.msgType) # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI( jediTaskID, self.pid) tmpLog.debug('unlock not-running task with {0}'.format( retUnlock)) else: # just unlock retUnlock = self.taskBufferIF.unlockSingleTask_JEDI( jediTaskID, self.pid) tmpLog.debug('unlock task with {0}'.format(retUnlock)) tmpLog.debug('done') except: errtype, errvalue = sys.exc_info()[:2] logger.error('{0} failed in runImpl() with {1}:{2}'.format( self.__class__.__name__, errtype.__name__, errvalue))
def start(self):
    """Run the task-broker main loop.

    Starts the base classes and then loops forever.  In each cycle, for
    every combination of VO, source label, aligned work queue and resource
    type, it first runs TaskCheckerThread workers over the tasks whose
    assignment must be checked and then TaskBrokerThread workers over the
    tasks to assign.  Any exception raised during a cycle is logged and the
    loop continues.  After each cycle the method sleeps for the remainder of
    ``jedi_config.taskbroker.loopCycle`` and then calls ``randomSleep``.

    This method never returns.
    """
    # start base classes
    JediKnight.start(self)
    FactoryBase.initializeMods(self, self.taskBufferIF, self.ddmIF)

    def runWorkers(workerClass, tmpList, *workerArgs):
        # process tmpList with nWorkers threads of workerClass and wait for them
        taskList = ListWithLock(tmpList)
        threadPool = ThreadPool()
        for _ in range(jedi_config.taskbroker.nWorkers):
            thr = workerClass(taskList, threadPool,
                              self.taskBufferIF, self.ddmIF,
                              self, *workerArgs)
            thr.start()
        threadPool.join()

    # go into main loop
    while True:
        startTime = datetime.datetime.utcnow()
        try:
            # get logger
            tmpLog = MsgWrapper(logger)
            tmpLog.debug('start TaskBroker')
            # get work queue mapper
            workQueueMapper = self.taskBufferIF.getWorkQueueMap()
            resource_types = self.taskBufferIF.load_resource_types()
            # loop over all vos
            for vo in self.vos:
                # loop over all sourceLabels
                for prodSourceLabel in self.prodSourceLabels:
                    # loop over all work queues
                    for workQueue in workQueueMapper.getAlignedQueueList(vo, prodSourceLabel):
                        for resource_type in resource_types:
                            # queue names may contain spaces; use underscores in the label
                            wq_name = '_'.join(workQueue.queue_name.split(' '))
                            msgLabel = 'vo={0} label={1} queue={2} resource_type={3}: '.format(
                                vo, prodSourceLabel, wq_name, resource_type.resource_name)
                            tmpLog.debug(msgLabel + 'start')
                            # get the list of tasks to check
                            tmpList = self.taskBufferIF.getTasksToCheckAssignment_JEDI(
                                vo, prodSourceLabel, workQueue, resource_type.resource_name)
                            if tmpList is None:
                                # failed
                                tmpLog.error(msgLabel + 'failed to get the list of tasks to check')
                            else:
                                tmpLog.debug(msgLabel + 'got tasks_to_check={0}'.format(len(tmpList)))
                                runWorkers(TaskCheckerThread, tmpList, vo, prodSourceLabel)
                            # get the list of tasks to assign
                            tmpList = self.taskBufferIF.getTasksToAssign_JEDI(
                                vo, prodSourceLabel, workQueue, resource_type.resource_name)
                            if tmpList is None:
                                # failed
                                tmpLog.error(msgLabel + 'failed to get the list of tasks to assign')
                            else:
                                tmpLog.debug(msgLabel + 'got tasks_to_assign={0}'.format(len(tmpList)))
                                runWorkers(TaskBrokerThread, tmpList, vo, prodSourceLabel,
                                           workQueue, resource_type.resource_name)
                            tmpLog.debug(msgLabel + 'done')
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            tmpLog.error('failed in {0}.start() with {1} {2}'.format(
                self.__class__.__name__, errtype.__name__, errvalue))
        tmpLog.debug('done')
        # sleep if needed
        loopCycle = jedi_config.taskbroker.loopCycle
        timeDelta = datetime.datetime.utcnow() - startTime
        # total_seconds() is the whole elapsed time; .seconds would drop full
        # days, making a very long cycle look short
        sleepPeriod = loopCycle - timeDelta.total_seconds()
        if sleepPeriod > 0:
            time.sleep(sleepPeriod)
        # randomize cycle
        self.randomSleep(max_val=loopCycle)
def doAction(self):
    """Activate or kill jobs that were waiting for a lib.tgz to be produced.

    Fetches the lib.tgz files for waiting jobs via
    ``getLibForWaitingRunJob_JEDI`` and, per file:

    * status ``failed``   -- looks up the build job and passes it to
      ``updateJobs`` (logged as killing the downstream jobs);
    * status ``finished`` -- registers the file metadata with ``setGUIDs``
      and runs an ``Activator`` on the lib dataset;
    * anything else       -- keeps waiting.

    Errors are logged rather than raised.

    :return: always ``self.SC_SUCCEEDED``
    """
    # The top-level logger is created before the try block so that the error
    # handler and the final message never touch an unbound name.
    origTmpLog = MsgWrapper(logger)
    try:
        origTmpLog.debug('start')
        # check every 60 min
        checkInterval = 60
        # get lib.tgz for waiting jobs
        libList = self.taskBufferIF.getLibForWaitingRunJob_JEDI(self.vo, self.prodSourceLabel, checkInterval)
        origTmpLog.debug('got {0} lib.tgz files'.format(len(libList)))
        # activate or kill orphan jobs which were submitted to use lib.tgz when the lib.tgz was being produced
        # NOTE(review): the user placeholder in the messages below is masked
        # ("******"), so prodUserName is passed to format() but never rendered
        # -- confirm this is intentional.
        for prodUserName, datasetName, tmpFileSpec in libList:
            # per-file logger tagged with the task ID
            tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(tmpFileSpec.jediTaskID))
            tmpLog.debug('start')
            # check status of lib.tgz
            if tmpFileSpec.status == 'failed':
                # get buildJob
                pandaJobSpecs = self.taskBufferIF.peekJobs([tmpFileSpec.PandaID],
                                                           fromDefined=False,
                                                           fromActive=False,
                                                           fromWaiting=False)
                pandaJobSpec = pandaJobSpecs[0]
                if pandaJobSpec is not None:
                    # kill
                    self.taskBufferIF.updateJobs([pandaJobSpec], False)
                    tmpLog.debug(' killed downstream jobs for user="******" with libDS={1}'.format(prodUserName, datasetName))
                else:
                    # PandaJobSpec not found
                    tmpLog.error(' cannot find PandaJobSpec for user="******" with PandaID={1}'.format(prodUserName,
                                                                                                      tmpFileSpec.PandaID))
            elif tmpFileSpec.status == 'finished':
                # set metadata
                self.taskBufferIF.setGUIDs([{'guid': tmpFileSpec.GUID,
                                             'lfn': tmpFileSpec.lfn,
                                             'checksum': tmpFileSpec.checksum,
                                             'fsize': tmpFileSpec.fsize,
                                             'scope': tmpFileSpec.scope,
                                             }])
                # get lib dataset
                dataset = self.taskBufferIF.queryDatasetWithMap({'name': datasetName})
                if dataset is not None:
                    # activate jobs and wait for the activator to finish
                    aThr = Activator(self.taskBufferIF, dataset)
                    aThr.start()
                    aThr.join()
                    tmpLog.debug(' activated downstream jobs for user="******" with libDS={1}'.format(prodUserName, datasetName))
                else:
                    # datasetSpec not found
                    tmpLog.error(' cannot find datasetSpec for user="******" with libDS={1}'.format(prodUserName, datasetName))
            else:
                # lib.tgz is not ready
                tmpLog.debug(' keep waiting for user="******" libDS={1}'.format(prodUserName, datasetName))
    except Exception:
        # best effort: report the problem but let SystemExit/KeyboardInterrupt
        # propagate instead of swallowing them
        errtype, errvalue = sys.exc_info()[:2]
        origTmpLog.error('failed with {0} {1}'.format(errtype, errvalue))
    # return
    origTmpLog.debug('done')
    return self.SC_SUCCEEDED
def doActionForReassign(self,gTmpLog):
    """Reassign tasks returned by ``getTasksToReassign_JEDI``.

    For each task the cloud (non-world-cloud tasks) or the nucleus
    (world-cloud tasks) is recorded in the DB, a subscription to the
    destination endpoint is registered for every non-distributed output/log
    dataset, and, if all subscriptions succeed, the task status is restored.
    A world-cloud task without a nucleus is instead sent back to 'assigning'
    so that brokerage runs again.  A failure at any step is logged and the
    task is skipped.

    :param gTmpLog: logger used for the overall summary message
    """
    # DDM interface and site mapper shared by all tasks
    ddmInterface = self.ddmIF.getInterface(self.vo)
    siteMapper = self.taskBufferIF.getSiteMapper()
    # tasks waiting to be reassigned
    tasksToReassign = self.taskBufferIF.getTasksToReassign_JEDI(self.vo, self.prodSourceLabel)
    gTmpLog.debug('got {0} tasks to reassign'.format(len(tasksToReassign)))
    for taskSpec in tasksToReassign:
        taskLog = MsgWrapper(logger, '< jediTaskID={0} >'.format(taskSpec.jediTaskID))
        taskLog.debug('start to reassign')
        # DDM backend (only reported in the log message below)
        backEnd = taskSpec.getDdmBackEnd()
        # output and log datasets of the task
        gotDatasets, datasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(taskSpec.jediTaskID,
                                                                                     ['output', 'log'])
        if gotDatasets is not True:
            taskLog.error('failed to get datasets')
            continue
        # update DB
        if taskSpec.useWorldCloud():
            if taskSpec.nucleus in [None, '']:
                # no nucleus yet: re-run task brokerage
                taskSpec.status = 'assigning'
                taskSpec.oldStatus = None
                taskSpec.setToRegisterDatasets()
                self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                                                  setOldModTime=True)
                taskLog.debug('#ATM #KV label=managed action=trigger_new_brokerage by setting task_status={0}'.format(
                    taskSpec.status))
                continue
            # look up the nucleus
            nucleusSpec = siteMapper.getNucleus(taskSpec.nucleus)
            if nucleusSpec is None:
                taskLog.error("nucleus={0} doesn't exist".format(taskSpec.nucleus))
                continue
            # record the nucleus for the task
            nucleusMap = {taskSpec.jediTaskID: AtlasBrokerUtils.getDictToSetNucleus(nucleusSpec, datasetSpecs)}
            self.taskBufferIF.setCloudToTasks_JEDI(nucleusMap)
        else:
            # record the cloud assignment
            cloudStat = self.taskBufferIF.setCloudTaskByUser('jedi', taskSpec.jediTaskID,
                                                             taskSpec.cloud, 'assigned', True)
            if cloudStat != 'SUCCEEDED':
                taskLog.error('failed to update CloudTasks')
                continue
            # the cloud must be known to the site mapper
            if not siteMapper.checkCloud(taskSpec.cloud):
                taskLog.error("cloud={0} doesn't exist".format(taskSpec.cloud))
                continue
        # destination site: one site of the nucleus, or the cloud's 'dest'
        if taskSpec.useWorldCloud():
            destSiteName = nucleusSpec.getOnePandaSite()
        else:
            destSiteName = siteMapper.getCloud(taskSpec.cloud)['dest']
        destSite = siteMapper.getSite(destSiteName)
        # subscribe every dataset; leave the loop at the first failure
        for datasetSpec in datasetSpecs:
            taskLog.debug('dataset={0}'.format(datasetSpec.datasetName))
            if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) is not None:
                taskLog.debug('skip {0} is distributed'.format(datasetSpec.datasetName))
                continue
            # destination endpoint
            location = siteMapper.getDdmEndpoint(destSite.sitename, datasetSpec.storageToken,
                                                 taskSpec.prodSourceLabel,
                                                 JobUtils.translate_tasktype_to_jobtype(taskSpec.taskType))
            # make subscription
            try:
                taskLog.debug('registering subscription to {0} with backend={1}'.format(location, backEnd))
                subStat = ddmInterface.registerDatasetSubscription(datasetSpec.datasetName, location,
                                                                   'Production Output', asynchronous=True)
                if subStat is not True:
                    taskLog.error("failed to make subscription")
                    break
            except Exception:
                excType, excValue = sys.exc_info()[:2]
                taskLog.warning('failed to make subscription with {0}:{1}'.format(excType.__name__, excValue))
                break
        else:
            # no dataset failed: activate the task
            if taskSpec.oldStatus in ['assigning', 'exhausted', None]:
                taskSpec.status = 'ready'
            else:
                taskSpec.status = taskSpec.oldStatus
            taskSpec.oldStatus = None
            self.taskBufferIF.updateTask_JEDI(taskSpec, {'jediTaskID': taskSpec.jediTaskID},
                                              setOldModTime=True)
            taskLog.debug('finished to reassign')
def doBrokerage(self, taskSpec, cloudName, inputChunk, taskParamMap):
    """Choose candidate sites for a task and attach them to ``inputChunk``.

    The initial site list is the task's pre-assigned site, the site
    pre-assigned in the input chunk, or all sites of ``cloudName``.  It is
    then narrowed by site status, memory, scratch disk, SE free space and
    walltime.  Surviving sites are weighted by their running, defined and
    activated job counts.  Sites already used by the task are always kept;
    the list is then filled up to five candidates from the highest weight
    downwards, in random order within equal weight.

    :param taskSpec: task specification
    :param cloudName: cloud whose sites are scanned when nothing is pre-assigned
    :param inputChunk: input chunk that receives the site candidates
    :param taskParamMap: task parameters (not used by this method)
    :return: ``(self.SC_SUCCEEDED, inputChunk)`` on success, or
             ``(self.SC_FAILED, inputChunk)`` when no candidate is left or a
             DB lookup fails
    """
    # make logger
    tmpLog = MsgWrapper(logger, '<jediTaskID={0}>'.format(taskSpec.jediTaskID))
    tmpLog.debug('start')
    # return value for temporary failures
    retTmpError = self.SC_FAILED, inputChunk

    def giveUp(errMsg):
        # log the reason, attach the uploaded log to the task and fail temporarily
        tmpLog.error(errMsg)
        taskSpec.setErrDiag(tmpLog.uploadLog(taskSpec.jediTaskID))
        return retTmpError

    # get sites in the cloud
    if taskSpec.site not in ['', None]:
        scanSiteList = [taskSpec.site]
        tmpLog.debug('site={0} is pre-assigned'.format(taskSpec.site))
    elif inputChunk.getPreassignedSite() is not None:
        scanSiteList = [inputChunk.getPreassignedSite()]
        tmpLog.debug('site={0} is pre-assigned in masterDS'.format(inputChunk.getPreassignedSite()))
    else:
        scanSiteList = self.siteMapper.getCloud(cloudName)['sites']
        tmpLog.debug('cloud=%s has %s candidates' % (cloudName, len(scanSiteList)))
    tmpLog.debug('initial {0} candidates'.format(len(scanSiteList)))
    ######################################
    # selection for status
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # only online sites are kept
        if tmpSiteSpec.status != 'online':
            tmpLog.debug(' skip %s due to status=%s' % (tmpSiteName, tmpSiteSpec.status))
        else:
            newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed site status check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for memory
    # ignore unset (None) values explicitly: comparing None with a number
    # only works on Python 2
    ramCounts = [tmpRam for tmpRam in (taskSpec.ramCount, inputChunk.ramCount) if tmpRam is not None]
    minRamCount = max(ramCounts) if ramCounts else None
    if minRamCount not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a limit of 0 means no limit
            if tmpSiteSpec.maxmemory != 0 and minRamCount > tmpSiteSpec.maxmemory:
                tmpLog.debug(' skip {0} due to site RAM shortage={1}(site upper limit) < {2}'.format(
                    tmpSiteName, tmpSiteSpec.maxmemory, minRamCount))
                continue
            if tmpSiteSpec.minmemory != 0 and minRamCount < tmpSiteSpec.minmemory:
                tmpLog.debug(' skip {0} due to job RAM shortage={1}(site lower limit) > {2}'.format(
                    tmpSiteName, tmpSiteSpec.minmemory, minRamCount))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed memory check ={1}{2}'.format(
            len(scanSiteList), minRamCount, taskSpec.ramUnit))
        if not scanSiteList:
            return giveUp('no candidates')
    ######################################
    # selection for scratch disk
    minDiskCountS = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize() + inputChunk.getMaxAtomSize()
    minDiskCountS = minDiskCountS / 1024 / 1024
    # size for direct IO sites: the input is not counted unless local IO is used
    if taskSpec.useLocalIO():
        minDiskCountR = minDiskCountS
    else:
        minDiskCountR = taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()
        minDiskCountR = minDiskCountR / 1024 / 1024
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        # check at the site; maxwdir of 0 means no limit
        if tmpSiteSpec.maxwdir != 0:
            if tmpSiteSpec.isDirectIO():
                minDiskCount = minDiskCountR
            else:
                minDiskCount = minDiskCountS
            if minDiskCount > tmpSiteSpec.maxwdir:
                tmpLog.debug(' skip {0} due to small scratch disk={1} < {2}'.format(
                    tmpSiteName, tmpSiteSpec.maxwdir, minDiskCount))
                continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed scratch disk check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for available space in SE
    # free space must be >= 200GB
    diskThreshold = 200
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site; space of 0 is not checked
        tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
        if tmpSiteSpec.space != 0 and tmpSiteSpec.space < diskThreshold:
            tmpLog.debug(' skip {0} due to disk shortage in SE = {1} < {2}GB'.format(
                tmpSiteName, tmpSiteSpec.space, diskThreshold))
            continue
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed SE space check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # selection for walltime
    minWalltime = taskSpec.walltime
    if minWalltime not in [0, None]:
        newScanSiteList = []
        for tmpSiteName in scanSiteList:
            tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
            # check at the site; a limit of 0 means no limit
            if tmpSiteSpec.maxtime != 0 and minWalltime > tmpSiteSpec.maxtime:
                tmpLog.debug(' skip {0} due to short site walltime={1}(site upper limit) < {2}'.format(
                    tmpSiteName, tmpSiteSpec.maxtime, minWalltime))
                continue
            if tmpSiteSpec.mintime != 0 and minWalltime < tmpSiteSpec.mintime:
                tmpLog.debug(' skip {0} due to short job walltime={1}(site lower limit) > {2}'.format(
                    tmpSiteName, tmpSiteSpec.mintime, minWalltime))
                continue
            newScanSiteList.append(tmpSiteName)
        scanSiteList = newScanSiteList
        tmpLog.debug('{0} candidates passed walltime check ={1}{2}'.format(
            len(scanSiteList), minWalltime, taskSpec.walltimeUnit))
        if not scanSiteList:
            return giveUp('no candidates')
    ######################################
    # selection for nPilot
    nWNmap = self.taskBufferIF.getCurrentSiteData()
    newScanSiteList = []
    for tmpSiteName in scanSiteList:
        # check at the site
        nPilot = 0
        if tmpSiteName in nWNmap:
            nPilot = nWNmap[tmpSiteName]['getJob'] + nWNmap[tmpSiteName]['updateJob']
        if nPilot == 0 and taskSpec.prodSourceLabel not in ['test']:
            # NOTE: the skip itself is disabled, so the site is only reported
            # here and is still kept as a candidate
            tmpLog.debug(' skip %s due to no pilot' % tmpSiteName)
        newScanSiteList.append(tmpSiteName)
    scanSiteList = newScanSiteList
    tmpLog.debug('{0} candidates passed pilot activity check'.format(len(scanSiteList)))
    if not scanSiteList:
        return giveUp('no candidates')
    ######################################
    # sites already used by task
    tmpSt, sitesUsedByTask = self.taskBufferIF.getSitesUsedByTask_JEDI(taskSpec.jediTaskID)
    if not tmpSt:
        return giveUp('failed to get sites which already used by task')
    ######################################
    # calculate weight
    tmpSt, jobStatPrioMap = self.taskBufferIF.getJobStatisticsWithWorkQueue_JEDI(
        taskSpec.vo, taskSpec.prodSourceLabel, taskSpec.currentPriority)
    if not tmpSt:
        return giveUp('failed to get job statistics with priority')
    ######################################
    # final procedure
    tmpLog.debug('final {0} candidates'.format(len(scanSiteList)))
    weightMap = {}
    candidateSpecList = []
    for tmpSiteName in scanSiteList:
        # get number of jobs in each job status. Using workQueueID=None to include non-JEDI jobs
        nRunning = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'running', None, None)
        nAssigned = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'defined', None, None)
        nActivated = AtlasBrokerUtils.getNumJobs(jobStatPrioMap, tmpSiteName, 'activated', None, None)
        # more running jobs raise the weight; queued jobs lower it
        weight = float(nRunning + 1) / float(nActivated + nAssigned + 1) / float(nAssigned + 1)
        # make candidate
        siteCandidateSpec = SiteCandidate(tmpSiteName)
        siteCandidateSpec.weight = weight
        if tmpSiteName in sitesUsedByTask:
            # sites already used by the task are always kept
            candidateSpecList.append(siteCandidateSpec)
        else:
            weightMap.setdefault(weight, []).append(siteCandidateSpec)
    # limit the number of sites, taking the highest weights first
    maxNumSites = 5
    for weightVal in sorted(weightMap, reverse=True):
        if len(candidateSpecList) >= maxNumSites:
            break
        sitesWithWeight = weightMap[weightVal]
        # random order among sites with the same weight
        random.shuffle(sitesWithWeight)
        candidateSpecList += sitesWithWeight[:(maxNumSites - len(candidateSpecList))]
    # append candidates
    for siteCandidateSpec in candidateSpecList:
        inputChunk.addSiteCandidate(siteCandidateSpec)
        tmpLog.debug(' use {0} with weight={1}'.format(siteCandidateSpec.siteName,
                                                       siteCandidateSpec.weight))
    if not candidateSpecList:
        return giveUp('no candidates')
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED, inputChunk