def test():
    '''test submission'''
    # NOTE(review): QueueConfigMapper and ARCSubmitter are used below but not
    # imported here -- they must be in scope from elsewhere in this script;
    # confirm before running.
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    import json
    # queue whose workerMaker plugin is exercised
    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()
    # verbatim PanDA job description used as the test payload
    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)
    # build a JobSpec from the decoded job dictionary and pin it to the queue
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]
    # make one worker for the single-job chunk via the queue's worker maker
    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)
    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)
    # submit through the ARC submitter and show the result and batch ID.
    # print() call form parses under both Python 2 and 3 (the original
    # py2-only "print x" statement form is a SyntaxError on py3).
    sub = ARCSubmitter()
    print(sub.submit_workers([wspec]))
    print(wspec.batchID)
def __init__(self, **kwarg):
    """Initialize the base FIFO and attach the configured backend plugin."""
    FIFOBase.__init__(self, **kwarg)
    self.fifoName = '{0}_fifo'.format(self.titleName)
    # plugin spec: the agent title plus the globally configured
    # fifo backend module/class from harvester_config
    plugin_params = {
        'titleName': self.titleName,
        'module': harvester_config.fifo.fifoModule,
        'name': harvester_config.fifo.fifoClass,
    }
    self.fifo = PluginFactory().get_plugin(plugin_params)
def _initialize_fifo(self, force_enable=False):
    """Enable this FIFO per configuration and instantiate its backend plugin.

    Reads the agent section named by self.titleName from harvester_config.
    If neither force_enable nor the section's fifoEnable flag is set, the
    FIFO is disabled and no plugin is created (early return). The backend
    module/class come from the agent section when present, otherwise from
    the global 'fifo' section; if neither exists, no plugin is created.
    """
    self.fifoName = '{0}_fifo'.format(self.titleName)
    self.config = getattr(harvester_config, self.titleName)
    if force_enable:
        self.enabled = True
    elif hasattr(self.config, 'fifoEnable') and self.config.fifoEnable:
        self.enabled = True
    else:
        # disabled: leave self.fifo unset and skip plugin construction
        self.enabled = False
        return
    # start from a copy of the whole agent section so backend-specific
    # options are passed through to the plugin
    pluginConf = vars(self.config).copy()
    pluginConf.update( {'titleName': self.titleName} )
    if hasattr(self.config, 'fifoModule') and hasattr(self.config, 'fifoClass'):
        # agent-level backend override
        pluginConf.update( {'module': self.config.fifoModule,
                            'name': self.config.fifoClass,} )
    else:
        # fall back to the global fifo backend; nothing to do if absent
        if not hasattr(harvester_config, 'fifo'):
            return
        pluginConf.update( {'module': harvester_config.fifo.fifoModule,
                            'name': harvester_config.fifo.fifoClass,} )
    pluginFactory = PluginFactory()
    self.fifo = pluginFactory.get_plugin(pluginConf)
class WorkerMaker(object):
    """Facade over per-queue workerMaker plugins: creates WorkSpecs for job
    chunks and proxies sizing queries to the plugin."""

    # constructor
    def __init__(self):
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()

    # get plugin
    def get_plugin(self, queue_config):
        """Return the workerMaker plugin instance for the given queue config."""
        return self.pluginFactory.get_plugin(queue_config.workerMaker)

    # make workers
    def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
        """Make one worker per job chunk.

        The first n_ready chunks are paired with pre-existing 'ready' workers
        fetched from the DB; the rest get fresh workers from the plugin.
        Returns (okChunks, ngChunks) where okChunks is a list of
        (workSpec, jobChunk) pairs and ngChunks the chunks that failed.
        On any exception, all chunks are reported as failed.
        """
        tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type),
                                        method_name='make_workers')
        tmpLog.debug('start')
        try:
            # get plugin
            if maker is None:
                maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
            if maker is None:
                # not found
                tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
                return [], jobchunk_list
            # get ready workers
            readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
            # loop over all chunks
            okChunks = []
            ngChunks = []
            for iChunk, jobChunk in enumerate(jobchunk_list):
                # make a worker
                if iChunk >= n_ready:
                    workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
                else:
                    # use ready worker
                    if iChunk < len(readyWorkers):
                        workSpec = readyWorkers[iChunk]
                    else:
                        # fewer ready workers than requested slots
                        workSpec = None
                # failed
                if workSpec is None:
                    ngChunks.append(jobChunk)
                    continue
                # set workerID
                if workSpec.workerID is None:
                    # brand-new worker: assign an ID and mark it new
                    workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                    workSpec.configID = queue_config.configID
                    workSpec.isNew = True
                okChunks.append((workSpec, jobChunk))
            # dump
            tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks), len(ngChunks)))
            return okChunks, ngChunks
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return [], jobchunk_list

    # get number of jobs per worker
    def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None):
        """Ask the plugin how many jobs to pack into one worker.

        NOTE: resource_type is accepted for signature symmetry but not
        forwarded to the plugin.
        """
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_jobs_per_worker(n_workers)

    # get number of workers per job
    def get_num_workers_per_job(self, queue_config, n_workers,
                                resource_type, maker=None):
        """Ask the plugin how many workers to assign to one job."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_workers_per_job(n_workers)

    # check number of ready resources
    def num_ready_resources(self, queue_config, resource_type, maker=None):
        """Ask the plugin how many resources are ready to host workers."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.num_ready_resources()

    # get upper limit on the cumulative total of workers per job
    def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None):
        """Ask the plugin for the lifetime cap of workers per job."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_in_total()

    # get upper limit on the number of new workers per job in a cycle
    def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None):
        """Ask the plugin for the per-cycle cap of new workers per job."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_per_cycle()
class RpcBot(rpyc.Service):
    """RPyC service that lets remote callers drive harvester plugins.

    Every exposed_* method resolves the plugin described by plugin_config
    and delegates the call to it unchanged.
    """

    # initialization action
    def on_connect(self, conn):
        self.pluginFactory = PluginFactory(no_db=True)

    # resolve the plugin instance for a plugin configuration
    def _core(self, plugin_config):
        return self.pluginFactory.get_plugin(plugin_config)

    ######################
    # submitter section

    # submit workers
    def exposed_submit_workers(self, plugin_config, workspec_list):
        return self._core(plugin_config).submit_workers(workspec_list)

    ######################
    # monitor section

    # check workers
    def exposed_check_workers(self, plugin_config, workspec_list):
        return self._core(plugin_config).check_workers(workspec_list)

    ######################
    # messenger section

    # setup access points
    def exposed_setup_access_points(self, plugin_config, workspec_list):
        return self._core(plugin_config).setup_access_points(workspec_list)

    # feed jobs
    def exposed_feed_jobs(self, plugin_config, workspec, jobspec_list):
        return self._core(plugin_config).feed_jobs(workspec, jobspec_list)

    # request job
    def exposed_job_requested(self, plugin_config, workspec):
        return self._core(plugin_config).job_requested(workspec)

    # request kill
    def exposed_kill_requested(self, plugin_config, workspec):
        return self._core(plugin_config).kill_requested(workspec)

    # is alive
    def exposed_is_alive(self, plugin_config, workspec, worker_heartbeat_limit):
        return self._core(plugin_config).is_alive(workspec, worker_heartbeat_limit)

    # get work attributes
    def exposed_get_work_attributes(self, plugin_config, workspec):
        return self._core(plugin_config).get_work_attributes(workspec)

    # get output files
    def exposed_get_files_to_stage_out(self, plugin_config, workspec):
        return self._core(plugin_config).get_files_to_stage_out(workspec)

    # feed events
    def exposed_feed_events(self, plugin_config, workspec, events_dict):
        return self._core(plugin_config).feed_events(workspec, events_dict)

    # get events
    def exposed_events_to_update(self, plugin_config, workspec):
        return self._core(plugin_config).events_to_update(workspec)

    # request events
    def exposed_events_requested(self, plugin_config, workspec):
        return self._core(plugin_config).events_requested(workspec)

    # get PandaIDs
    def exposed_get_panda_ids(self, plugin_config, workspec):
        return self._core(plugin_config).get_panda_ids(workspec)

    # post processing
    def exposed_post_processing(self, plugin_config, workspec, jobspec_list, map_type):
        return self._core(plugin_config).post_processing(workspec, jobspec_list, map_type)

    # send ACK
    def exposed_acknowledge_events_files(self, plugin_config, workspec):
        return self._core(plugin_config).acknowledge_events_files(workspec)
class Submitter(AgentBase):
    """Agent that decides how many workers each queue needs, makes them via
    WorkerMaker, feeds jobs through the messenger, and submits them with the
    queue's submitter plugin."""

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main loop: pick a site's queues, adjust worker counts, create and
        submit workers, then sleep until the next cycle or termination."""
        # unique lock owner string for DB locking of queues/jobs/workers
        lockedBy = 'submitter-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))
                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} commands'.format(len(commandSpecs)))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal
                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)
                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    # nothing to do for this site in this cycle
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):
                            tmpLog = core_utils.make_logger(_logger,
                                                            'id={0} queue={1} resource_type={2}'.format(
                                                                lockedBy, queueName, resource_type),
                                                            method_name='run')
                            tmpLog.debug('start')
                            nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                            nReady = tmpVal['nReady']
                            # check queue
                            if not self.queueConfigMapper.has_queue(queueName):
                                tmpLog.error('config not found')
                                continue
                            # no new workers
                            if nWorkers == 0:
                                tmpLog.debug('skipped since no new worker is needed based on current stats')
                                continue
                            # get queue
                            queueConfig = self.queueConfigMapper.get_queue(queueName)
                            # actions based on mapping type
                            if queueConfig.mapType == WorkSpec.MT_NoJob:
                                # workers without jobs: one empty chunk per worker
                                jobChunks = []
                                for i in range(nWorkers):
                                    jobChunks.append([])
                            elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                # one worker per one job
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, 1, None,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy)
                            elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                # one worker for multiple jobs
                                nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                    queueConfig, nWorkers, resource_type)
                                tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, nJobsPerWorker, None,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy, queueConfig.allowJobMixture)
                            elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                # multiple workers for one job
                                nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                    queueConfig, nWorkers, resource_type)
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, None, nWorkersPerJob,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy)
                            else:
                                tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                continue
                            tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                            if len(jobChunks) == 0:
                                continue
                            # make workers
                            okChunks, ngChunks = self.workerMaker.make_workers(
                                jobChunks, queueConfig, nReady, resource_type)
                            if len(ngChunks) == 0:
                                tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                            else:
                                tmpLog.debug('made {0} workers, while {1} workers failed'.format(
                                    len(okChunks), len(ngChunks)))
                            timeNow = datetime.datetime.utcnow()
                            # NG (=not good): fail the jobs of chunks whose worker could not be made
                            for ngJobs in ngChunks:
                                for jobSpec in ngJobs:
                                    jobSpec.status = 'failed'
                                    jobSpec.subStatus = 'failedtomake'
                                    jobSpec.stateChangeTime = timeNow
                                    jobSpec.lockedBy = None
                                    jobSpec.trigger_propagation()
                                    self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                      'subStatus': 'prepared'})
                            # OK
                            pandaIDs = set()
                            workSpecList = []
                            if len(okChunks) > 0:
                                for workSpec, okJobs in okChunks:
                                    # has job
                                    if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                            or queueConfig.mapType == WorkSpec.MT_NoJob:
                                        workSpec.hasJob = 0
                                    else:
                                        workSpec.hasJob = 1
                                        if workSpec.nJobsToReFill in [None, 0]:
                                            workSpec.set_jobspec_list(okJobs)
                                        else:
                                            # refill free slots during the worker is running
                                            # NOTE(review): nJobsToReFill is reset to None
                                            # BEFORE the slice below, so okJobs[None:] is the
                                            # whole list, not just the leftover jobs --
                                            # verify this is intended.
                                            workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                            workSpec.nJobsToReFill = None
                                            for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                pandaIDs.add(jobSpec.PandaID)
                                    # map type
                                    workSpec.mapType = queueConfig.mapType
                                    # queue name
                                    workSpec.computingSite = queueConfig.queueName
                                    # set access point
                                    workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                    # events
                                    if len(okJobs) > 0 and ('eventService' in okJobs[0].jobParams or
                                                            'cloneJob' in okJobs[0].jobParams):
                                        workSpec.eventsRequest = WorkSpec.EV_useEvents
                                    workSpecList.append(workSpec)
                            if len(workSpecList) > 0:
                                # get plugin for submitter
                                submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                if submitterCore is None:
                                    # not found
                                    # NOTE(review): jobSpec here is whatever was last bound in
                                    # an earlier loop; queueConfig.queueName looks like the
                                    # intended value -- confirm.
                                    tmpLog.error('submitter plugin for {0} not found'.format(
                                        jobSpec.computingSite))
                                    continue
                                # get plugin for messenger
                                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                if messenger is None:
                                    # not found
                                    # NOTE(review): same stale-jobSpec concern as above.
                                    tmpLog.error('messenger plugin for {0} not found'.format(
                                        jobSpec.computingSite))
                                    continue
                                # setup access points
                                messenger.setup_access_points(workSpecList)
                                # feed jobs
                                for workSpec in workSpecList:
                                    if workSpec.hasJob == 1:
                                        tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                        if tmpStat is False:
                                            tmpLog.error('failed to send jobs to workerID={0}'.format(
                                                workSpec.workerID))
                                        else:
                                            tmpLog.debug('sent jobs to workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStat))
                                # insert workers
                                self.dbProxy.insert_workers(workSpecList, lockedBy)
                                # submit
                                tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                    submitterCore, workSpecList)
                                for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                    workSpec, jobList = okChunks[iWorker]
                                    # use associated job list since it can be truncated for re-filling
                                    jobList = workSpec.get_jobspec_list()
                                    # set status
                                    if not tmpRet:
                                        # failed submission
                                        tmpLog.error('failed to submit a workerID={0} with {1}'.format(
                                            workSpec.workerID, tmpStr))
                                        workSpec.set_status(WorkSpec.ST_missed)
                                        workSpec.set_dialog_message(tmpStr)
                                        jobList = []
                                    elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                        # directly go to running after feeding jobs for late biding
                                        workSpec.set_status(WorkSpec.ST_running)
                                    else:
                                        # normal successful submission
                                        workSpec.set_status(WorkSpec.ST_submitted)
                                    workSpec.submitTime = timeNow
                                    workSpec.modificationTime = timeNow
                                    # prefetch events
                                    if tmpRet and workSpec.hasJob == 1 and \
                                            workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                        eventsRequestParams = dict()
                                        for jobSpec in jobList:
                                            eventsRequestParams[jobSpec.PandaID] = {
                                                'pandaID': jobSpec.PandaID,
                                                'taskID': jobSpec.taskID,
                                                'jobsetID': jobSpec.jobParams['jobsetID'],
                                                'nRanges': jobSpec.jobParams['coreCount'],
                                                }
                                        workSpec.eventsRequestParams = eventsRequestParams
                                    # register worker
                                    tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                    if jobList is not None:
                                        for jobSpec in jobList:
                                            pandaIDs.add(jobSpec.PandaID)
                                            if tmpStat:
                                                tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                          jobSpec.PandaID,
                                                                          workSpec.batchID))
                                            else:
                                                tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                                tmpLog.error(tmpStr.format(jobSpec.PandaID,
                                                                           workSpec.batchID))
                            # release jobs
                            self.dbProxy.release_jobs(pandaIDs, lockedBy)
                            tmpLog.info('done')
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                # a site was processed: loop again immediately
                sleepTime = 0
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        """Submit only not-yet-ready workers via the plugin.

        Ready workers are passed through with a True/'' result. Returns
        (newSpecList, retList, strList) aligned so ready workers come first,
        followed by the submitted ones in plugin order.
        """
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status == WorkSpec.ST_ready:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
# Script fragment: force the queue's preparator to the Globus bulk
# preparator and set up console logging.
# NOTE(review): queueName, end_job_id and globus_sleep_time defaults are
# assumed to be defined earlier in this script -- confirm.
begin_job_id = int(sys.argv[2])
if len(sys.argv) > 3:
    end_job_id = int(sys.argv[3])
if len(sys.argv) > 4:
    globus_sleep_time = int(sys.argv[4])
queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
# keep the original preparator config, then override module/class in place
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator'
queueConfig.preparator['name'] = 'GlobusBulkPreparator'
modified_queueConfig_preparator = queueConfig.preparator
pluginFactory = PluginFactory()
# get stage-out plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)
# logger
_logger = core_utils.setup_logger('stageInTest_go_bulk_preparator')
tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_bulk_preparator')
tmpLog.debug('start')
# mirror panda.log.* loggers (except db_proxy) to stdout
# NOTE(review): dict.iteritems() is Python-2-only; iteritems(...) from six
# would match the style used elsewhere in this codebase.
for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
vomses = get_list(harvester_config.credmanager.voms) # logger _logger = core_utils.setup_logger('credManagerTest') # get plugin(s) exeCores = [] for moduleName, className, inCertFile, outCertFile, voms in \ zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses): pluginPar = {} pluginPar['module'] = moduleName pluginPar['name'] = className pluginPar['inCertFile'] = inCertFile pluginPar['outCertFile'] = outCertFile pluginPar['voms'] = voms exeCore = pluginFactory.get_plugin(pluginPar) exeCores.append(exeCore) # setup logger to write to screen also for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) # loop over all plugins for exeCore in exeCores:
class Monitor(AgentBase): # fifos monitor_fifo = MonitorFIFO() # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.pluginFactory = PluginFactory() self.startTimestamp = time.time() # main loop def run(self): lockedBy = 'monitor-{0}'.format(self.ident) # init messengers for queueConfig in self.queueConfigMapper.get_all_queues().values(): # just import for module initialization self.pluginFactory.get_plugin(queueConfig.messenger) # main last_DB_cycle_timestamp = 0 monitor_fifo = self.monitor_fifo while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') if time.time( ) >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime: # run with workers from DB mainLog.debug('starting run with DB') mainLog.debug('getting workers to monitor') workSpecsPerQueue = self.dbProxy.get_workers_to_update( harvester_config.monitor.maxWorkers, harvester_config.monitor.checkInterval, harvester_config.monitor.lockInterval, lockedBy) mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) # loop over all workers for queueName, configIdWorkSpecs in iteritems( workSpecsPerQueue): for configID, workSpecsList in iteritems( configIdWorkSpecs): retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID) if self.monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: mainLog.debug('putting workers to FIFO') try: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'. 
format(errStr)) if workSpecsToEnqueueToHead: mainLog.debug('putting workers to FIFO head') try: score = fifoCheckInterval - timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}' .format(errStr)) last_DB_cycle_timestamp = time.time() mainLog.debug('ended run with DB') elif self.monitor_fifo.enabled: # run with workers from FIFO if monitor_fifo.to_check_workers(): # check fifo size fifo_size = monitor_fifo.size() mainLog.debug('FIFO size is {0}'.format(fifo_size)) mainLog.debug('starting run with FIFO') try: obj_gotten = monitor_fifo.get(timeout=1) except Exception as errStr: mainLog.error( 'failed to get object from FIFO: {0}'.format( errStr)) else: if obj_gotten is not None: queueName, workSpecsList = obj_gotten mainLog.debug('got {0} workers of {1}'.format( len(workSpecsList), queueName)) configID = workSpecsList[0][0].configID for workSpecs in workSpecsList: for workSpec in workSpecs: if workSpec.pandaid_list is None: _jobspec_list = workSpec.get_jobspec_list( ) if _jobspec_list is not None: workSpec.pandaid_list = [ j.PandaID for j in workSpec.get_jobspec_list() ] else: workSpec.pandaid_list = [] workSpec.force_update('pandaid_list') retVal = self.monitor_agent_core( lockedBy, queueName, workSpecsList, from_fifo=True, config_id=configID) if retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: mainLog.debug('putting workers to FIFO') try: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}' .format(errStr)) if workSpecsToEnqueueToHead: mainLog.debug( 'putting workers to FIFO 
head') try: score = fifoCheckInterval - timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}' .format(errStr)) else: mainLog.debug( 'monitor_agent_core returned None. Skipped putting to FIFO' ) else: mainLog.debug('got nothing in FIFO') mainLog.debug('ended run with FIFO') else: mainLog.debug( 'workers in FIFO too young to check. Skipped') if sw.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single cycle was longer than lockInterval ' + sw.get_elapsed_time()) else: mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated sleepTime = (harvester_config.monitor.fifoSleepTimeMilli / 1000.0) \ if self.monitor_fifo.enabled else harvester_config.monitor.sleepTime if self.terminated(sleepTime): mainLog.debug('terminated') return # core of monitor agent to check workers in workSpecsList of queueName def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False, config_id=None): tmpQueLog = self.make_logger(_logger, 'id={0} queue={1}'.format( lockedBy, queueName), method_name='run') # check queue if not self.queueConfigMapper.has_queue(queueName, config_id): tmpQueLog.error('config not found') return # get queue queueConfig = self.queueConfigMapper.get_queue(queueName, config_id) # get plugins monCore = self.pluginFactory.get_plugin(queueConfig.monitor) messenger = self.pluginFactory.get_plugin(queueConfig.messenger) # workspec chunk of active workers workSpecsToEnqueue = [] workSpecsToEnqueueToHead = [] timeNow_timestamp = time.time() # get fifoCheckInterval for PQ try: fifoCheckInterval = monCore.fifoCheckInterval except: if hasattr(harvester_config.monitor, 'fifoCheckInterval'): fifoCheckInterval = harvester_config.monitor.fifoCheckInterval else: fifoCheckInterval = 
harvester_config.monitor.checkInterval # check workers allWorkers = [item for sublist in workSpecsList for item in sublist] tmpQueLog.debug('checking {0} workers'.format(len(allWorkers))) tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog) if tmpStat: # loop over all worker chunks tmpQueLog.debug('update jobs and workers') iWorker = 0 for workSpecs in workSpecsList: jobSpecs = None filesToStageOut = dict() pandaIDsList = [] eventsToUpdateList = [] filesToStageOutList = [] mapType = workSpecs[0].mapType # lock workers for fifo temRetLockWorker = None if from_fifo: # lock workers worker_id_list = [w.workerID for w in workSpecs] temRetLockWorker = self.dbProxy.lock_workers( worker_id_list, harvester_config.monitor.lockInterval, lockedBy) # skip if not locked if not temRetLockWorker: continue # loop over workSpecs for workSpec in workSpecs: tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format( lockedBy, workSpec.workerID), method_name='run') tmpOut = tmpRetMap[workSpec.workerID] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] diagMessage = tmpOut['diagMessage'] workAttributes = tmpOut['workAttributes'] eventsToUpdate = tmpOut['eventsToUpdate'] filesToStageOut = tmpOut['filesToStageOut'] eventsRequestParams = tmpOut['eventsRequestParams'] nJobsToReFill = tmpOut['nJobsToReFill'] pandaIDs = tmpOut['pandaIDs'] tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} ' tmpStr += 'postProcessed={3} files={4}' tmpLog.debug( tmpStr.format(newStatus, monStatus, diagMessage, workSpec.is_post_processed(), str(filesToStageOut))) iWorker += 1 if from_fifo: workSpec.lockedBy = lockedBy workSpec.force_update('lockedBy') # check status if newStatus not in WorkSpec.ST_LIST: tmpLog.error('unknown status={0}'.format(newStatus)) return # update worker workSpec.set_status(newStatus) workSpec.set_work_attributes(workAttributes) workSpec.set_dialog_message(diagMessage) if monStatus == WorkSpec.ST_failed: if not 
workSpec.has_pilot_error(): workSpec.set_pilot_error( PilotErrors.ERR_GENERALERROR, diagMessage) elif monStatus == WorkSpec.ST_cancelled: if not workSpec.has_pilot_error(): workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL, diagMessage) # request events if eventsRequestParams != {}: workSpec.eventsRequest = WorkSpec.EV_requestEvents workSpec.eventsRequestParams = eventsRequestParams # jobs to refill if nJobsToReFill is not None: workSpec.nJobsToReFill = nJobsToReFill # get associated jobs for the worker chunk if workSpec.hasJob == 1 and jobSpecs is None: jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, only_running=True) # pandaIDs for push pandaIDsList.append(pandaIDs) if len(eventsToUpdate) > 0: eventsToUpdateList.append(eventsToUpdate) if len(filesToStageOut) > 0: filesToStageOutList.append(filesToStageOut) # update jobs and workers if jobSpecs is not None: tmpQueLog.debug( 'updating {0} jobs with {1} workers'.format( len(jobSpecs), len(workSpecs))) core_utils.update_job_attributes_with_workers( mapType, jobSpecs, workSpecs, filesToStageOutList, eventsToUpdateList) for jobSpec in jobSpecs: tmpLog = self.make_logger(_logger, 'id={0} PandaID={1}'.format( lockedBy, jobSpec.PandaID), method_name='run') tmpLog.debug( 'new status={0} subStatus={1} status_in_metadata={2}' .format(jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes())) # update local database tmpRet = self.dbProxy.update_jobs_workers( jobSpecs, workSpecs, lockedBy, pandaIDsList) if not tmpRet: for workSpec in workSpecs: tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format( lockedBy, workSpec.workerID), method_name='run') if from_fifo: tmpLog.info( 'failed to update the DB. Maybe locked by other thread running with DB' ) else: tmpLog.error( 'failed to update the DB. 
lockInterval may be too short' ) sendWarning = True # send ACK to workers for events and files if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0: for workSpec in workSpecs: messenger.acknowledge_events_files(workSpec) # active workers for fifo if self.monitor_fifo.enabled and workSpecs: workSpec = workSpecs[0] tmpOut = tmpRetMap[workSpec.workerID] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running] \ and workSpec.mapType != WorkSpec.MT_MultiWorkers \ and workSpec.workAttributes is not None: forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() _bool, lastCheckAt = workSpec.get_work_params( 'lastCheckAt') try: last_check_period = timeNow_timestamp - lastCheckAt except TypeError: last_check_period = forceEnqueueInterval + 1.0 if (from_fifo and tmpRet) \ or (not from_fifo and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp and last_check_period > forceEnqueueInterval): if not from_fifo and _bool and lastCheckAt is not None \ and last_check_period > harvester_config.monitor.checkInterval: tmpQueLog.warning( 'last check period of workerID={0} is {1} sec, longer than monitor checkInterval' .format(workSpec.workerID, last_check_period)) workSpec.set_work_params( {'lastCheckAt': timeNow_timestamp}) workSpec.lockedBy = None workSpec.force_update('lockedBy') if monStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: _bool, startFifoPreemptAt = workSpec.get_work_params( 'startFifoPreemptAt') if not _bool or startFifoPreemptAt is None: startFifoPreemptAt = timeNow_timestamp workSpec.set_work_params({ 'startFifoPreemptAt': startFifoPreemptAt }) tmpQueLog.debug( 'workerID={0} , startFifoPreemptAt: {1}'. 
format(workSpec.workerID, startFifoPreemptAt)) if timeNow_timestamp - startFifoPreemptAt < harvester_config.monitor.fifoMaxPreemptInterval: workSpecsToEnqueueToHead.append(workSpecs) else: workSpec.set_work_params({ 'startFifoPreemptAt': timeNow_timestamp }) workSpec.modificationTime = timeNow workSpec.force_update('modificationTime') workSpecsToEnqueue.append(workSpecs) else: workSpec.modificationTime = timeNow workSpec.force_update('modificationTime') workSpecsToEnqueue.append(workSpecs) else: tmpQueLog.error('failed to check workers') retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval tmpQueLog.debug('done') return retVal # wrapper for checkWorkers def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log): workersToCheck = [] retMap = dict() for workSpec in all_workers: eventsRequestParams = {} eventsToUpdate = [] pandaIDs = [] workStatus = None workAttributes = None filesToStageOut = [] nJobsToReFill = None # job-level late binding if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob: # check if job is requested jobRequested = messenger.job_requested(workSpec) if jobRequested: # set ready when job is requested workStatus = WorkSpec.ST_ready else: workStatus = workSpec.status elif workSpec.nJobsToReFill in [0, None]: # check if job is requested to refill free slots jobRequested = messenger.job_requested(workSpec) if jobRequested: nJobsToReFill = jobRequested workersToCheck.append(workSpec) else: workersToCheck.append(workSpec) # add retMap[workSpec.workerID] = { 'newStatus': workStatus, 'monStatus': workStatus, 'workAttributes': workAttributes, 'filesToStageOut': filesToStageOut, 'eventsRequestParams': eventsRequestParams, 'eventsToUpdate': eventsToUpdate, 'diagMessage': '', 'pandaIDs': pandaIDs, 'nJobsToReFill': nJobsToReFill } # check workers tmp_log.debug('checking workers with plugin') try: tmpStat, tmpOut = mon_core.check_workers(workersToCheck) if not tmpStat: tmp_log.error( 
'failed to check workers with: {0}'.format(tmpOut)) else: tmp_log.debug('checked') for workSpec, (newStatus, diagMessage) in zip(workersToCheck, tmpOut): workerID = workSpec.workerID tmp_log.debug( 'Going to check workerID={0}'.format(workerID)) pandaIDs = [] if workerID in retMap: # request kill if messenger.kill_requested(workSpec): self.dbProxy.kill_worker(workSpec.workerID) # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat worker_heartbeat_limit = int( queue_config.messenger['worker_heartbeat']) except (AttributeError, KeyError): worker_heartbeat_limit = None tmp_log.debug( 'workerID={0} heartbeat limit is configured to {1}' .format(workerID, worker_heartbeat_limit)) if worker_heartbeat_limit: if messenger.is_alive(workSpec, worker_heartbeat_limit): tmp_log.debug( 'heartbeat for workerID={0} is valid'. format(workerID)) else: tmp_log.debug( 'heartbeat for workerID={0} expired: sending kill request' .format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) # get work attributes workAttributes = messenger.get_work_attributes( workSpec) retMap[workerID]['workAttributes'] = workAttributes # get output files filesToStageOut = messenger.get_files_to_stage_out( workSpec) retMap[workerID]['filesToStageOut'] = filesToStageOut # get events to update if workSpec.eventsRequest in [ WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents ]: eventsToUpdate = messenger.events_to_update( workSpec) retMap[workerID]['eventsToUpdate'] = eventsToUpdate # request events if workSpec.eventsRequest == WorkSpec.EV_useEvents: eventsRequestParams = messenger.events_requested( workSpec) retMap[workerID][ 'eventsRequestParams'] = eventsRequestParams # get PandaIDs for pull model if workSpec.mapType == WorkSpec.MT_NoJob: pandaIDs = messenger.get_panda_ids(workSpec) retMap[workerID]['pandaIDs'] = pandaIDs # keep original new status retMap[workerID]['monStatus'] = newStatus # set running while there are 
events to update or files to stage out if newStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: if len(retMap[workerID]['filesToStageOut']) > 0 or \ len(retMap[workerID]['eventsToUpdate']) > 0: newStatus = WorkSpec.ST_running elif not workSpec.is_post_processed(): if not queue_config.is_no_heartbeat_status( newStatus): # post processing unless heartbeat is suppressed jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, True, only_running=True) # post processing messenger.post_processing( workSpec, jobSpecs, workSpec.mapType) workSpec.post_processed() newStatus = WorkSpec.ST_running # reset modification time to immediately trigger subsequent lookup workSpec.trigger_next_lookup() retMap[workerID]['newStatus'] = newStatus retMap[workerID]['diagMessage'] = diagMessage else: tmp_log.debug( 'workerID={0} not in retMap'.format(workerID)) return True, retMap except: core_utils.dump_error_message(tmp_log) return False, None
class Submitter(AgentBase):
    """Agent that creates and submits workers for queues associated with a site.

    Each cycle: pick a site and its queues from the DB, apply any
    set-n-workers commands from the PanDA server, ask the WorkerAdjuster how
    many new workers each queue/resource-type needs, chunk jobs according to
    the queue's mapping type, make workers, feed jobs to them via the
    messenger, submit them through the submitter plugin, and register the
    results in the local DB (optionally enqueueing them to the monitor FIFO).
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        """Run the submission cycle forever until the agent is terminated."""
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        # fall back to lockInterval when queueLockInterval is not configured
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval,
                lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))
                # get commands from the panda server addressed to this site
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                # NOTE(review): logs the list object itself, probably meant len(commandSpecs)
                mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal
                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)
                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):
                            tmpLog = self.make_logger(_logger,
                                                      'id={0} queue={1} rtype={2}'.format(lockedBy, queueName,
                                                                                          resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']
                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue
                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                            resource_type,
                                                                                            workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No left static workers, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                             nWorkers,
                                                                                             resource_type,
                                                                                             maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                             nWorkers,
                                                                                             resource_type,
                                                                                             maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue
                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good): jobs whose worker could not be made
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK: workers made successfully
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots during the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                # NOTE(review): nJobsToReFill was just reset to None, so this
                                                # slice is okJobs[None:] == all jobs — confirm whether the
                                                # leftover jobs (okJobs[nJobsToReFill:]) were intended here
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        # NOTE(review): jobSpec here is a stale loop variable from the NG/OK
                                        # loops above (may be unbound) — queueName was probably intended
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'.format(jobSpec.computingSite))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'.format(jobSpec.computingSite))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'.format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'.format(workSpec.workerID,
                                                                                                tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                 + sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    # NOTE(review): submit_workers reorders the list (ready/running workers
                                    # first), so okChunks[iWorker] only matches when no worker was skipped —
                                    # confirm against submit_workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID, jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late biding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID,
                                                                               workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList
                                             if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                            getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                    harvester_config.monitor.checkInterval),
                                            getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                    harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue),
                                                         time.time() + check_delay)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(
                        queueLockInterval) + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    # postpone the next submission for this site by minSubmissionInterval
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)
            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        """Submit workers through the plugin, passing through those already ready/running.

        Returns (newSpecList, retList, strList) where ready/running workers come
        first with ret=True and an empty diag, followed by the actually
        submitted workers with the plugin's (ret, diag) pairs.
        """
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        # submit the workers to the monitoring
        self.apfmon.create_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
class Sweeper(AgentBase):
    """Agent that kills workers, cleans up terminated workers, and prunes old data.

    Each cycle has four stages: handle kill commands, kill workers flagged in
    the DB, clean up (sweep + messenger cleanup + DB delete) workers past
    their keep-timeouts, and delete old jobs plus optionally reclaim disk
    space under configured high-watermark directories.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Run the sweep cycle forever until the agent is terminated."""
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            # get commands to kill
            sw_getcomm = core_utils.get_stopwatch()
            mainLog.debug('try to get commands')
            comStr = CommandSpec.COM_killWorkers
            commandSpecs = self.dbProxy.get_commands_for_receiver('sweeper', comStr)
            mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
            for commandSpec in commandSpecs:
                n_to_kill = self.dbProxy.kill_workers_by_query(commandSpec.params)
                mainLog.debug('will kill {0} workers with {1}'.format(n_to_kill, commandSpec.params))
            mainLog.debug('done handling commands' + sw_getcomm.get_elapsed_time())
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    try:
                        sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    except Exception:
                        mainLog.error('failed to launch sweeper plugin for {0}/{1}'.format(queueName, configID))
                        core_utils.dump_error_message(mainLog)
                        continue
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method
                        tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method when the plugin has no kill_workers
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                                tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method succeeded: count how many workers were actually killed
                        n_killed = 0
                        for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug('done killing workerID={0} with status={1} diag={2}'
                                         .format(workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                    mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed (default 24 when not configured)
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending}
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug('making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug('made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                            tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                            tmpLog.debug('messenger cleaned up with status={0} diag={1}'
                                         .format(mc_tmpStat, mc_tmpOut))
                            # delete from DB only when the sweep itself succeeded
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
            # disk cleanup (only when both options are configured)
            if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                    hasattr(harvester_config.sweeper, 'diskHighWatermark'):
                locked = self.dbProxy.get_process_lock('sweeper', self.get_pid(),
                                                       harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
                if locked:
                    try:
                        all_active_files = None
                        for item in harvester_config.sweeper.diskHighWatermark.split(','):
                            # dir name and watermark in GB
                            dir_name, watermark = item.split('|')
                            mainLog.debug('checking {0} for cleanup with watermark {1} GB'
                                          .format(dir_name, watermark))
                            watermark = int(watermark) * 10**9
                            total_size = 0
                            file_dict = {}
                            # scan dir, grouping files by modification time
                            for root, dirs, filenames in walk(dir_name):
                                for base_name in filenames:
                                    full_name = os.path.join(root, base_name)
                                    f_size = os.path.getsize(full_name)
                                    total_size += f_size
                                    mtime = os.path.getmtime(full_name)
                                    file_dict.setdefault(mtime, set())
                                    file_dict[mtime].add((base_name, full_name, f_size))
                            # delete if necessary
                            if total_size < watermark:
                                mainLog.debug('skip cleanup {0} due to total_size {1} GB < watermark {2} GB'
                                              .format(dir_name, total_size // (10**9), watermark // (10**9)))
                            else:
                                mainLog.debug('cleanup {0} due to total_size {1} GB >= watermark {2} GB'
                                              .format(dir_name, total_size // (10**9), watermark // (10**9)))
                                # get active input files (lazily, once per cycle)
                                if all_active_files is None:
                                    all_active_files = self.dbProxy.get_all_active_input_files()
                                deleted_size = 0
                                # delete oldest files first until back below the watermark
                                mtimes = sorted(file_dict.keys())
                                for mtime in mtimes:
                                    for base_name, full_name, f_size in file_dict[mtime]:
                                        # keep if active
                                        if base_name in all_active_files:
                                            continue
                                        try:
                                            os.remove(full_name)
                                        except Exception:
                                            core_utils.dump_error_message(mainLog)
                                        # NOTE(review): f_size is counted as deleted even when os.remove
                                        # failed — confirm whether that is intentional best-effort accounting
                                        deleted_size += f_size
                                        if total_size - deleted_size < watermark:
                                            break
                                    if total_size - deleted_size < watermark:
                                        break
                    except Exception:
                        core_utils.dump_error_message(mainLog)
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
class WorkerAdjuster:
    """Decide how many new workers each queue should get per submission cycle.

    Combines static per-queue counters with queue status from the panda_queues
    cache, optional throttler plugins, and per-queue limits
    (nQueueLimitWorker, maxWorkers, maxNewWorkersPerCycle).
    """

    # constructor
    def __init__(self, queue_config_mapper):
        """Keep references to the queue config mapper, DB proxy and plugin factory."""
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # per-queue throttler plugin cache; None means "no throttler configured"
        self.throttlerMap = dict()

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Return a copy of static_num_workers with 'nNewWorkers' set per queue.

        :param static_num_workers: {queueName: {..., 'nQueue', 'nReady',
            'nRunning', 'nNewWorkers', ...}} as provided by the submitter
        :param site_name: site name, used only for logging
        :return: the updated mapping, or None on error
        """
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name),
                                        method_name='define_num_workers')
        tmpLog.debug('start')
        # shallow copy: per-queue dicts are mutated in place on purpose
        dyn_num_workers = copy.copy(static_num_workers)
        try:
            # get queue status from the cached panda_queues.json
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data
            # define num of new workers
            for queueName, tmpVal in iteritems(static_num_workers):
                # set 0 to num of new workers when the queue is disabled
                if queueName in queueStat and queueStat[queueName]['status'] in ['offline']:
                    dyn_num_workers[queueName]['nNewWorkers'] = 0
                    retMsg = 'set nNewWorkers=0 since status={0}'.format(
                        queueStat[queueName]['status'])
                    tmpLog.debug(retMsg)
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get throttler, instantiating and caching it on first use
                if queueName not in self.throttlerMap:
                    if hasattr(queueConfig, 'throttler'):
                        throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                    else:
                        throttler = None
                    self.throttlerMap[queueName] = throttler
                # check throttler
                throttler = self.throttlerMap[queueName]
                if throttler is not None:
                    toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                    if toThrottle:
                        dyn_num_workers[queueName]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(
                            throttler.__class__.__name__, tmpMsg)
                        tmpLog.debug(retMsg)
                        continue
                # check stats
                nQueue = tmpVal['nQueue']
                nReady = tmpVal['nReady']
                nRunning = tmpVal['nRunning']
                nQueueLimit = queueConfig.nQueueLimitWorker
                maxWorkers = queueConfig.maxWorkers
                if queueConfig.runMode == 'slave':
                    # in slave mode the panda server dictates the target
                    nNewWorkersDef = tmpVal['nNewWorkers']
                    if nNewWorkersDef == 0:
                        dyn_num_workers[queueName]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 by panda in slave mode'
                        tmpLog.debug(retMsg)
                        continue
                else:
                    nNewWorkersDef = None
                # define num of new workers based on static site config
                nNewWorkers = 0
                if nQueueLimit > 0 and nQueue >= nQueueLimit:
                    # enough queued workers
                    retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(
                        nQueue, nQueueLimit)
                    tmpLog.debug(retMsg)
                elif maxWorkers > 0 and (nQueue + nReady + nRunning) >= maxWorkers:
                    # enough workers in the system
                    retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                        nQueue, nReady, nRunning)
                    retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                    tmpLog.debug(retMsg)
                else:
                    # get max number of queued workers
                    maxQueuedWorkers = 0
                    if nQueueLimit > 0:
                        maxQueuedWorkers = nQueueLimit
                    if maxQueuedWorkers == 0:
                        if nNewWorkersDef is not None:
                            # slave mode
                            maxQueuedWorkers = nNewWorkersDef + nQueue
                        else:
                            # use default value
                            maxQueuedWorkers = 1
                    # new workers, capped by the global maxWorkers budget
                    nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                    if maxWorkers > 0:
                        nNewWorkers = min(nNewWorkers,
                                          max(maxWorkers - nQueue - nReady - nRunning, 0))
                if queueConfig.maxNewWorkersPerCycle > 0:
                    nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                dyn_num_workers[queueName]['nNewWorkers'] = nNewWorkers
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # was a bare except: narrowed so SystemExit/KeyboardInterrupt propagate
            core_utils.dump_error_message(tmpLog)
            return None
class CredManager(AgentBase):
    """Agent that keeps grid credentials (proxies/certificates) valid.

    One credential-manager plugin is instantiated per configured
    (module, class, inCertFile, outCertFile, voms) tuple; each cycle checks
    the credential and renews it when invalid.
    """

    # constructor
    def __init__(self, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # get module and class names
        moduleNames = self.get_list(harvester_config.credmanager.moduleName)
        classNames = self.get_list(harvester_config.credmanager.className)
        # file names of original certificates
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.inCertFile)
        else:
            # legacy config attribute name
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        vomses = self.get_list(harvester_config.credmanager.voms)
        # get plugin
        self.exeCores = []
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {}
            pluginPar['module'] = moduleName
            pluginPar['name'] = className
            pluginPar['inCertFile'] = inCertFile
            pluginPar['outCertFile'] = outCertFile
            pluginPar['voms'] = voms
            exeCore = self.pluginFactory.get_plugin(pluginPar)
            self.exeCores.append(exeCore)

    # get list
    def get_list(self, data):
        """Wrap a scalar config value in a list; pass lists through unchanged."""
        if isinstance(data, list):
            return data
        else:
            return [data]

    # main loop
    def run(self):
        """Run execute() cycles until the agent is terminated."""
        while True:
            # execute
            self.execute()
            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime, randomize=False):
                return

    # main
    def execute(self):
        """Check every configured credential and renew invalid ones.

        A DB process lock ensures only one harvester process runs the
        credential check per sleep interval.
        """
        # get lock
        locked = self.dbProxy.get_process_lock('credmanager', self.get_pid(),
                                              harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in self.exeCores:
            # do nothing
            if exeCore is None:
                continue
            # make logger
            mainLog = core_utils.make_logger(_logger, "{0} {1}".format(
                exeCore.__class__.__name__, exeCore.outCertFile),
                method_name='execute')
            # check credential
            mainLog.debug('check credential')
            isValid = exeCore.check_credential()
            if isValid:
                mainLog.debug('valid')
            else:
                # renew it if necessary; was a redundant `elif not isValid:`
                mainLog.debug('invalid')
                mainLog.debug('renew credential')
                tmpStat, tmpOut = exeCore.renew_credential()
                if not tmpStat:
                    mainLog.error('failed : {0}'.format(tmpOut))
                    continue
        mainLog.debug('done')
class Submitter(AgentBase):
    """Agent that turns queued jobs into batch workers and submits them.

    Each cycle picks a site, adjusts per-queue worker counts via
    WorkerAdjuster, builds workers with WorkerMaker according to the queue's
    mapping type, submits them through the queue's submitter plugin, and
    records the outcome in the DB (and optionally the monitor FIFO).
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        """Submission cycle loop; exits when the agent is terminated."""
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))
                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(
                    commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal
                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)
                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):
                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} queue={1} rtype={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                # workers to create = new workers + ready-but-unsubmitted ones
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']
                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue
                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug(
                                        'skipped since no new worker is needed based on current stats'
                                    )
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(
                                    queueName)
                                workerMakerCore = self.workerMaker.get_plugin(
                                    queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') \
                                        and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(
                                        queueConfig, resource_type, workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug(
                                                'staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No left static workers, skip')
                                                continue
                                            else:
                                                nWorkers = min(
                                                    workerMakerCore.staticWorkers - nQRWorkers,
                                                    nWorkers)
                                                tmpLog.debug(
                                                    'staticWorkers: %s, nWorkers: %s' %
                                                    (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') \
                                        and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                        queueConfig, nWorkers, resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                        queueConfig, nWorkers, resource_type,
                                        maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(
                                        queueConfig.mapType))
                                    continue
                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(
                                    jobChunks, queueConfig, nReady, resource_type,
                                    maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(
                                        len(okChunks)))
                                else:
                                    tmpLog.debug(
                                        'made {0} workers, while {1} workers failed'
                                        .format(len(okChunks), len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(
                                                jobSpec, {
                                                    'lockedBy': lockedBy,
                                                    'subStatus': 'prepared'
                                                })
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots during the worker is running
                                                workSpec.set_jobspec_list(
                                                    okJobs[:workSpec.nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                # NOTE(review): nJobsToReFill is reset to None
                                                # before this slice, so okJobs[None:] is the
                                                # full list — verify the intended order is not
                                                # slice first, then reset
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(
                                        queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        # NOTE(review): jobSpec here is a leftover loop
                                        # variable from the chunk loops above — confirm the
                                        # intended value is queueName/queueConfig.queueName
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'
                                            .format(jobSpec.computingSite))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(
                                        queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'
                                            .format(jobSpec.computingSite))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(
                                                workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'
                                                    .format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'
                                                    .format(workSpec.workerID, tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(
                                        len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                        submitterCore, workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(
                                        len(workSpecList)) + sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late biding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params(
                                                {'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(
                                            workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(
                                                            workSpec.workerID,
                                                            jobSpec.PandaID,
                                                            workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(
                                                            workSpec.workerID,
                                                            jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(
                                                        jobSpec.PandaID,
                                                        workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList
                                             if w.status in (WorkSpec.ST_submitted,
                                                             WorkSpec.ST_running)]
                                        monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue),
                                            time.time() + harvester_config.monitor.fifoCheckInterval)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter,
                                         'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(
                            seconds=interval)
                        self.dbProxy.update_panda_queue_attribute(
                            'submitTime', newTime, site_name=siteName)
            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        """Submit workers via the plugin, skipping already-ready/running ones.

        :return: (newSpecList, retList, strList) — workers reordered with
            skipped ones first, per-worker success flags, per-worker messages
        """
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                # already handed to the batch system; report success as-is
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        # submit the workers to the monitoring
        self.apfmon.create_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
class SAGAMonitor(PluginBase):
    """Monitor plugin that queries worker status through a SAGA job adaptor."""

    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.

        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            # BUGFIX: previously the retry result was discarded and execution
            # fell through with job_service undefined (NameError). Return the
            # retried call instead.
            time.sleep(10)
            return self.check_workers(workspec_list)
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger,
                                      'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(
                            workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(
                            workSpec.batchID, harvester_job_state, worker.exit_code))
                    workSpec.set_status(harvester_job_state)
                    # timestamps reported by SAGA use sagadateformat_str
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.strptime(worker.created,
                                                                sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.strptime(worker.started,
                                                               sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.strptime(worker.finished,
                                                             sagadateformat_str)
                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(
                            workSpec.status, workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:
                            # let's try to find exit code, exit message etc...
                            tmpLog.info(
                                "Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, \
                                workSpec.nativeStatus, starttime, endtime, \
                                errStr = self.deep_checkjob(workSpec.batchID,
                                                            workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info(
                            'Worker {2} with BatchID={0} finished with exit code {1} and state {3}'
                            .format(workSpec.batchID, worker.exit_code,
                                    workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(
                            worker.started, worker.finished))
                    if worker.state == saga.job.PENDING:
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than limit {1} worker will be canceled"
                                .format(queue_time, self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(
                                harvester_job_state, workSpec.nativeExitCode))
                        # proper processing of jobs for worker will be required, to avoid 'fake' fails
                except saga.SagaException as ex:
                    tmpLog.info(
                        'An exception occured during retriving worker information {0}'
                        .format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'finished' is not proper state in this case, 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, \
                        workSpec.nativeStatus, starttime, endtime, \
                        errStr = self.deep_checkjob(workSpec.batchID,
                                                    workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(
                        workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)
            else:
                tmpLog.debug(
                    "SAGA monitor found worker [{0}] without batchID".format(
                        workSpec.workerID))
        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))
        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid:
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid),
                                  method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
            if batchjob_info:
                tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
                harvester_job_state = batchjob_info['status']
                nativeexitcode = batchjob_info['nativeExitCode']
                nativestatus = batchjob_info['nativeStatus']
                diagmessage = batchjob_info['nativeExitMsg']
                if batchjob_info['start_time']:
                    starttime = batchjob_info['start_time']
                if batchjob_info['finish_time']:
                    endtime = batchjob_info['finish_time']
        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
class Monitor(AgentBase):
    """Agent that tracks worker status via the queue's monitor plugin and
    propagates the results to jobs and the local database."""

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Monitoring cycle loop; exits when the agent is terminated."""
        lockedBy = 'monitor-{0}'.format(self.ident)
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy),
                                             method_name='run')
            mainLog.debug('getting workers to monitor')
            workSpecsPerQueue = self.dbProxy.get_workers_to_update(
                harvester_config.monitor.maxWorkers,
                harvester_config.monitor.checkInterval,
                harvester_config.monitor.lockInterval, lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecsList in iteritems(workSpecsPerQueue):
                tmpQueLog = core_utils.make_logger(
                    _logger, 'id={0} queue={1}'.format(lockedBy, queueName),
                    method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get plugins
                monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # check workers: flatten the chunk list for a single plugin call
                allWorkers = [item for sublist in workSpecsList for item in sublist]
                tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
                tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                               queueConfig, tmpQueLog)
                # loop over all worker chunks
                tmpQueLog.debug('update jobs and workers')
                iWorker = 0
                for workSpecs in workSpecsList:
                    jobSpecs = None
                    filesToStageOut = dict()
                    pandaIDsList = []
                    eventsToUpdateList = []
                    filesToStageOutList = []
                    for workSpec in workSpecs:
                        tmpLog = core_utils.make_logger(
                            _logger, 'workerID={0}'.format(workSpec.workerID),
                            method_name='run')
                        # result of check_workers for this worker
                        tmpOut = tmpRetMap[workSpec.workerID]
                        newStatus = tmpOut['newStatus']
                        monStatus = tmpOut['monStatus']
                        diagMessage = tmpOut['diagMessage']
                        workAttributes = tmpOut['workAttributes']
                        eventsToUpdate = tmpOut['eventsToUpdate']
                        filesToStageOut = tmpOut['filesToStageOut']
                        eventsRequestParams = tmpOut['eventsRequestParams']
                        nJobsToReFill = tmpOut['nJobsToReFill']
                        pandaIDs = tmpOut['pandaIDs']
                        tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                        tmpStr += 'postProcessed={3} files={4}'
                        tmpLog.debug(
                            tmpStr.format(newStatus, monStatus, diagMessage,
                                          workSpec.is_post_processed(),
                                          str(filesToStageOut)))
                        iWorker += 1
                        # check status
                        if newStatus not in WorkSpec.ST_LIST:
                            tmpLog.error('unknown status={0}'.format(newStatus))
                            continue
                        # update worker
                        workSpec.set_status(newStatus)
                        workSpec.set_work_attributes(workAttributes)
                        workSpec.set_dialog_message(diagMessage)
                        # request events
                        if eventsRequestParams != {}:
                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                            workSpec.eventsRequestParams = eventsRequestParams
                        # jobs to refill
                        if nJobsToReFill is not None:
                            workSpec.nJobsToReFill = nJobsToReFill
                        # get associated jobs for the worker chunk
                        # (fetched once per chunk, on the first worker with a job)
                        if workSpec.hasJob == 1 and jobSpecs is None:
                            jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                workSpec.workerID, None, only_running=True)
                        # pandaIDs for push
                        pandaIDsList.append(pandaIDs)
                        if len(eventsToUpdate) > 0:
                            eventsToUpdateList.append(eventsToUpdate)
                        if len(filesToStageOut) > 0:
                            filesToStageOutList.append(filesToStageOut)
                    # update jobs and workers
                    if jobSpecs is not None:
                        tmpQueLog.debug(
                            'updating {0} jobs with {1} workers'.format(
                                len(jobSpecs), len(workSpecs)))
                        core_utils.update_job_attributes_with_workers(
                            queueConfig.mapType, jobSpecs, workSpecs,
                            filesToStageOutList, eventsToUpdateList)
                        for jobSpec in jobSpecs:
                            tmpLog = core_utils.make_logger(
                                _logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                method_name='run')
                            tmpLog.debug(
                                'new status={0} subStatus={1} status_in_metadata={2}'
                                .format(jobSpec.status, jobSpec.subStatus,
                                        jobSpec.get_job_status_from_attributes()))
                    # update local database
                    tmpRet = self.dbProxy.update_jobs_workers(
                        jobSpecs, workSpecs, lockedBy, pandaIDsList)
                    if not tmpRet:
                        for workSpec in workSpecs:
                            tmpLog = core_utils.make_logger(
                                _logger, 'workerID={0}'.format(workSpec.workerID),
                                method_name='run')
                            tmpLog.error(
                                'failed to update the DB. lockInterval may be too short'
                            )
                    # send ACK to workers for events and files
                    if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                        for workSpec in workSpecs:
                            messenger.acknowledge_events_files(workSpec)
                tmpQueLog.debug('done')
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.monitor.sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log):
        """Check all_workers via the monitor plugin and the messenger.

        :return: dict keyed by workerID with newStatus, monStatus,
            workAttributes, filesToStageOut, eventsRequestParams,
            eventsToUpdate, diagMessage, pandaIDs, nJobsToReFill
        """
        workersToCheck = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = None
            nJobsToReFill = None
            # job-level late binding
            if workSpec.hasJob == 0 and queue_config.mapType != WorkSpec.MT_NoJob:
                # check if job is requested
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    # set ready when job is requested
                    workStatus = WorkSpec.ST_ready
                else:
                    workStatus = workSpec.status
            elif workSpec.nJobsToReFill in [0, None]:
                # check if job is requested to refill free slots
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    nJobsToReFill = jobRequested
                workersToCheck.append(workSpec)
            else:
                workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {'newStatus': workStatus,
                                         'monStatus': workStatus,
                                         'workAttributes': workAttributes,
                                         'filesToStageOut': filesToStageOut,
                                         'eventsRequestParams': eventsRequestParams,
                                         'eventsToUpdate': eventsToUpdate,
                                         'diagMessage': '',
                                         'pandaIDs': pandaIDs,
                                         'nJobsToReFill': nJobsToReFill}
        # check workers
        tmp_log.debug('checking workers with plugin')
        tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
        if not tmpStat:
            tmp_log.error('failed to check workers with {0}'.format(tmpOut))
        else:
            tmp_log.debug('checked')
            for workSpec, (newStatus, diagMessage) in zip(workersToCheck, tmpOut):
                workerID = workSpec.workerID
                pandaIDs = []
                if workerID in retMap:
                    # request kill
                    if messenger.kill_requested(workSpec):
                        self.dbProxy.kill_worker(workSpec.workerID)
                    # get output files
                    filesToStageOut = messenger.get_files_to_stage_out(workSpec)
                    retMap[workerID]['filesToStageOut'] = filesToStageOut
                    # get events to update
                    if workSpec.eventsRequest in [WorkSpec.EV_useEvents,
                                                  WorkSpec.EV_requestEvents]:
                        eventsToUpdate = messenger.events_to_update(workSpec)
                        retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                    # request events
                    if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                        eventsRequestParams = messenger.events_requested(workSpec)
                        retMap[workerID]['eventsRequestParams'] = eventsRequestParams
                    # get PandaIDs for pull model
                    if queue_config.mapType == WorkSpec.MT_NoJob:
                        pandaIDs = messenger.get_panda_ids(workSpec)
                    retMap[workerID]['pandaIDs'] = pandaIDs
                    # keep original new status
                    retMap[workerID]['monStatus'] = newStatus
                    # set running while there are events to update or files to stage out
                    if newStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed,
                                     WorkSpec.ST_cancelled]:
                        if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                len(retMap[workerID]['eventsToUpdate']) > 0:
                            newStatus = WorkSpec.ST_running
                        elif not workSpec.is_post_processed():
                            if not queue_config.is_no_heartbeat_status(newStatus):
                                # post processing unless heartbeat is suppressed
                                jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                    workSpec.workerID, None, True,
                                    only_running=True)
                                # post processing
                                messenger.post_processing(workSpec, jobSpecs,
                                                          queue_config.mapType)
                            workSpec.post_processed()
                            newStatus = WorkSpec.ST_running
                        # reset modification time to immediately trigger subsequent lookup
                        workSpec.trigger_next_lookup()
                    # get work attributes so that they can be updated in post_processing if any
                    workAttributes = messenger.get_work_attributes(workSpec)
                    retMap[workerID]['workAttributes'] = workAttributes
                    retMap[workerID]['newStatus'] = newStatus
                    retMap[workerID]['diagMessage'] = diagMessage
        return retMap
class SAGAMonitor(PluginBase):
    """Harvester monitor plugin that tracks worker (batch job) status via a SAGA adaptor.

    The adaptor URL (self.adaptor) and queue name (self.queueName) are injected
    by PluginBase from the queue configuration.
    """

    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.

        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of
                 (status, error string) pairs, one per worker that has a batchID.
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            time.sleep(10)
            # FIX: return the retry's result. Previously the recursive call's return value
            # was discarded and execution fell through with job_service unbound, raising
            # NameError on the first use below.
            return self.check_workers(workspec_list)
        # timestamp format reported by SAGA, e.g. 'Mon Jan 01 00:00:00 2018'
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        # method-level logger so the trailing summary debug works even for an empty list
        # (previously tmpLog was first bound inside the loop)
        tmpLog = self.make_logger(baseLogger, method_name='check_workers')
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    # note: the original called set_status twice back to back; once is enough
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID,
                                                                                       harvester_job_state,
                                                                                       worker.exit_code))
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.strptime(worker.created, sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.strptime(worker.started, sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.strptime(worker.finished, sagadateformat_str)
                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status,
                                                                                        workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:
                            # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, \
                                endtime, errStr = self.deep_checkjob(workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))
                    if worker.state == saga.job.PENDING:
                        # cancel workers that sit in the queue longer than the (optional) maxqueuetime
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than limit {1} worker will be canceled".format(
                                    queue_time, self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                    tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                workSpec.nativeExitCode))
                    # proper processing of jobs for worker will be required, to avoid 'fake' fails
                except saga.SagaException as ex:
                    # worker unknown to SAGA (e.g. already purged by the batch system):
                    # fall back to the resource-specific deep check
                    tmpLog.info('An exception occured during retriving worker information {0}'.format(
                        workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'fnished' is not proper state in this case, 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, \
                        endtime, errStr = self.deep_checkjob(workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                # FIX: context manager guarantees the file is closed even if write() raises
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)
            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))
        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))
        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid: batch system job id of the worker
        :param workerid: harvester worker id (used only for logging context)
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        # the resource plugin is optional; without it all fields stay at their defaults
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
            if batchjob_info:
                tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
                harvester_job_state = batchjob_info['status']
                nativeexitcode = batchjob_info['nativeExitCode']
                nativestatus = batchjob_info['nativeStatus']
                diagmessage = batchjob_info['nativeExitMsg']
                if batchjob_info['start_time']:
                    starttime = batchjob_info['start_time']
                if batchjob_info['finish_time']:
                    endtime = batchjob_info['finish_time']
        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
class EventFeeder(AgentBase):
    """Agent that fetches event ranges from PanDA (via the communicator) and feeds
    them to workers that requested events, using the queue's messenger plugin.
    """

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main agent loop: repeatedly pick workers waiting for events, fetch event
        ranges for them, hand the events to the messenger, and update the DB record.
        Exits when the agent is terminated.
        """
        # unique lock owner id so concurrent event feeders do not process the same worker
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval,
                                                                        lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # scatteredEvents is an optional queue-config flag passed through to the communicator
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams, scattered)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again (re-checked because get_event_ranges is a slow remote call)
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(
                            len(eventList), pandaID, nRanges))
                        # disable multi workers when fewer events than requested came back
                        # (presumably means the job is running out of events — see disable_multi_workers)
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker: clear the event request so the worker is not fed again
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database (only while we still hold the feed lock)
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
class Sweeper(AgentBase):
    """Agent that kills workers flagged for termination, cleans up terminated
    workers past their retention period, and deletes old job records.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main agent loop with three stages per cycle:
        1. kill workers flagged for termination,
        2. clean up (sweep + messenger cleanup + DB delete) workers past retention,
        3. delete old and orphaned job records.
        Exits when the agent is terminated.
        """
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method; AttributeError means the plugin only
                        # implements the per-worker kill_worker API
                        tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                                tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method succeeded: count and log the kills
                        n_killed = 0
                        for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                                workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                    mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed (retention hours; defaults to 24 when not configured)
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                    statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug('making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug('made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                            tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                            tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat,
                                                                                               mc_tmpOut))
                            # the DB record is deleted only when the sweep itself succeeded;
                            # messenger cleanup failure alone does not block deletion
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            # keep jobs slightly longer than the longest worker retention
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
# begin_job_id = int(sys.argv[2]) #if len(sys.argv) > 3: # end_job_id = int(sys.argv[3]) #if len(sys.argv) > 4: # globus_sleep_time = int(sys.argv[4]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_stager = queueConfig.stager queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_bulk_stager' queueConfig.stager['name'] = 'GlobusBulkStager' modified_queueConfig_stager = queueConfig.stager pluginFactory = PluginFactory() # get stage-out plugin stagerCore = pluginFactory.get_plugin(queueConfig.stager) # logger _logger = core_utils.setup_logger('further_testing_go_bulk_stager') tmpLog = core_utils.make_logger(_logger, method_name='further_testing_go_bulk_stager') tmpLog.debug('start') for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): #print "loggerName - {}".format(loggerName) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout)
if len(sys.argv) > 3: end_job_id = int(sys.argv[3]) if len(sys.argv) > 4: globus_sleep_time = int(sys.argv[4]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_preparator = queueConfig.preparator queueConfig.preparator[ 'module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator' queueConfig.preparator['name'] = 'GlobusBulkPreparator' modified_queueConfig_preparator = queueConfig.preparator pluginFactory = PluginFactory() # get stage-out plugin preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) # logger _logger = core_utils.setup_logger('stageInTest_go_bulk_preparator') tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_bulk_preparator') tmpLog.debug('start') for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): #print "loggerName - {}".format(loggerName) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout)
'computingElement', 1, None) if len(jobs) == 0: print ("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr)) sys.exit(0) jobSpec = JobSpec() jobSpec.convert_job_json(jobs[0]) # set input file paths inFiles = jobSpec.get_input_file_attributes() for inLFN, inFile in iteritems(inFiles): inFile['path'] = '{0}/{1}'.format(os.getcwd(), inLFN) jobSpec.set_input_file_paths(inFiles) jobSpecList.append(jobSpec) maker = pluginFactory.get_plugin(queueConfig.workerMaker) workSpec = maker.make_worker(jobSpecList, queueConfig, 'SCORE') # TODO: needs to be thought workSpec.accessPoint = queueConfig.messenger['accessPoint'] workSpec.mapType = queueConfig.mapType workSpec.computingSite = queueConfig.queueName # set job to worker if not job-level late binding if not queueConfig.useJobLateBinding: workSpec.hasJob = 1 workSpec.set_jobspec_list(jobSpecList) messenger = pluginFactory.get_plugin(queueConfig.messenger) messenger.setup_access_points([workSpec]) # get plugin for messenger
class MultiNodeWorkerMaker(BaseWorkerMaker):
    """Worker maker that packs several jobs into one multi-node batch worker.

    Configuration attributes (self.mode, self.nNodes, self.nJobsPerNode, self.env,
    self.executor, self.pilot, self.pilot_params, self.nCorePerJob, self.walltimelimit)
    are injected by BaseWorkerMaker from the queue configuration.
    """

    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            # "static": node count and walltime come from configuration as-is
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            # "dynamic": ask the resource plugin for currently available nodes/walltime
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        # return string which contain body of script for scheduler: specific enviroment setup, executor with parameters
        exe_str = ""
        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')
        # prepare static enviroment (self.env is a ", "-separated list of setup lines)
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", ")))
        # prepare executor
        try:
            if self.executor == "aprun":
                # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(
                    self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            tmpLog.error(
                "Unable to build executor command check configuration")
            exe_str = ""
        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)
        return exe_str

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, resource_type):
        """Build one WorkSpec sized for nNodes nodes / nJobsPerWorker jobs.

        :param jobspec_list: jobs to aggregate into the worker (may be empty in pull mode)
        :param queue_config: queue configuration (submitter['nCorePerNode'] is read)
        :param resource_type: unused here; part of the worker-maker interface
        :return: the configured WorkSpec
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(
            queue_config.queueName), method_name='make_worker')
        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(
            self.nJobsPerWorker, self.walltimelimit, self.nNodes))
        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()
        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            # (missing jobParams keys are silently skipped — best effort aggregation)
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info(
            "Worker for {0} nodes with {2} jobs with walltime {1} sec. defined"
            .format(self.nNodes, workSpec.maxWalltime, self.nJobsPerWorker))
        return workSpec

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Function to get resourcese and map them to number of jobs

        :return: (nodes, walltime) from the resource plugin, or the configured
                 defaults when no resource plugin is defined for the queue
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes
        return nodes, walltime
class WorkerAdjuster(object):
    """Decides how many new workers to submit per queue and resource type,
    applying queue status, throttlers, central (slave-mode) limits and
    per-cycle/aggregate caps.
    """

    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # per-queue throttler plugin cache (None when the queue has no throttler)
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queueConfigMapper)
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            # universal cap is optional in the harvester configuration
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Fill in nNewWorkers for each (queue, resource type) entry.

        :param static_num_workers: {queueName: {resourceType: {'nQueue', 'nReady',
               'nRunning', 'nNewWorkers', ...}}} — counts from the DB plus centrally
               proposed nNewWorkers
        :param site_name: site name, used only for logging context
        :return: a deep copy of static_num_workers with 'nNewWorkers' set,
                 or None on error
        """
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name),
                                        method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data
            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data
            # define num of new workers
            for queueName in static_num_workers:
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                workerLimits_dict = self.dbProxy.get_worker_limits(queueName)
                maxWorkers = workerLimits_dict.get('maxWorkers', 0)
                nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0)
                nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT']
                nQueue_total, nReady_total, nRunning_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))
                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby',
                                                                                    'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers since queue status = {0}'.format(
                            queueStat[queueName]['status'])
                        continue
                    # protection against not-up-to-date queue config
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers because of missing queueConfig'
                        continue
                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler
                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue
                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    # 'ANY' is the aggregate pseudo resource type; don't double count it
                    if resource_type != 'ANY':
                        nQueue_total += nQueue
                        nReady_total += nReady
                        nRunning_total += nRunning
                    if queueConfig.runMode == 'slave':
                        # in slave mode PanDA dictates the number of new workers
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None
                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimitPerRT > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue,
                                                                                                  nQueueLimitPerRT)
                        tmpLog.debug(retMsg)
                        pass
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue,
                                                                                                          nReady,
                                                                                                          nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                        pass
                    else:
                        maxQueuedWorkers = None
                        if nQueueLimitPerRT > 0:  # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimitPerRT
                        # Reset the maxQueueWorkers according to particular
                        if nNewWorkersDef is not None:  # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave
                        elif queueConfig.mapType == 'NoJob':  # for pull mode, limit to activated jobs
                            # limit the queue to the number of activated jobs to avoid empty pilots
                            # NOTE(review): if maxQueuedWorkers is still None here,
                            # min(n_activated, None) relies on Python-2 ordering semantics — verify
                            try:
                                n_activated = max(job_stats[queueName]['activated'], 1)  # avoid no activity queues
                                queue_limit = maxQueuedWorkers
                                maxQueuedWorkers = min(n_activated, maxQueuedWorkers)
                                tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'.
                                             format(n_activated, queue_limit))
                            except KeyError:
                                tmpLog.warning('n_activated not defined, defaulting to configured queue limits')
                                pass
                        if maxQueuedWorkers is None:  # no value found, use default value
                            maxQueuedWorkers = 1
                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                        if queueConfig.maxNewWorkersPerCycle > 0:
                            nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                            tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                         .format(nNewWorkers))
                        if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                            nNewWorkers = min(nNewWorkers, self.maxNewWorkers)
                            tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers'
                                         .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
                if queueConfig is None:
                    maxNewWorkersPerCycle = 0
                    retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig'
                    tmpLog.debug(retMsg)
                else:
                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
                if len(dyn_num_workers[queueName]) > 1:
                    total_new_workers_rts = sum(
                        dyn_num_workers[queueName][_rt]['nNewWorkers']
                        if _rt != 'ANY' else 0
                        for _rt in dyn_num_workers[queueName])
                    nNewWorkers_max_agg = min(
                        max(nQueueLimit - nQueue_total, 0),
                        max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0),
                    )
                    if maxNewWorkersPerCycle >= 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > nNewWorkers_max_agg:
                        if nNewWorkers_max_agg == 0:
                            for resource_type in dyn_num_workers[queueName]:
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
                        else:
                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
                            # rescale each real resource type proportionally (largest-remainder method):
                            # first the integer quotient per RT, then distribute the leftover by remainder
                            _d = dyn_num_workers[queueName].copy()
                            del _d['ANY']
                            simple_rt_nw_list = [
                                [_rt, _d[_rt].get('nNewWorkers', 0), 0]
                                for _rt in _d
                            ]
                            _countdown = nNewWorkers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                resource_type, nNewWorkers_orig, _r = _rt_list
                                nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg,
                                                                total_new_workers_rts)
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                                _rt_list[2] = remainder
                                _countdown -= nNewWorkers
                            # hand out the remaining slots to the RTs with the largest remainders
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                        for resource_type in dyn_num_workers[queueName]:
                            if resource_type == 'ANY':
                                continue
                            nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
                            tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'
                                         .format(nNewWorkers, resource_type))
                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queueName])
                self.apf_mon.update_label(queueName, apf_msg, apf_data)
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            errMsg = core_utils.dump_error_message(tmpLog)
            return None
'computingElement', 1, None) if len(jobs) == 0: print("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr)) sys.exit(0) jobSpec = JobSpec() jobSpec.convert_job_json(jobs[0]) # set input file paths inFiles = jobSpec.get_input_file_attributes() for inLFN, inFile in iteritems(inFiles): inFile['path'] = '{0}/{1}'.format(os.getcwd(), inLFN) jobSpec.set_input_file_paths(inFiles) jobSpecList.append(jobSpec) maker = pluginFactory.get_plugin(queueConfig.workerMaker) workSpec = maker.make_worker(jobSpecList, queueConfig, jobType, resourceType) workSpec.accessPoint = queueConfig.messenger['accessPoint'] workSpec.mapType = queueConfig.mapType workSpec.computingSite = queueConfig.queueName # set job to worker if not job-level late binding if not queueConfig.useJobLateBinding: workSpec.hasJob = 1 workSpec.set_jobspec_list(jobSpecList) messenger = pluginFactory.get_plugin(queueConfig.messenger) messenger.setup_access_points([workSpec]) # get plugin for messenger
class MultiNodeWorkerMaker(BaseWorkerMaker):
    """Worker maker that packs multiple jobs into one multi-node batch worker.

    Builds a WorkSpec sized to span several nodes, with the shell payload
    (environment setup + executor command) generated by _get_executable().
    Supports a "static" mode (fixed node count from configuration) and a
    "dynamic" mode where node count and walltime are obtained at runtime
    from a resource plugin via get_resources().
    """

    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            # static: nNodes/walltimelimit come straight from configuration
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            # dynamic: ask the resource plugin how many nodes / how long
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        # total jobs a single worker will carry
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        """Return the body of the scheduler script: environment setup lines
        followed by the executor command (with parameters) that launches the
        pilot.
        """
        # return string which contain body of script for scheduler: specific enviroment setup, executor with parameters
        exe_str = ""
        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')
        # prepare static enviroment: self.env is a ", "-separated list of
        # shell statements, emitted one per line
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", ")))
        # prepare executor
        try:
            if self.executor == "aprun":
                # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]"
                # - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            # missing/odd plugin attributes: log and fall back to empty command
            tmpLog.error("Unable to build executor command check configuration")
            exe_str = ""
        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)
        return exe_str

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, resource_type):
        """Create one multi-node WorkSpec for the given jobs.

        :param jobspec_list: jobs to attach (may be empty in pull mode)
        :param queue_config: queue configuration; submitter['nCorePerNode']
                             is used to size nCore
        :param resource_type: resource type label (unused here)
        :return: populated WorkSpec
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')
        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker,
                                                                             self.walltimelimit, self.nNodes))
        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            for jobSpec in jobspec_list:
                try:
                    # accumulate RAM/disk over all attached jobs; missing
                    # jobParams keys are simply skipped
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes,
                                                                                              workSpec.maxWalltime,
                                                                                              self.nJobsPerWorker))
        return workSpec

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Return (nodes, walltime) from the queue's resource plugin; when no
        plugin is configured, fall back to the statically configured node
        count and the current walltime limit.
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes
        return nodes, walltime
class Stager(AgentBase):
    """Agent that drives output stage-out: checks ongoing transfers,
    triggers new stage-out requests, and zips output files, using the
    stager plugin configured per queue.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main loop: repeatedly (1) check status of transferring jobs,
        (2) trigger stage-out for to_transfer jobs, (3) zip outputs,
        until terminated. Each job is re-locked before any plugin call so
        concurrent stager threads do not process the same job.
        """
        # unique lock owner id for this process
        lockedBy = 'stager-{0}'.format(self.get_pid())
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
            except Exception:
                # config knob is optional; None means no per-job file cap
                maxFilesPerJob = None
            jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                              harvester_config.stager.checkInterval,
                                                              harvester_config.stager.lockInterval,
                                                              lockedBy, 'transferring',
                                                              JobSpec.HO_hasTransfer,
                                                              max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock',
                                                              lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                        # update job: mark unfinished output files as failed
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # on-going: tmpStat is None, check again later
                        tmpLog.debug('try to check later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger stage-out
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
            except Exception:
                maxFilesPerJob = None
            jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                                harvester_config.stager.triggerInterval,
                                                                harvester_config.stager.lockInterval,
                                                                lockedBy, 'to_transfer',
                                                                JobSpec.HO_hasOutput,
                                                                JobSpec.HO_hasZipOutput,
                                                                max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
            # loop over all jobs
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to trigger stage-out')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock',
                                                              lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger stage-out
                    tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                    # check result
                    if tmpStat is True:
                        # succeeded
                        jobSpec.all_files_triggered_to_stage_out()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                        # update job: mark unfinished output files as failed
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # temporary error
                        tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to zip output
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
            except Exception:
                maxFilesPerJob = None
            try:
                zipInterval = harvester_config.stager.zipInterval
            except Exception:
                # fall back to the trigger interval when zipInterval is unset
                zipInterval = harvester_config.stager.triggerInterval
            jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                            zipInterval,
                                                            harvester_config.stager.lockInterval,
                                                            lockedBy, 'to_transfer',
                                                            JobSpec.HO_hasZipOutput,
                                                            JobSpec.HO_hasOutput,
                                                            max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
            # loop over all jobs
            for jobSpec in jobsToZip:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID),
                                          method_name='run')
                try:
                    tmpLog.debug('try to zip output')
                    # configID
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite,
                                                                                 configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock',
                                                              lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                    # succeeded
                    if tmpStat is True:
                        # update job; note the False flag (zip, not transfer, completed)
                        jobSpec.all_files_zipped()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy)
                        tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                    else:
                        # failed
                        tmpLog.debug('failed to zip with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.stager.sleepTime):
                mainLog.debug('terminated')
                return
oFile.close() fileSpec.add_associated_file(assFileSpec) jobSpec = JobSpec() jobSpec.jobParams = {'outFiles': fileSpec.lfn + ',log', 'scopeOut': 'panda', 'scopeLog': 'panda', 'logFile': 'log', 'realDatasets': 'panda.' + fileSpec.lfn, 'ddmEndPointOut': 'BNL-OSG2_DATADISK', } jobSpec.add_out_file(fileSpec) pluginFactory = PluginFactory() # get stage-out plugin stagerCore = pluginFactory.get_plugin(queueConfig.stager) print ("plugin={0}".format(stagerCore.__class__.__name__)) print ("testing zip") tmpStat, tmpOut = stagerCore.zip_output(jobSpec) if tmpStat: print (" OK") else: print (" NG {0}".format(tmpOut)) print () print ("testing stage-out") transferID = None tmpStat, tmpOut = stagerCore.trigger_stage_out(jobSpec) if tmpStat:
class Sweeper(AgentBase):
    """Agent that kills workers flagged for termination and cleans up
    workers (and old jobs) whose retention time has expired, using the
    sweeper plugin configured per queue.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main loop: kill workers marked for killing, sweep workers past
        their per-status retention timeout (deleting them from the DB on
        success), purge old jobs, then sleep; repeat until terminated.
        """
        # unique lock owner id for this process
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers, grouped by queue then by configID
            for queueName, configIdWorkSpecs in iteritems(workersToKill):
                for configID, workSpecs in iteritems(configIdWorkSpecs):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start killing')
                            tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                            tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
            mainLog.debug('done kill')
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                # default retention (hours) when not configured
                keepMissed = 24
            keepPending = 24
            # get workers for cleanup; map worker status -> retention timeout
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            for queueName, configIdWorkSpecs in iteritems(workersForCleanup):
                for configID, workSpecs in iteritems(configIdWorkSpecs):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleanup')
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                            tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                            if tmpStat:
                                # delete from DB only after the plugin swept successfully
                                self.dbProxy.delete_worker(workSpec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
            # delete old jobs; timeout is one hour past the longest worker retention
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            mainLog.debug('done cleanup')
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return