class Propagator(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self._last_stats_update = None
        self._last_metrics_update = None

    # main loop
    def run(self):
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting jobs to propagate')
            sw = core_utils.get_stopwatch()
            jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                          harvester_config.propagator.lockInterval,
                                                          harvester_config.propagator.updateInterval,
                                                          self.get_pid())
            mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time()))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            hbSuppressMap = dict()
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                       tmpJobSpec.configID)
                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                            not tmpJobSpec.not_suppress_heartbeat():
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                sw.reset()
                retList += self.communicator.check_jobs(jobListToCheck)
                mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time()))
                sw.reset()
                retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid())
                mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate),
                                                                         sw.get_elapsed_time()))
                # logging
                for tmpJobSpec, tmpRet in zip(jobListToSkip + jobListToCheck + jobListToUpdate, retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                  tmpJobSpec.status))
                        else:
                            mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                        tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                            tmpJobSpec.modificationTime = datetime.datetime.utcnow()
                        elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done():
                            # trigger next propagation to update remaining events
                            tmpJobSpec.trigger_propagation()
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                                if tmpEvStat:
                                    if tmpEvRet is not None:
                                        tmpJobSpec.nRemainingEvents = tmpEvRet
                                    if tmpEvRet == 0:
                                        mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID))
                                        tmpRet['command'] = 'tobekilled'
                        # got kill command
                        if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                            nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                            if nWorkers == 0:
                                # no workers
                                tmpJobSpec.status = 'cancelled'
                                tmpJobSpec.subStatus = 'killed'
                                tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                           PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL])
                                tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                tmpJobSpec.trigger_propagation()
                        self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()})
                    else:
                        mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                       tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            sw.reset()
            workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                              harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time()))
            # update workers in central database
            sw.reset()
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                   tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                    logOffset, logSize)
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset + logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                            tmpWorkSpec.status))
            mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers, sw.get_elapsed_time()))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error('failed to get worker stats for {0}'.format(siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                        if tmpRet:
                            mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(
                                siteName, tmpStr))
            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:
                # get active UPS queues. PanDA server needs to know about them and which harvester instance
                # is taking care of them
                active_ups_queues = self.queueConfigMapper.get_active_ups_queues()
                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues)
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(
                                site_name, tmp_str))
            if not self._last_metrics_update \
                    or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD):
                # get latest metrics from DB
                service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update)
                if not service_metrics_list:
                    mainLog.error('failed to get service metrics')
                    self._last_metrics_update = datetime.datetime.utcnow()
                else:
                    tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list)
                    if tmp_ret:
                        mainLog.debug('update of service metrics OK')
                        self._last_metrics_update = datetime.datetime.utcnow()
                    else:
                        mainLog.error('failed to update service metrics err={0}'.format(tmp_str))
            # send dialog messages
            mainLog.debug('getting dialog messages to propagate')
            try:
                maxDialogs = harvester_config.propagator.maxDialogs
            except Exception:
                maxDialogs = 50
            diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs,
                                                                 harvester_config.propagator.lockInterval)
            mainLog.debug('got {0} dialogs'.format(len(diagSpecs)))
            if len(diagSpecs) > 0:
                tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs)
                if tmpStat:
                    diagIDs = [diagSpec.diagID for diagSpec in diagSpecs]
                    self.dbProxy.delete_dialog_messages(diagIDs)
                    mainLog.debug('sent {0} dialogs'.format(len(diagSpecs)))
                else:
                    mainLog.error('failed to send dialogs err={0}'.format(tmpStr))
            if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval:
                mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time())
            else:
                mainLog.debug('done' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
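

# --- Illustrative sketch (not part of the harvester code above) ---
# The job loop in Propagator.run() routes each job into one of three lists based on the
# queue's no-heartbeat status list: suppressed running jobs are checked for lost heartbeats,
# other suppressed jobs are skipped, and everything else is updated on the PanDA server.
# A minimal, self-contained sketch of that partitioning; the Job class and the status values
# are simplified stand-ins, not harvester classes.
from dataclasses import dataclass
from typing import Iterable, List, Set, Tuple


@dataclass
class Job:
    panda_id: int
    status: str                    # e.g. 'running', 'transferring', 'finished'
    force_heartbeat: bool = False  # analogue of JobSpec.not_suppress_heartbeat()


def route_jobs(jobs: Iterable[Job], no_heartbeat: Set[str]) -> Tuple[List[Job], List[Job], List[Job]]:
    """Split jobs into (to_check, to_update, skipped) the way Propagator does."""
    to_check, to_update, skipped = [], [], []
    for job in jobs:
        if job.status in no_heartbeat and not job.force_heartbeat:
            if job.status == 'running':
                # still check running jobs so lost heartbeats can be detected
                to_check.append(job)
            else:
                skipped.append(job)
        else:
            to_update.append(job)
    return to_check, to_update, skipped


# usage: route_jobs([Job(1, 'running'), Job(2, 'transferring')], {'running', 'transferring'})
# -> to_check holds job 1, to_update is empty, skipped holds job 2.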
class Submitter(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.workerMaker = WorkerMaker() self.workerAdjuster = WorkerAdjuster(queue_config_mapper) self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'submitter-{0}'.format(self.ident) while True: mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit( harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime) mainLog.debug('got {0} queues for site {1}'.format( len(curWorkers), siteName)) # get commands if siteName is not None: comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver( 'submitter', comStr) mainLog.debug('got {0} commands'.format(len(commandSpecs))) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit( siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][ 'nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: nWorkersPerQueue = dict() else: nWorkersPerQueue = self.workerAdjuster.define_num_workers( curWorkers, siteName) if nWorkersPerQueue is None: mainLog.error( 'WorkerAdjuster failed to define the number of workers') elif len(nWorkersPerQueue) == 0: pass else: # loop over all queues for queueName, tmpVal in iteritems(nWorkersPerQueue): tmpLog = core_utils.make_logger( _logger, 'queue={0}'.format(queueName), method_name='run') tmpLog.debug('start') nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue(queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug( 'skipped since no new worker is needed based on current stats' ) continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( queueConfig, nWorkers) tmpLog.debug( 'nJobsPerWorker={0}'.format(nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job( queueConfig, nWorkers) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, 
harvester_config.submitter.lockInterval, lockedBy) else: tmpLog.error('unknown mapType={0}'.format( queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers( jobChunks, queueConfig, nReady) if len(ngChunks) == 0: tmpLog.debug('successfully made {0} workers'.format( len(okChunks))) else: tmpLog.debug( 'made {0} workers, while {1} workers failed'. format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() # NG for ngJobs in ngChunks: for jobSpec in ngJobs: jobSpec.status = 'failed' jobSpec.subStatus = 'failedtomake' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': 'prepared' }) # OK pandaIDs = set() workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [None, 0]: workSpec.set_jobspec_list(okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list( okJobs[:workSpec.nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[workSpec. nJobsToReFill:]: pandaIDs.add(jobSpec.PandaID) # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger[ 'accessPoint'] # events if len(okJobs) > 0 and ( 'eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: # get plugin for submitter submitterCore = self.pluginFactory.get_plugin( queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found'.format( jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin( queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found'.format( jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs( workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}'. format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}'. format(workSpec.workerID, tmpStat)) # submit tmpLog.debug('submitting {0} workers'.format( len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers( submitterCore, workSpecList) for iWorker, (tmpRet, tmpStr) in enumerate( zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission tmpLog.error( 'failed to submit a workerID={0} with {1}'. 
                                    format(workSpec.workerID, tmpStr))
                                workSpec.set_status(WorkSpec.ST_missed)
                                jobList = []
                            elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                # directly go to running after feeding jobs for late binding
                                workSpec.set_status(WorkSpec.ST_running)
                            else:
                                # normal successful submission
                                workSpec.set_status(WorkSpec.ST_submitted)
                            workSpec.submitTime = timeNow
                            workSpec.modificationTime = timeNow
                            # prefetch events
                            if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                eventsRequestParams = dict()
                                for jobSpec in jobList:
                                    eventsRequestParams[jobSpec.PandaID] = {'pandaID': jobSpec.PandaID,
                                                                            'taskID': jobSpec.taskID,
                                                                            'jobsetID': jobSpec.jobParams['jobsetID'],
                                                                            'nRanges': jobSpec.jobParams['coreCount'],
                                                                            }
                                workSpec.eventsRequestParams = eventsRequestParams
                            # register worker
                            tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                            if jobList is not None:
                                for jobSpec in jobList:
                                    pandaIDs.add(jobSpec.PandaID)
                                    if tmpStat:
                                        tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                        tmpLog.info(tmpStr.format(workSpec.workerID, jobSpec.PandaID,
                                                                  workSpec.batchID))
                                    else:
                                        tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                        tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID))
                    # release jobs
                    self.dbProxy.release_jobs(pandaIDs, lockedBy)
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.submitter.sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status == WorkSpec.ST_ready:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
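

# --- Illustrative sketch (not part of the harvester code above) ---
# submit_workers() above pre-approves workers that are already in the ready state and hands
# only the remaining ones to the submitter plugin, then reassembles aligned result lists.
# A hedged, standalone sketch of the same pattern; Worker and DummySubmitterCore are
# illustrative placeholders, not harvester classes.
class Worker:
    ST_ready = 'ready'

    def __init__(self, worker_id, status):
        self.workerID = worker_id
        self.status = status


class DummySubmitterCore:
    def submit_workers(self, workspec_list):
        # pretend every real submission succeeds
        return [(True, '') for _ in workspec_list]


def submit_workers_skipping_ready(submitter_core, workspec_list):
    retList, strList, newSpecList, workersToSubmit = [], [], [], []
    for workSpec in workspec_list:
        if workSpec.status == Worker.ST_ready:
            # ready workers bypass the plugin but count as successful
            newSpecList.append(workSpec)
            retList.append(True)
            strList.append('')
        else:
            workersToSubmit.append(workSpec)
    for tmpRet, tmpStr in submitter_core.submit_workers(workersToSubmit):
        retList.append(tmpRet)
        strList.append(tmpStr)
    newSpecList += workersToSubmit
    return newSpecList, retList, strList


# usage: submit_workers_skipping_ready(DummySubmitterCore(), [Worker(1, 'ready'), Worker(2, 'new')])
# returns the ready worker first, then the freshly submitted one, with aligned (True, '') results.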
class Preparator(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.communicator = communicator self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'preparator-{0}'.format(self.ident) while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('try to get jobs to check') # get jobs to check preparation jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing', harvester_config.preparator.maxJobsToCheck, 'preparatorTime', 'lockedBy', harvester_config.preparator.checkInterval, harvester_config.preparator.lockInterval, lockedBy) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('start checking') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, jobSpec.configID) oldSubStatus = jobSpec.subStatus # get plugin preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue tmpStat, tmpStr = preparatorCore.check_status(jobSpec) # still running if tmpStat is None: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr)) continue # succeeded if tmpStat is True: # resolve path tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec) if tmpStat is False: jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr)) continue # update job jobSpec.subStatus = 'prepared' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.set_all_input_ready() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}, update_in_file=True) tmpLog.debug('succeeded') else: # update job jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.error('failed with {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger preparation mainLog.debug('try to get jobs to prepare') jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched', harvester_config.preparator.maxJobsToTrigger, 'preparatorTime', 'lockedBy', harvester_config.preparator.triggerInterval, harvester_config.preparator.lockInterval, 
lockedBy, 'preparing') mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() for jobSpec in jobsToTrigger: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to trigger preparation') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) oldSubStatus = jobSpec.subStatus # get plugin preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) continue # lock job again lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # check file status if queueConfig.ddmEndpointIn not in fileStatMap: fileStatMap[queueConfig.ddmEndpointIn] = dict() newFileStatusData = [] toWait = False for fileSpec in jobSpec.inFiles: if fileSpec.status == 'preparing': updateStatus = False if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ = self.dbProxy.get_file_status(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn, 'starting') if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' # set group info if any groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn) if groupInfo is not None: fileSpec.groupID = groupInfo['groupID'] fileSpec.groupStatus = groupInfo['groupStatus'] fileSpec.groupUpdateTime = groupInfo['groupUpdateTime'] updateStatus = True elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is being prepared by another toWait = True else: # change file status if the file is not prepared by another fileSpec.status = 'to_prepare' updateStatus = True # set new status if updateStatus: newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status)) if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0 fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1 if len(newFileStatusData) > 0: self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy) # wait since files are being prepared by another if toWait: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}) tmpLog.debug('wait since files are being prepared by another job') continue # trigger preparation tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec) # check result if tmpStat is True: # succeeded jobSpec.subStatus = 'preparing' jobSpec.lockedBy = None jobSpec.preparatorTime = None self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': oldSubStatus}, update_in_file=True) tmpLog.debug('triggered') elif tmpStat is False: # fatal error jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) 
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEINFAILED, errStr)
                        jobSpec.trigger_propagation()
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('failed to trigger with {0}'.format(tmpStr))
                    else:
                        # temporary error
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('try to prepare later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.preparator.sleepTime):
                mainLog.debug('terminated')
                return
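

# --- Illustrative sketch (not part of the harvester code above) ---
# Both the check and trigger paths in Preparator interpret the plugin return value as a
# tri-state: True = success, False = fatal error, None = still in progress (check later).
# A small hedged sketch of that convention; the action names are illustrative only.
def next_action_for_plugin_result(tmp_stat):
    """Map a preparator plugin status onto the agent's follow-up action."""
    if tmp_stat is None:
        return 'retry_later'    # keep subStatus, release the lock, check again next cycle
    if tmp_stat is True:
        return 'mark_prepared'  # resolve input paths and set subStatus='prepared'
    return 'fail_job'           # status='failed', subStatus='failed_to_prepare', propagate


assert next_action_for_plugin_result(None) == 'retry_later'
assert next_action_for_plugin_result(True) == 'mark_prepared'
assert next_action_for_plugin_result(False) == 'fail_job'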
class Preparator(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.communicator = communicator self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'preparator-{0}'.format(self.get_pid()) while True: sw = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('try to get jobs to check') # get jobs to check preparation try: maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToCheck if maxFilesPerJob <= 0: maxFilesPerJob = None except Exception: maxFilesPerJob = None jobsToCheck = self.dbProxy.get_jobs_in_sub_status( 'preparing', harvester_config.preparator.maxJobsToCheck, 'preparatorTime', 'lockedBy', harvester_config.preparator.checkInterval, harvester_config.preparator.lockInterval, lockedBy, max_files_per_job=maxFilesPerJob, ng_file_status_list=['ready']) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format( jobSpec.PandaID), method_name='run') try: tmpLog.debug('start checking') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue( jobSpec.computingSite, configID): tmpLog.error( 'queue config for {0}/{1} not found'.format( jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue( jobSpec.computingSite, jobSpec.configID) oldSubStatus = jobSpec.subStatus # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_allTriggered]: preparatorCore = self.pluginFactory.get_plugin( queueConfig.preparator) else: preparatorCore = self.pluginFactory.get_plugin( queueConfig.aux_preparator) if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format( jobSpec.computingSite)) continue tmpLog.debug("plugin={0}".format( preparatorCore.__class__.__name__)) # lock job again lockedAgain = self.dbProxy.lock_job_again( jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue tmpStat, tmpStr = preparatorCore.check_stage_in_status( jobSpec) # still running if tmpStat is None: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.debug( 'try to check later since still preparing with {0}' .format(tmpStr)) continue # succeeded if tmpStat is True: # resolve path tmpStat, tmpStr = preparatorCore.resolve_input_paths( jobSpec) if tmpStat is False: jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.error( 'failed to resolve input file paths : {0}'. 
format(tmpStr)) continue # manipulate container-related job params jobSpec.manipulate_job_params_for_container() # update job jobSpec.lockedBy = None jobSpec.set_all_input_ready() if (maxFilesPerJob is None and jobSpec.auxInput is None) or \ (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady]): # all done allDone = True jobSpec.subStatus = 'prepared' jobSpec.preparatorTime = None if jobSpec.auxInput is not None: jobSpec.auxInput = JobSpec.AUX_allReady else: # immediate next lookup since there could be more files to check allDone = False jobSpec.trigger_preparation() # change auxInput flag to check auxiliary inputs if len( jobSpec.inFiles ) == 0 and jobSpec.auxInput == JobSpec.AUX_allTriggered: jobSpec.auxInput = JobSpec.AUX_inReady self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }, update_in_file=True) if allDone: tmpLog.debug('succeeded') else: tmpLog.debug('partially succeeded') else: # update job jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.error('failed with {0}'.format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger preparation mainLog.debug('try to get jobs to prepare') try: maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToPrepare if maxFilesPerJob <= 0: maxFilesPerJob = None except Exception: maxFilesPerJob = None jobsToTrigger = self.dbProxy.get_jobs_in_sub_status( 'fetched', harvester_config.preparator.maxJobsToTrigger, 'preparatorTime', 'lockedBy', harvester_config.preparator.triggerInterval, harvester_config.preparator.lockInterval, lockedBy, 'preparing', max_files_per_job=maxFilesPerJob, ng_file_status_list=['triggered', 'ready']) mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() for jobSpec in jobsToTrigger: tmpLog = self.make_logger(_logger, 'PandaID={0}'.format( jobSpec.PandaID), method_name='run') try: tmpLog.debug('try to trigger preparation') # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue( jobSpec.computingSite, configID): tmpLog.error( 'queue config for {0}/{1} not found'.format( jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue( jobSpec.computingSite, configID) oldSubStatus = jobSpec.subStatus # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]: preparatorCore = self.pluginFactory.get_plugin( queueConfig.preparator) fileType = 'input' else: preparatorCore = self.pluginFactory.get_plugin( queueConfig.aux_preparator) fileType = FileSpec.AUX_INPUT if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format( jobSpec.computingSite)) continue tmpLog.debug("plugin={0}".format( preparatorCore.__class__.__name__)) # lock job again lockedAgain = self.dbProxy.lock_job_again( jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) if not lockedAgain: tmpLog.debug('skip since locked by another thread') continue # check file status if queueConfig.ddmEndpointIn not in fileStatMap: fileStatMap[queueConfig.ddmEndpointIn] = dict() # check if has to_prepare hasToPrepare = False for 
fileSpec in jobSpec.inFiles: if fileSpec.status == 'to_prepare': hasToPrepare = True break newFileStatusData = [] toWait = False newInFiles = [] for fileSpec in jobSpec.inFiles: if fileSpec.status in ['preparing', 'to_prepare']: newInFiles.append(fileSpec) updateStatus = False if fileSpec.lfn not in fileStatMap[ queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn, 'starting') if 'ready' in fileStatMap[ queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' if fileStatMap[queueConfig.ddmEndpointIn][ fileSpec.lfn]['ready']['path']: fileSpec.path = list( fileStatMap[queueConfig.ddmEndpointIn][ fileSpec.lfn]['ready']['path'])[0] # set group info if any groupInfo = self.dbProxy.get_group_for_file( fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) if groupInfo is not None: fileSpec.groupID = groupInfo['groupID'] fileSpec.groupStatus = groupInfo[ 'groupStatus'] fileSpec.groupUpdateTime = groupInfo[ 'groupUpdateTime'] updateStatus = True elif (not hasToPrepare and 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \ 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is being prepared by another toWait = True if fileSpec.status != 'preparing': fileSpec.status = 'preparing' updateStatus = True else: # change file status if the file is not prepared by another if fileSpec.status != 'to_prepare': fileSpec.status = 'to_prepare' updateStatus = True # set new status if updateStatus: newFileStatusData.append( (fileSpec.fileID, fileSpec.lfn, fileSpec.status)) fileStatMap[queueConfig.ddmEndpointIn][ fileSpec.lfn].setdefault( fileSpec.status, None) if len(newFileStatusData) > 0: self.dbProxy.change_file_status( jobSpec.PandaID, newFileStatusData, lockedBy) # wait since files are being prepared by another if toWait: # update job jobSpec.lockedBy = None self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }) tmpLog.debug( 'wait since files are being prepared by another job' ) continue # trigger preparation tmpStat, tmpStr = preparatorCore.trigger_preparation( jobSpec) # check result if tmpStat is True: # succeeded jobSpec.lockedBy = None if (maxFilesPerJob is None and jobSpec.auxInput is None) or \ (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered]): # all done allDone = True jobSpec.subStatus = 'preparing' jobSpec.preparatorTime = None if jobSpec.auxInput is not None: jobSpec.auxInput = JobSpec.AUX_allTriggered else: # change file status but not change job sub status since # there could be more files to prepare allDone = False for fileSpec in jobSpec.inFiles: if fileSpec.status == 'to_prepare': fileSpec.status = 'triggered' # immediate next lookup jobSpec.trigger_preparation() # change auxInput flag to prepare auxiliary inputs if len( jobSpec.inFiles ) == 0 and jobSpec.auxInput == JobSpec.AUX_hasAuxInput: jobSpec.auxInput = JobSpec.AUX_inTriggered self.dbProxy.update_job(jobSpec, { 'lockedBy': lockedBy, 'subStatus': oldSubStatus }, update_in_file=True) if allDone: tmpLog.debug('triggered') else: tmpLog.debug('partially triggered') elif tmpStat is False: # fatal error jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_prepare' jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() errStr = 'stage-in failed with {0}'.format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr) 
                        jobSpec.trigger_propagation()
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('failed to trigger with {0}'.format(tmpStr))
                    else:
                        # temporary error
                        jobSpec.lockedBy = None
                        self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                          'subStatus': oldSubStatus})
                        tmpLog.debug('try to prepare later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.preparator.sleepTime):
                mainLog.debug('terminated')
                return
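

# --- Illustrative sketch (not part of the harvester code above) ---
# In the trigger path, fileStatMap caches per-endpoint file status lookups so that a file
# that is already staged, or is being staged by another job, is not triggered twice.
# A simplified hedged sketch of that per-file decision; db_status is a plain dict standing
# in for the DBProxy.get_file_status() result.
def decide_file_action(db_status, job_already_triggering):
    """Return 'ready', 'wait' or 'to_prepare' for one input file.

    db_status: mapping of file status -> info for this LFN at the input endpoint.
    job_already_triggering: True if this job already has files in 'to_prepare'.
    """
    if 'ready' in db_status:
        return 'ready'        # someone already staged the file
    if ('to_prepare' in db_status and not job_already_triggering) or 'triggered' in db_status:
        return 'wait'         # another job is staging it; wait instead of triggering again
    return 'to_prepare'       # this job should trigger the transfer


assert decide_file_action({'ready': 2}, False) == 'ready'
assert decide_file_action({'to_prepare': 1}, False) == 'wait'
assert decide_file_action({}, False) == 'to_prepare'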
class Propagator(AgentBase): # constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.dbProxy = DBProxy() self.communicator = communicator self.queueConfigMapper = queue_config_mapper self._last_stats_update = None # main loop def run(self): while True: sw = core_utils.get_stopwatch() mainLog = core_utils.make_logger(_logger, 'id={0}'.format(self.ident), method_name='run') mainLog.debug('getting jobs to propagate') jobSpecs = self.dbProxy.get_jobs_to_propagate( harvester_config.propagator.maxJobs, harvester_config.propagator.lockInterval, harvester_config.propagator.updateInterval, self.ident) mainLog.debug('got {0} jobs'.format(len(jobSpecs))) # update jobs in central database iJobs = 0 nJobs = harvester_config.propagator.nJobsInBulk hbSuppressMap = dict() while iJobs < len(jobSpecs): jobList = jobSpecs[iJobs:iJobs + nJobs] iJobs += nJobs # collect jobs to update or check jobListToSkip = [] jobListToUpdate = [] jobListToCheck = [] retList = [] for tmpJobSpec in jobList: if tmpJobSpec.computingSite not in hbSuppressMap: queueConfig = self.queueConfigMapper.get_queue( tmpJobSpec.computingSite) hbSuppressMap[ tmpJobSpec. computingSite] = queueConfig.get_no_heartbeat_status( ) # heartbeat is suppressed if tmpJobSpec.status in hbSuppressMap[ tmpJobSpec.computingSite]: # check running job to detect lost heartbeat if tmpJobSpec.status == 'running': jobListToCheck.append(tmpJobSpec) else: jobListToSkip.append(tmpJobSpec) retList.append({'StatusCode': 0, 'command': None}) else: jobListToUpdate.append(tmpJobSpec) retList += self.communicator.check_jobs(jobListToCheck) retList += self.communicator.update_jobs(jobListToUpdate) # logging for tmpJobSpec, tmpRet in zip( jobListToSkip + jobListToCheck + jobListToUpdate, retList): if tmpRet['StatusCode'] == 0: if tmpJobSpec in jobListToUpdate: mainLog.debug( 'updated PandaID={0} status={1}'.format( tmpJobSpec.PandaID, tmpJobSpec.status)) else: mainLog.debug( 'skip updating PandaID={0} status={1}'.format( tmpJobSpec.PandaID, tmpJobSpec.status)) # release job tmpJobSpec.propagatorLock = None if tmpJobSpec.is_final_status( ) and tmpJobSpec.status == tmpJobSpec.get_status(): # unset to disable further updating tmpJobSpec.propagatorTime = None tmpJobSpec.subStatus = 'done' else: # check event availability if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \ tmpJobSpec.subStatus != 'submitted': tmpEvStat, tmpEvRet = self.communicator.check_event_availability( tmpJobSpec) if tmpEvStat and tmpEvRet == 0: mainLog.debug( 'kill PandaID={0} due to no event'. 
format(tmpJobSpec.PandaID)) tmpRet['command'] = 'tobekilled' # got kill command if 'command' in tmpRet and tmpRet['command'] in [ 'tobekilled' ]: nWorkers = self.dbProxy.kill_workers_with_job( tmpJobSpec.PandaID) if nWorkers == 0: # no remaining workers tmpJobSpec.status = 'cancelled' tmpJobSpec.subStatus = 'killed' tmpJobSpec.stateChangeTime = datetime.datetime.utcnow( ) tmpJobSpec.trigger_propagation() self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.ident}) else: mainLog.error( 'failed to update PandaID={0} status={1}'.format( tmpJobSpec.PandaID, tmpJobSpec.status)) mainLog.debug('getting workers to propagate') workSpecs = self.dbProxy.get_workers_to_propagate( harvester_config.propagator.maxWorkers, harvester_config.propagator.updateInterval) mainLog.debug('got {0} workers'.format(len(workSpecs))) # update workers in central database iWorkers = 0 nWorkers = harvester_config.propagator.nWorkersInBulk while iWorkers < len(workSpecs): workList = workSpecs[iWorkers:iWorkers + nJobs] iWorkers += nWorkers retList, tmpErrStr = self.communicator.update_workers(workList) # logging if retList is None: mainLog.error( 'failed to update workers with {0}'.format(tmpErrStr)) else: for tmpWorkSpec, tmpRet in zip(workList, retList): if tmpRet: mainLog.debug( 'updated workerID={0} status={1}'.format( tmpWorkSpec.workerID, tmpWorkSpec.status)) # update logs for logFilePath, logOffset, logSize, logRemoteName in \ tmpWorkSpec.get_log_files_to_upload(): with open(logFilePath, 'rb') as logFileObj: tmpStat, tmpErr = self.communicator.upload_file( logRemoteName, logFileObj, logOffset, logSize) if tmpStat: tmpWorkSpec.update_log_files_to_upload( logFilePath, logOffset + logSize) # disable further update if tmpWorkSpec.is_final_status(): tmpWorkSpec.disable_propagation() self.dbProxy.update_worker( tmpWorkSpec, {'workerID': tmpWorkSpec.workerID}) else: mainLog.error( 'failed to update workerID={0} status={1}'. format(tmpWorkSpec.workerID, tmpWorkSpec.status)) mainLog.debug('getting commands') commandSpecs = self.dbProxy.get_commands_for_receiver('propagator') mainLog.debug('got {0} commands'.format(len(commandSpecs))) for commandSpec in commandSpecs: if commandSpec.command.startswith( CommandSpec.COM_reportWorkerStats): # get worker stats siteName = commandSpec.command.split(':')[-1] workerStats = self.dbProxy.get_worker_stats(siteName) if len(workerStats) == 0: mainLog.error( 'failed to get worker stats for {0}'.format( siteName)) else: # report worker stats tmpRet, tmpStr = self.communicator.update_worker_stats( siteName, workerStats) if tmpRet: mainLog.debug( 'updated worker stats (command) for {0}'. 
                                          format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(
                                siteName, tmpStr))
            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:
                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk()
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(
                                site_name, tmp_str))
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
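

# --- Illustrative sketch (not part of the harvester code above) ---
# The bulk worker-stats report is throttled so it runs at most once per STATS_PERIOD rather
# than on every cycle, and the timestamp is only advanced after a successful report.
# A minimal hedged sketch of that pattern; SKETCH_STATS_PERIOD's value and the report()
# callable are illustrative stand-ins for the module's own constant and communicator call.
import time

SKETCH_STATS_PERIOD = 300  # seconds; assumed value for this sketch only


class BulkStatsThrottle:
    def __init__(self):
        self._last_stats_update = None

    def maybe_report(self, report):
        """Call report() only if the period has elapsed; remember the time on success."""
        now = time.time()
        if self._last_stats_update and now - self._last_stats_update <= SKETCH_STATS_PERIOD:
            return False  # throttled
        ok = report()
        if ok:
            self._last_stats_update = now
        return ok


# usage: BulkStatsThrottle().maybe_report(lambda: True) -> True on the first call,
# then False until SKETCH_STATS_PERIOD seconds have elapsed.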
class Submitter(AgentBase): # fifos monitor_fifo = MonitorFIFO() # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.workerMaker = WorkerMaker() self.workerAdjuster = WorkerAdjuster(queue_config_mapper) self.pluginFactory = PluginFactory() # main loop def run(self): lockedBy = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo while True: mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit( harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval) submitted = False if siteName is not None: mainLog.debug('got {0} queues for site {1}'.format( len(curWorkers), siteName)) # get commands comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver( 'submitter', comStr) mainLog.debug('got {0} {1} commands'.format( commandSpecs, comStr)) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit( siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): # if available, overwrite new worker value with the command from panda server if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][tmpResource][ 'nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: n_workers_per_queue_and_rt = dict() else: n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers( curWorkers, siteName) if n_workers_per_queue_and_rt is None: mainLog.error( 'WorkerAdjuster failed to define the number of workers' ) elif len(n_workers_per_queue_and_rt) == 0: pass else: # loop over all queues and resource types for queueName in n_workers_per_queue_and_rt: for resource_type, tmpVal in iteritems( n_workers_per_queue_and_rt[queueName]): tmpLog = self.make_logger( _logger, 'id={0} queue={1} rtype={2}'.format( lockedBy, queueName, resource_type), method_name='run') try: tmpLog.debug('start') tmpLog.debug('workers status: %s' % tmpVal) nWorkers = tmpVal['nNewWorkers'] + tmpVal[ 'nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue( queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug( 'skipped since no new worker is needed based on current stats' ) continue # get queue queueConfig = self.queueConfigMapper.get_queue( queueName) workerMakerCore = self.workerMaker.get_plugin( queueConfig) # check if resource is ready if hasattr( workerMakerCore, 'dynamicSizing' ) and workerMakerCore.dynamicSizing is True: numReadyResources = self.workerMaker.num_ready_resources( queueConfig, resource_type, workerMakerCore) tmpLog.debug('numReadyResources: %s' % numReadyResources) if not numReadyResources: if hasattr(workerMakerCore, 'staticWorkers'): nQRWorkers = tmpVal[ 'nQueue'] + tmpVal['nRunning'] tmpLog.debug( 'staticWorkers: %s, nQRWorkers(Queue+Running): %s' % (workerMakerCore.staticWorkers, nQRWorkers)) if nQRWorkers >= workerMakerCore.staticWorkers: tmpLog.debug( 'No left static workers, skip' ) continue else: nWorkers = min( workerMakerCore. staticWorkers - nQRWorkers, nWorkers) tmpLog.debug( 'staticWorkers: %s, nWorkers: %s' % (workerMakerCore. 
staticWorkers, nWorkers)) else: tmpLog.debug( 'skip since no resources are ready' ) continue else: nWorkers = min(nWorkers, numReadyResources) # post action of worker maker if hasattr( workerMakerCore, 'skipOnFail' ) and workerMakerCore.skipOnFail is True: skipOnFail = True else: skipOnFail = False # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter. checkInterval, harvester_config. submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( queueConfig, nWorkers, resource_type, maker=workerMakerCore) tmpLog.debug('nJobsPerWorker={0}'.format( nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter. checkInterval, harvester_config. submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job( queueConfig, nWorkers, resource_type, maker=workerMakerCore) tmpLog.debug('nWorkersPerJob={0}'.format( nWorkersPerJob)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter. checkInterval, harvester_config. submitter.lockInterval, lockedBy) else: tmpLog.error('unknown mapType={0}'.format( queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format( len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers( jobChunks, queueConfig, nReady, resource_type, maker=workerMakerCore) if len(ngChunks) == 0: tmpLog.debug( 'successfully made {0} workers'.format( len(okChunks))) else: tmpLog.debug( 'made {0} workers, while {1} workers failed' .format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() pandaIDs = set() # NG (=not good) for ngJobs in ngChunks: for jobSpec in ngJobs: if skipOnFail: # release jobs when workers are not made pandaIDs.add(jobSpec.PandaID) else: jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_make' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None errStr = 'failed to make a worker' jobSpec.set_pilot_error( PilotErrors.ERR_SETUPFAILURE, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job( jobSpec, { 'lockedBy': lockedBy, 'subStatus': 'prepared' }) # OK workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [ None, 0 ]: workSpec.set_jobspec_list( okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list( okJobs[:workSpec. nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[ workSpec. 
nJobsToReFill:]: pandaIDs.add( jobSpec.PandaID) workSpec.set_num_jobs_with_list() # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger[ 'accessPoint'] # sync level workSpec.syncLevel = queueConfig.get_synchronization_level( ) # events if len(okJobs) > 0 and \ ('eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: # get plugin for submitter submitterCore = self.pluginFactory.get_plugin( queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found' .format(jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin( queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found' .format(jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs( workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}' .format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}' .format( workSpec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers( workSpecList, lockedBy) # submit tmpLog.info( 'submitting {0} workers'.format( len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers( submitterCore, workSpecList) for iWorker, (tmpRet, tmpStr) in enumerate( zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission errStr = 'failed to submit a workerID={0} with {1}'.format( workSpec.workerID, tmpStr) tmpLog.error(errStr) workSpec.set_status( WorkSpec.ST_missed) workSpec.set_dialog_message(tmpStr) workSpec.set_pilot_error( PilotErrors.ERR_SETUPFAILURE, errStr) if jobList is not None: # increment attempt number newJobList = [] for jobSpec in jobList: if jobSpec.submissionAttempts is None: jobSpec.submissionAttempts = 0 jobSpec.submissionAttempts += 1 # max attempt or permanent error if tmpRet is False or \ jobSpec.submissionAttempts >= \ queueConfig.maxSubmissionAttempts: newJobList.append( jobSpec) else: self.dbProxy.increment_submission_attempt( jobSpec.PandaID, jobSpec. 
submissionAttempts) jobList = newJobList elif queueConfig.useJobLateBinding and workSpec.hasJob == 1: # directly go to running after feeding jobs for late biding workSpec.set_status( WorkSpec.ST_running) else: # normal successful submission workSpec.set_status( WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow workSpec.checkTime = timeNow if self.monitor_fifo.enabled: workSpec.set_work_params({ 'lastCheckAt': timeNow_timestamp }) # prefetch events if tmpRet and workSpec.hasJob == 1 and \ workSpec.eventsRequest == WorkSpec.EV_useEvents and \ queueConfig.prefetchEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[jobSpec.PandaID] = \ {'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec.jobParams['jobsetID'], 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), jobSpec.jobParams['coreCount']), } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker( workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: if tmpRet: tmpStr = \ 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info( tmpStr.format( workSpec. workerID, jobSpec. PandaID, workSpec. batchID)) else: tmpStr = 'failed to submit a workerID={0} for PandaID={1}' tmpLog.error( tmpStr.format( workSpec. workerID, jobSpec.PandaID )) else: tmpStr = \ 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error( tmpStr.format( jobSpec.PandaID, workSpec.batchID)) # enqueue to monitor fifo if self.monitor_fifo.enabled \ and queueConfig.mapType != WorkSpec.MT_MultiWorkers: workSpecsToEnqueue = \ [[w] for w in workSpecList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] monitor_fifo.put( (queueName, workSpecsToEnqueue)) mainLog.debug( 'put workers to monitor FIFO') submitted = True # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) tmpLog.info('done') except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done') # define sleep interval if siteName is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow( ) + datetime.timedelta(seconds=interval) self.dbProxy.update_panda_queue_attribute( 'submitTime', newTime, site_name=siteName) # check if being terminated if self.terminated(sleepTime): mainLog.debug('terminated') return # wrapper for submitWorkers to skip ready workers def submit_workers(self, submitter_core, workspec_list): retList = [] strList = [] newSpecList = [] workersToSubmit = [] for workSpec in workspec_list: if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: newSpecList.append(workSpec) retList.append(True) strList.append('') else: workersToSubmit.append(workSpec) tmpRetList = submitter_core.submit_workers(workersToSubmit) for tmpRet, tmpStr in tmpRetList: retList.append(tmpRet) strList.append(tmpStr) newSpecList += workersToSubmit return newSpecList, retList, strList
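

# --- Illustrative sketch (not part of the harvester code above) ---
# After submission, the worker status depends on the submission result and on whether jobs
# were already fed for late binding: failed submissions become 'missed', late-binding
# workers with jobs go straight to 'running', and everything else becomes 'submitted'.
# A compact hedged sketch of that decision; plain strings stand in for the WorkSpec.ST_*
# constants.
def post_submission_status(submission_ok, use_job_late_binding, has_job):
    if not submission_ok:
        return 'missed'
    if use_job_late_binding and has_job:
        return 'running'
    return 'submitted'


assert post_submission_status(False, True, 1) == 'missed'
assert post_submission_status(True, True, 1) == 'running'
assert post_submission_status(True, False, 1) == 'submitted'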
class Submitter(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.workerMaker = WorkerMaker() self.workerAdjuster = WorkerAdjuster(queue_config_mapper) self.pluginFactory = PluginFactory() self.monitor_fifo = MonitorFIFO() self.apfmon = Apfmon(self.queueConfigMapper) # main loop def run(self): lockedBy = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval', harvester_config.submitter.lockInterval) while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval, lockedBy, queueLockInterval) submitted = False if siteName is not None: mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName)) # get commands comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr) mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr)) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): # if available, overwrite new worker value with the command from panda server if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: n_workers_per_queue_and_rt = dict() else: n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName) if n_workers_per_queue_and_rt is None: mainLog.error('WorkerAdjuster failed to define the number of workers') elif len(n_workers_per_queue_and_rt) == 0: pass else: # loop over all queues and resource types for queueName in n_workers_per_queue_and_rt: for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]): tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy, queueName, resource_type), method_name='run') try: tmpLog.debug('start') tmpLog.debug('workers status: %s' % tmpVal) nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue(queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug('skipped since no new worker is needed based on current stats') continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) workerMakerCore = self.workerMaker.get_plugin(queueConfig) # check if resource is ready if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: numReadyResources = self.workerMaker.num_ready_resources(queueConfig, resource_type, workerMakerCore) tmpLog.debug('numReadyResources: %s' % numReadyResources) if not numReadyResources: if hasattr(workerMakerCore, 'staticWorkers'): nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning'] tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % (workerMakerCore.staticWorkers, nQRWorkers)) if nQRWorkers >= workerMakerCore.staticWorkers: tmpLog.debug('No 
left static workers, skip') continue else: nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) tmpLog.debug('staticWorkers: %s, nWorkers: %s' % (workerMakerCore.staticWorkers, nWorkers)) else: tmpLog.debug('skip since no resources are ready') continue else: nWorkers = min(nWorkers, numReadyResources) # post action of worker maker if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: skipOnFail = True else: skipOnFail = False # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig, nWorkers, resource_type, maker=workerMakerCore) tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig, nWorkers, resource_type, maker=workerMakerCore) maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( queueConfig, resource_type, maker=workerMakerCore) maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( queueConfig, resource_type, maker=workerMakerCore) tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, max_workers_per_job_in_total=maxWorkersPerJob, max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) else: tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig, nReady, resource_type, maker=workerMakerCore) if len(ngChunks) == 0: tmpLog.debug('successfully made {0} workers'.format(len(okChunks))) else: tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() pandaIDs = set() # NG (=not good) for ngJobs in ngChunks: for jobSpec in ngJobs: if skipOnFail: # release jobs when workers are not made pandaIDs.add(jobSpec.PandaID) else: jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_make' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None errStr = 'failed to make a worker' jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, 'subStatus': 'prepared'}) # OK workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == 
WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [None, 0]: workSpec.set_jobspec_list(okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[workSpec.nJobsToReFill:]: pandaIDs.add(jobSpec.PandaID) workSpec.set_num_jobs_with_list() # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger['accessPoint'] # sync level workSpec.syncLevel = queueConfig.get_synchronization_level() # events if len(okJobs) > 0 and \ ('eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: sw = core_utils.get_stopwatch() # get plugin for submitter submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found'.format(jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin(queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found'.format(jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}'.format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}'.format(workSpec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers(workSpecList, lockedBy) # submit sw.reset() tmpLog.info('submitting {0} workers'.format(len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, workSpecList) tmpLog.debug('done submitting {0} workers'.format(len(workSpecList)) + sw.get_elapsed_time()) # collect successful jobs okPandaIDs = set() for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): if tmpRet: workSpec, jobList = okChunks[iWorker] jobList = workSpec.get_jobspec_list() if jobList is not None: for jobSpec in jobList: okPandaIDs.add(jobSpec.PandaID) # loop over all workers for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # set harvesterHost workSpec.harvesterHost = socket.gethostname() # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission errStr = 'failed to submit a workerID={0} with {1}'.format( workSpec.workerID, tmpStr) tmpLog.error(errStr) workSpec.set_status(WorkSpec.ST_missed) workSpec.set_dialog_message(tmpStr) workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) if jobList is not None: # increment attempt number newJobList = [] for jobSpec in jobList: # skip if successful with another worker if jobSpec.PandaID in okPandaIDs: continue if jobSpec.submissionAttempts is None: jobSpec.submissionAttempts = 0 jobSpec.submissionAttempts += 1 # max attempt or permanent error if tmpRet is False or \ jobSpec.submissionAttempts >= \ queueConfig.maxSubmissionAttempts: newJobList.append(jobSpec) else: self.dbProxy.increment_submission_attempt( jobSpec.PandaID, jobSpec.submissionAttempts) jobList = newJobList elif queueConfig.useJobLateBinding and 
workSpec.hasJob == 1: # directly go to running after feeding jobs for late binding workSpec.set_status(WorkSpec.ST_running) else: # normal successful submission workSpec.set_status(WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow workSpec.checkTime = timeNow if self.monitor_fifo.enabled: workSpec.set_work_params({'lastCheckAt': timeNow_timestamp}) # prefetch events if tmpRet and workSpec.hasJob == 1 and \ workSpec.eventsRequest == WorkSpec.EV_useEvents and \ queueConfig.prefetchEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[jobSpec.PandaID] = \ {'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec.jobParams['jobsetID'], 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), jobSpec.jobParams['coreCount']), } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: if tmpRet: tmpStr = \ 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info(tmpStr.format(workSpec.workerID, jobSpec.PandaID, workSpec.batchID)) else: tmpStr = 'failed to submit a workerID={0} for PandaID={1}' tmpLog.error(tmpStr.format(workSpec.workerID, jobSpec.PandaID)) else: tmpStr = \ 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID)) # enqueue to monitor fifo if self.monitor_fifo.enabled \ and queueConfig.mapType != WorkSpec.MT_MultiWorkers: workSpecsToEnqueue = \ [[w] for w in workSpecList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] check_delay = min( getattr(harvester_config.monitor, 'eventBasedCheckInterval', harvester_config.monitor.checkInterval), getattr(harvester_config.monitor, 'fifoCheckInterval', harvester_config.monitor.checkInterval)) monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay) mainLog.debug('put workers to monitor FIFO') submitted = True # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) tmpLog.info('done') except Exception: core_utils.dump_error_message(tmpLog) # release the site self.dbProxy.release_site(siteName, lockedBy) if sw_main.get_elapsed_time_in_sec() > queueLockInterval: mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval) + sw_main.get_elapsed_time()) mainLog.debug('done') # define sleep interval if siteName is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval) self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName) # time the cycle mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): mainLog.debug('terminated') return # wrapper for submitWorkers to skip ready workers def submit_workers(self, submitter_core, workspec_list): retList = [] strList = [] newSpecList = [] workersToSubmit = [] for workSpec in workspec_list: if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: newSpecList.append(workSpec) retList.append(True) strList.append('') else: workersToSubmit.append(workSpec) tmpRetList =
submitter_core.submit_workers(workersToSubmit) # register the newly submitted workers with APF monitoring (apfmon) self.apfmon.create_workers(workersToSubmit) for tmpRet, tmpStr in tmpRetList: retList.append(tmpRet) strList.append(tmpStr) newSpecList += workersToSubmit return newSpecList, retList, strList
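# --- illustrative usage sketch (not part of harvester) ------------------------
# submit_workers() above keeps its three return lists index-aligned: workers
# already in ready/running status are answered first with (True, ''), and the
# plugin results for the remaining workers are appended afterwards, in the same
# order in which those workers are appended to newSpecList. The stub below
# (DummyCore and demo_split_and_submit are hypothetical names, with dicts
# standing in for WorkSpec objects) makes that pairing explicit.
class DummyCore(object):
    def submit_workers(self, workspec_list):
        # pretend every real submission succeeds
        return [(True, 'submitted by stub') for _ in workspec_list]

def demo_split_and_submit(core, workers, skip_statuses=('ready', 'running')):
    """Mirror the skip/submit split done by Submitter.submit_workers()."""
    new_list, ret_list, str_list, to_submit = [], [], [], []
    for w in workers:
        if w['status'] in skip_statuses:
            # already ready/running: skip submission, report success
            new_list.append(w)
            ret_list.append(True)
            str_list.append('')
        else:
            to_submit.append(w)
    for ok, msg in core.submit_workers(to_submit):
        ret_list.append(ok)
        str_list.append(msg)
    new_list += to_submit
    return new_list, ret_list, str_list

specs, rets, msgs = demo_split_and_submit(DummyCore(),
                                           [{'id': 1, 'status': 'ready'},
                                            {'id': 2, 'status': 'new'}])
assert [w['id'] for w in specs] == [1, 2] and rets == [True, True]
# -----------------------------------------------------------------------------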
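# --- illustrative sketch (not part of harvester) -----------------------------
# When prefetching events in Submitter.run() above, each job of a shared worker
# requests at least its own coreCount event ranges and at least an even share
# of the worker's cores: nRanges = max(ceil(nCore / nJobs), coreCount). Below
# is a self-contained version of that sizing rule; n_core, n_jobs and
# core_count are hypothetical stand-ins for workSpec.nCore, len(jobList) and
# jobSpec.jobParams['coreCount'].
import math

def demo_n_ranges(n_core, n_jobs, core_count):
    """Number of event ranges to request for one job of a shared worker."""
    return max(int(math.ceil(float(n_core) / n_jobs)), core_count)

# e.g. a 96-core worker shared by 4 jobs, each with coreCount=8, requests
# max(ceil(96/4), 8) = 24 ranges per job
assert demo_n_ranges(96, 4, 8) == 24
# -----------------------------------------------------------------------------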