def kill_workers(arguments):
    """Ask the DB to flag workers matching the given filters for the Sweeper to kill.

    Each filter (status, sites, ces, submissionhosts) is a list; a single 'ALL'
    entry means no filtering on that attribute.
    """
    def _normalize(values):
        # collapse a lone 'ALL' entry into the literal 'ALL' wildcard
        return 'ALL' if len(values) == 1 and values[0] == 'ALL' else values

    status_in = _normalize(arguments.status)
    computingSite_in = _normalize(arguments.sites)
    computingElement_in = _normalize(arguments.ces)
    submissionHost_in = _normalize(arguments.submissionhosts)
    dbProxy = DBProxy()
    retVal = dbProxy.kill_workers_by_query({'status': status_in,
                                            'computingSite': computingSite_in,
                                            'computingElement': computingElement_in,
                                            'submissionHost': submissionHost_in})
    if retVal is not None:
        msg_temp = ('Sweeper will soon kill {n_workers} workers, with '
                    'status in {status_in}, '
                    'computingSite in {computingSite_in}, '
                    'computingElement in {computingElement_in}, '
                    'submissionHost in {submissionHost_in}')
        print(msg_temp.format(n_workers=retVal,
                              status_in=status_in,
                              computingSite_in=computingSite_in,
                              computingElement_in=computingElement_in,
                              submissionHost_in=submissionHost_in))
    else:
        mainLogger.critical('Failed to kill workers. See panda-db_proxy.log')
def qconf_purge(arguments):
    """Purge the given panda queue from the harvester DB."""
    queueName = arguments.queue
    dbProxy = DBProxy()
    # purge_pq returns truthy on success
    if dbProxy.purge_pq(queueName):
        print('Purged {0} from harvester DB'.format(queueName))
    else:
        mainLogger.critical('Failed to purge {0} . See panda-db_proxy.log'.format(queueName))
class EventFeeder(AgentBase):
    """Agent that fetches event ranges from the communicator and feeds them to workers."""

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Loop forever: feed events to workers that requested them, until terminated."""
        # unique lock owner string for this agent instance
        lockedBy = 'eventfeeder-{0}'.format(self.ident)
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            # workers are returned grouped per queue, locked for lockInterval seconds
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # get plugin (messenger delivers the events to the worker's access point)
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # get events from the panda server via the communicator
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams)
                    # failed: on failure `events` carries the error message
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # update worker: request satisfied, clear the pending-request state
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec)
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated (sleeps up to sleepTime between cycles)
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
    """Initialize the master: record run-mode flags, then set up DB and config.

    Project imports are deferred to function scope (kept in their original
    order) to avoid import cycles at module load time.
    """
    # run-mode flags
    self.singleMode = single_mode
    self.stopEvent = stop_event
    self.daemonMode = daemon_mode
    from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
    self.communicatorPool = CommunicatorPool()
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    self.queueConfigMapper = QueueConfigMapper()
    from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
    # create database tables for the configured queues
    db_proxy = DBProxy()
    db_proxy.make_tables(self.queueConfigMapper)
def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
    """Set up the harvester master.

    Stores the run-mode flags and initializes the communicator pool, the
    queue configuration mapper, and the database tables. Imports stay local
    (and in their original order) to sidestep circular imports.
    """
    self.singleMode = single_mode
    self.stopEvent = stop_event
    self.daemonMode = daemon_mode
    from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
    self.communicatorPool = CommunicatorPool()
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    self.queueConfigMapper = QueueConfigMapper()
    from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
    # ensure DB schema exists for all configured queues
    proxy = DBProxy()
    proxy.make_tables(self.queueConfigMapper)
def __init__(self, queue_config_mapper, single_mode=False):
    """Constructor: wire up DB access, worker maker/adjuster, and plugin factory."""
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    # helpers that create workers and decide how many to submit
    self.workerMaker = WorkerMaker()
    self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
    self.pluginFactory = PluginFactory()
def HarvesterReport(self):
    """Log a summary of harvester worker counts (submitted/running) per site-resource.

    Best-effort: any failure (harvester not installed, missing env vars, DB
    errors) is swallowed so reporting never breaks the caller.
    """
    try:
        from distutils.sysconfig import get_python_lib  # pylint: disable=import-error
        sys.path.append(get_python_lib() + '/pandacommon')
        os.environ['PANDA_HOME'] = os.environ['VIRTUAL_ENV']
        from collections import defaultdict  # pylint: disable=import-error
        from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy  # pylint: disable=import-error
        self.dbProxy = DBProxy()
        workers = self.dbProxy.get_worker_stats_bulk(None)
        # rep: per "site-resource" key, prodsourcelabel -> {state: count}
        rep = defaultdict(dict)
        # rtot: total worker count per state over all sites
        rtot = defaultdict(int)
        for site, prodsourcelabels in workers.items():
            for prodsourcelabel, resources in prodsourcelabels.items():
                for resource, jobs in resources.items():
                    rep[f'{site}-{resource}'][prodsourcelabel or 'empty'] = jobs
                    for state, count in jobs.items():
                        rtot[state] += count
        self.log(f"All Harvester jobs: {sum(rtot.values())} prodSourceLabel: submitted/running")
        for k in sorted(rep.keys()):
            log = f"{k:>28.28}:"
            for psl, jobs in rep[k].items():
                # use .get so a missing state no longer raises KeyError and
                # silently aborts the whole report via the except below
                log += f"{psl:>10}: {jobs.get('submitted', 0)}/{jobs.get('running', 0)}"
            self.log(log)
        log = f"{'Totals':>28}: submitted: {rtot['submitted']} running: {rtot['running']}"
        self.log(log + '\n\n')
    except Exception:
        # narrowed from a bare except: never mask SystemExit/KeyboardInterrupt
        pass
def __init__(self, single_mode=False):
    """Constructor: build one credential-manager plugin core per configured module."""
    AgentBase.__init__(self, single_mode)
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    cred_cfg = harvester_config.credmanager
    # module and class names of the plugins
    module_names = self.get_list(cred_cfg.moduleName)
    class_names = self.get_list(cred_cfg.className)
    # file names of original certificates ('certFile' is the legacy attribute)
    if hasattr(cred_cfg, 'inCertFile'):
        in_cert_files = self.get_list(cred_cfg.inCertFile)
    else:
        in_cert_files = self.get_list(cred_cfg.certFile)
    # file names of certificates to be generated; fall back to the certificate
    # used for the panda connection as the output name
    if hasattr(cred_cfg, 'outCertFile'):
        out_cert_files = self.get_list(cred_cfg.outCertFile)
    else:
        out_cert_files = self.get_list(harvester_config.pandacon.cert_file)
    # VOMS attributes
    vomses = self.get_list(cred_cfg.voms)
    # instantiate one plugin core per configuration tuple
    self.exeCores = []
    for module_name, class_name, in_cert, out_cert, voms in zip(
            module_names, class_names, in_cert_files, out_cert_files, vomses):
        plugin_par = {'module': module_name,
                      'name': class_name,
                      'inCertFile': in_cert,
                      'outCertFile': out_cert,
                      'voms': voms}
        self.exeCores.append(self.pluginFactory.get_plugin(plugin_par))
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Constructor: store collaborators and reset report timestamps."""
    AgentBase.__init__(self, single_mode)
    self.dbProxy = DBProxy()
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    # timestamps of the last stats/metrics reports; None means never sent
    self._last_stats_update = None
    self._last_metrics_update = None
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Constructor: store collaborators, record the host name, reset heartbeat."""
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    # identify this harvester node
    self.nodeName = socket.gethostname()
    # None until the first heartbeat is sent
    self.lastHeartbeat = None
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Constructor: store collaborators and create the plugin factory."""
    AgentBase.__init__(self, single_mode)
    self.dbProxy = DBProxy()
    self.communicator = communicator
    # identify this harvester node
    self.nodeName = socket.gethostname()
    self.queueConfigMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Constructor.

    Keeps references to the communicator and queue config mapper, opens a DB
    proxy, records the host name, and clears the heartbeat timestamp.
    """
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    self.nodeName = socket.gethostname()   # this harvester node
    self.lastHeartbeat = None              # no heartbeat sent yet
def __init__(self, queue_config_mapper, single_mode=False):
    """Constructor: set up DB/plugins, start timestamp, FIFO, and APF monitor."""
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    # remember when this agent started
    self.startTimestamp = time.time()
    self.monitor_fifo = MonitorFIFO()
    self.apfmon = Apfmon(self.queueConfigMapper)
def __init__(self, queue_config_mapper, single_mode=False):
    """Constructor: wire up DB, worker maker/adjuster, plugins, FIFO and APF monitor."""
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    # helpers that create workers and decide how many to submit
    self.workerMaker = WorkerMaker()
    self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
    self.pluginFactory = PluginFactory()
    self.monitor_fifo = MonitorFIFO()
    self.apfmon = Apfmon(self.queueConfigMapper)
def __init__(self, queue_config_mapper):
    """Constructor: store the queue config mapper and initialize limits."""
    self.queue_configMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    # per-queue throttler plugin cache
    self.throttlerMap = dict()
    self.apf_mon = Apfmon(self.queue_configMapper)
    # optional global cap on new workers per cycle; None when not configured
    try:
        self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
    except AttributeError:
        self.maxNewWorkers = None
def __init__(self, queue_config_mapper):
    """Constructor.

    Caches a throttler plugin per queue in throttlerMap and reads the optional
    global maxNewWorkers cap from the submitter configuration.
    """
    self.queueConfigMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    self.throttlerMap = dict()   # queue name -> throttler plugin (or None)
    self.apf_mon = Apfmon(self.queueConfigMapper)
    try:
        self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
    except AttributeError:
        # not configured: no universal cap
        self.maxNewWorkers = None
def __init__(self, **kwarg):
    '''Set up DB connection and credentials'''
    PluginBase.__init__(self, **kwarg)
    self.dbproxy = DBProxy()
    self.schedulerid = harvester_config.master.harvester_id
    # Credential dictionary role: proxy file
    # each voms entry looks like 'key=role'; the role is the part after '='
    roles = [entry.split('=')[1] for entry in list(harvester_config.credmanager.voms)]
    proxies = list(harvester_config.credmanager.outCertFile)
    self.certs = {role: proxy for role, proxy in zip(roles, proxies)}
    self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
def __init__(self, queue_config_mapper, single_mode=False):
    """Constructor: collect plugin cores from harvester and queue configurations."""
    AgentBase.__init__(self, single_mode)
    self.queue_config_mapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    # plugin cores: global ones and per-queue ones
    self.exeCores = []
    self.queue_exe_cores = []
    # get plugin from harvester config
    self.get_cores_from_harvester_config()
    # update plugin cores from queue config
    self.update_cores_from_queue_config()
def __init__(self, **kwarg):
    '''Set up DB connection and credentials'''
    PluginBase.__init__(self, **kwarg)
    self.dbproxy = DBProxy()
    self.schedulerid = harvester_config.master.harvester_id
    # Credential dictionary role: proxy file
    # voms entries have the form 'key=role'; pair each role with its proxy file
    voms_roles = [item.split('=')[1] for item in list(harvester_config.credmanager.voms)]
    proxy_files = list(harvester_config.credmanager.outCertFile)
    self.certs = {r: p for r, p in zip(voms_roles, proxy_files)}
    self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
def __init__(self, pid_file, single_mode=False):
    """Constructor: resolve the pid file, attach to the master process tree."""
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()
    # fall back to the configured pidfile when none is given explicitly
    if pid_file is None:
        try:
            pid_file = harvester_config.service_monitor.pidfile
        except Exception:
            pid_file = None
    self.pid_file = pid_file
    self.pid = self.get_master_pid()
    self.master_process = psutil.Process(self.pid)
    # snapshot of the current child processes of the master
    self.children = self.master_process.children(recursive=True)
    self.cpu_count = multiprocessing.cpu_count()
def __init__(self, queue_config_mapper, single_mode=False):
    """Constructor: set up DB/plugins, FIFOs, APF monitor, and event-based monitor cores."""
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    # remember when this agent started
    self.startTimestamp = time.time()
    self.monitor_fifo = MonitorFIFO()
    # the event FIFO only exists when the monitor FIFO is enabled
    self.monitor_event_fifo = MonitorEventFIFO() if self.monitor_fifo.enabled else None
    self.apfmon = Apfmon(self.queueConfigMapper)
    # event-based monitor plugin cores, one per configured plugin
    self.eventBasedMonCoreList = []
    if getattr(harvester_config.monitor, 'eventBasedEnable', False):
        self.eventBasedMonCoreList = [PluginFactory().get_plugin(pluginConf)
                                      for pluginConf in harvester_config.monitor.eventBasedPlugins]
def __init__(self, pid_file, single_mode=False):
    """Constructor.

    Determines the pid file (explicit argument wins over configuration),
    then attaches to the master process and records its children and the
    host CPU count for later metric normalization.
    """
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()
    if pid_file is None:
        try:
            pid_file = harvester_config.service_monitor.pidfile
        except Exception:
            pid_file = None
    self.pid_file = pid_file
    self.pid = self.get_master_pid()
    self.master_process = psutil.Process(self.pid)
    self.children = self.master_process.children(recursive=True)
    self.cpu_count = multiprocessing.cpu_count()
class Submitter(AgentBase):
    """Agent that makes workers for queued jobs and submits them to the batch system."""

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Loop forever: pick a site, decide worker counts, make and submit workers."""
        # unique lock owner string for this agent instance
        lockedBy = 'submitter-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime)
            mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))
            # get commands (central setNWorkers directives override nNewWorkers per resource)
            if siteName is not None:
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} commands'.format(len(commandSpecs)))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # resMap maps resource type -> queue name
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName]['nNewWorkers'] = tmpNewVal
            # define number of new workers
            if len(curWorkers) == 0:
                nWorkersPerQueue = dict()
            else:
                nWorkersPerQueue = self.workerAdjuster.define_num_workers(curWorkers, siteName)
            if nWorkersPerQueue is None:
                mainLog.error('WorkerAdjuster failed to define the number of workers')
            elif len(nWorkersPerQueue) == 0:
                pass
            else:
                # loop over all queues
                for queueName, tmpVal in iteritems(nWorkersPerQueue):
                    tmpLog = core_utils.make_logger(_logger, 'queue={0}'.format(queueName),
                                                    method_name='run')
                    tmpLog.debug('start')
                    # nWorkers includes ready workers that only need (re-)submission
                    nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                    nReady = tmpVal['nReady']
                    # check queue
                    if not self.queueConfigMapper.has_queue(queueName):
                        tmpLog.error('config not found')
                        continue
                    # no new workers
                    if nWorkers == 0:
                        tmpLog.debug('skipped since no new worker is needed based on current stats')
                        continue
                    # get queue
                    queueConfig = self.queueConfigMapper.get_queue(queueName)
                    # actions based on mapping type: build job chunks, one chunk per worker
                    if queueConfig.mapType == WorkSpec.MT_NoJob:
                        # workers without jobs
                        jobChunks = []
                        for i in range(nWorkers):
                            jobChunks.append([])
                    elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                        # one worker per one job
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, 1, None,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval,
                            lockedBy)
                    elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                        # one worker for multiple jobs
                        nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                  nWorkers)
                        tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, nJobsPerWorker, None,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval,
                            lockedBy, queueConfig.allowJobMixture)
                    elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                        # multiple workers for one job
                        nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                  nWorkers)
                        jobChunks = self.dbProxy.get_job_chunks_for_workers(
                            queueName, nWorkers, nReady, None, nWorkersPerJob,
                            queueConfig.useJobLateBinding,
                            harvester_config.submitter.checkInterval,
                            harvester_config.submitter.lockInterval,
                            lockedBy)
                    else:
                        tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                        continue
                    tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                    if len(jobChunks) == 0:
                        continue
                    # make workers
                    okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                       nReady)
                    if len(ngChunks) == 0:
                        tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                    else:
                        tmpLog.debug('made {0} workers, while {1} workers failed'.format(
                            len(okChunks), len(ngChunks)))
                    timeNow = datetime.datetime.utcnow()
                    # NG: mark jobs whose worker could not be made as failed
                    for ngJobs in ngChunks:
                        for jobSpec in ngJobs:
                            jobSpec.status = 'failed'
                            jobSpec.subStatus = 'failedtomake'
                            jobSpec.stateChangeTime = timeNow
                            jobSpec.lockedBy = None
                            jobSpec.trigger_propagation()
                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                              'subStatus': 'prepared'})
                    # OK: prepare the successfully made workers for submission
                    pandaIDs = set()
                    workSpecList = []
                    if len(okChunks) > 0:
                        for workSpec, okJobs in okChunks:
                            # has job
                            if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                    or queueConfig.mapType == WorkSpec.MT_NoJob:
                                workSpec.hasJob = 0
                            else:
                                workSpec.hasJob = 1
                                if workSpec.nJobsToReFill in [None, 0]:
                                    workSpec.set_jobspec_list(okJobs)
                                else:
                                    # refill free slots during the worker is running
                                    workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                    workSpec.nJobsToReFill = None
                                    # NOTE(review): nJobsToReFill was just reset to None, so this
                                    # slice is okJobs[None:] == all jobs — looks like the slice was
                                    # meant to use the pre-reset value; confirm intent
                                    for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                        pandaIDs.add(jobSpec.PandaID)
                            # map type
                            workSpec.mapType = queueConfig.mapType
                            # queue name
                            workSpec.computingSite = queueConfig.queueName
                            # set access point
                            workSpec.accessPoint = queueConfig.messenger['accessPoint']
                            # events
                            if len(okJobs) > 0 and \
                                    ('eventService' in okJobs[0].jobParams or
                                     'cloneJob' in okJobs[0].jobParams):
                                workSpec.eventsRequest = WorkSpec.EV_useEvents
                            workSpecList.append(workSpec)
                    if len(workSpecList) > 0:
                        # get plugin for submitter
                        submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                        if submitterCore is None:
                            # not found
                            tmpLog.error('submitter plugin for {0} not found'.format(
                                jobSpec.computingSite))
                            continue
                        # get plugin for messenger
                        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                        if messenger is None:
                            # not found
                            tmpLog.error('messenger plugin for {0} not found'.format(
                                jobSpec.computingSite))
                            continue
                        # setup access points
                        messenger.setup_access_points(workSpecList)
                        # feed jobs
                        for workSpec in workSpecList:
                            if workSpec.hasJob == 1:
                                tmpStat = messenger.feed_jobs(workSpec,
                                                              workSpec.get_jobspec_list())
                                if tmpStat is False:
                                    tmpLog.error('failed to send jobs to workerID={0}'.format(
                                        workSpec.workerID))
                                else:
                                    tmpLog.debug('sent jobs to workerID={0} with {1}'.format(
                                        workSpec.workerID, tmpStat))
                        # submit
                        tmpLog.debug('submitting {0} workers'.format(len(workSpecList)))
                        workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                            submitterCore, workSpecList)
                        for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                            workSpec, jobList = okChunks[iWorker]
                            # use associated job list since it can be truncated for re-filling
                            jobList = workSpec.get_jobspec_list()
                            # set status
                            if not tmpRet:
                                # failed submission
                                tmpLog.error('failed to submit a workerID={0} with {1}'.format(
                                    workSpec.workerID, tmpStr))
                                workSpec.set_status(WorkSpec.ST_missed)
                                jobList = []
                            elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                # directly go to running after feeding jobs for late biding
                                workSpec.set_status(WorkSpec.ST_running)
                            else:
                                # normal successful submission
                                workSpec.set_status(WorkSpec.ST_submitted)
                            workSpec.submitTime = timeNow
                            workSpec.modificationTime = timeNow
                            # prefetch events
                            if tmpRet and workSpec.hasJob == 1 and \
                                    workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                eventsRequestParams = dict()
                                for jobSpec in jobList:
                                    eventsRequestParams[jobSpec.PandaID] = {
                                        'pandaID': jobSpec.PandaID,
                                        'taskID': jobSpec.taskID,
                                        'jobsetID': jobSpec.jobParams['jobsetID'],
                                        'nRanges': jobSpec.jobParams['coreCount'],
                                        }
                                workSpec.eventsRequestParams = eventsRequestParams
                            # register worker
                            tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                            if jobList is not None:
                                for jobSpec in jobList:
                                    pandaIDs.add(jobSpec.PandaID)
                                    if tmpStat:
                                        tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                  jobSpec.PandaID,
                                                                  workSpec.batchID))
                                    else:
                                        tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                        tmpLog.error(tmpStr.format(jobSpec.PandaID,
                                                                   workSpec.batchID))
                    # release jobs
                    self.dbProxy.release_jobs(pandaIDs, lockedBy)
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.submitter.sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        """Submit only non-ready workers; ready ones pass through as successes.

        Returns (ordered workspec list, per-worker success flags, per-worker messages);
        the three lists are index-aligned, ready workers first.
        """
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status == WorkSpec.ST_ready:
                # already ready: treat as an immediate success, skip submission
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)
        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(stagerCore.__class__.__name__) tmpLog.debug(msgStr) msgStr = "Initial queueConfig.stager = {}".format(initial_queueConfig_stager) tmpLog.debug(msgStr) msgStr = "Modified queueConfig.stager = {}".format(modified_queueConfig_stager) tmpLog.debug(msgStr) scope = 'panda' proxy = DBProxy() communicator = CommunicatorPool() cacher = Cacher(communicator, single_mode=True) cacher.run() # check if db lock exits locked = stagerCore.dbInterface.get_object_lock('dummy_id_for_out_0',lock_interval=120) if not locked: tmpLog.debug('DB Already locked by another thread') # now unlock db unlocked = stagerCore.dbInterface.release_object_lock('dummy_id_for_out_0') if unlocked : tmpLog.debug('unlocked db') else:
class WorkerAdjuster(object):
    """Decides how many new workers to submit per queue, job type, and resource type."""

    # constructor
    def __init__(self, queue_config_mapper):
        self.queue_configMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # per-queue throttler plugin cache
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queue_configMapper)
        # optional global cap on new workers; None when not configured
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Return a deep copy of static_num_workers with 'nNewWorkers' adjusted.

        static_num_workers maps queue -> job_type -> resource_type -> stats dict
        (nQueue/nReady/nRunning/nNewWorkers). Returns None on error.
        """
        tmp_log = core_utils.make_logger(_logger, 'site={0}'.format(site_name),
                                         method_name='define_num_workers')
        tmp_log.debug('start')
        tmp_log.debug('static_num_workers: {0}'.format(static_num_workers))
        # work on a copy so the caller's input is untouched
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status (cached panda queue configuration)
            queue_stat = self.dbProxy.get_cache("panda_queues.json", None)
            if queue_stat is None:
                queue_stat = dict()
            else:
                queue_stat = queue_stat.data
            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data
            # define num of new workers
            for queue_name in static_num_workers:
                # get queue
                queue_config = self.queue_configMapper.get_queue(queue_name)
                worker_limits_dict = self.dbProxy.get_worker_limits(queue_name)
                max_workers = worker_limits_dict.get('maxWorkers', 0)
                n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0)
                n_queue_limit_per_rt = worker_limits_dict['nQueueLimitWorkerPerRT']
                # totals over real (non-'ANY') job/resource types, for UCORE aggregation below
                n_queue_total, n_ready_total, n_running_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for job_type, jt_values in iteritems(static_num_workers[queue_name]):
                    for resource_type, tmp_val in iteritems(jt_values):
                        tmp_log.debug('Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}'
                                      .format(queue_name, job_type, resource_type, tmp_val))
                        # set 0 to num of new workers when the queue is disabled
                        if queue_name in queue_stat and queue_stat[queue_name]['status'] in [
                                'offline', 'standby', 'maintenance']:
                            dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0
                            ret_msg = 'set n_new_workers=0 since status={0}'.format(
                                queue_stat[queue_name]['status'])
                            tmp_log.debug(ret_msg)
                            apf_msg = 'Not submitting workers since queue status = {0}'.format(
                                queue_stat[queue_name]['status'])
                            continue
                        # protection against not-up-to-date queue config
                        if queue_config is None:
                            dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0
                            ret_msg = 'set n_new_workers=0 due to missing queue_config'
                            tmp_log.debug(ret_msg)
                            apf_msg = 'Not submitting workers because of missing queue_config'
                            continue
                        # get throttler (instantiated once per queue, then cached)
                        if queue_name not in self.throttlerMap:
                            if hasattr(queue_config, 'throttler'):
                                throttler = self.pluginFactory.get_plugin(queue_config.throttler)
                            else:
                                throttler = None
                            self.throttlerMap[queue_name] = throttler
                        # check throttler
                        throttler = self.throttlerMap[queue_name]
                        if throttler is not None:
                            to_throttle, tmp_msg = throttler.to_be_throttled(queue_config)
                            if to_throttle:
                                dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0
                                ret_msg = 'set n_new_workers=0 by {0}:{1}'.format(
                                    throttler.__class__.__name__, tmp_msg)
                                tmp_log.debug(ret_msg)
                                continue
                        # check stats
                        n_queue = tmp_val['nQueue']
                        n_ready = tmp_val['nReady']
                        n_running = tmp_val['nRunning']
                        # only real job/resource types count toward the aggregated totals
                        if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None:
                            n_queue_total += n_queue
                            n_ready_total += n_ready
                            n_running_total += n_running
                        if queue_config.runMode == 'slave':
                            # in slave mode panda dictates the number of new workers
                            n_new_workers_def = tmp_val['nNewWorkers']
                            if n_new_workers_def == 0:
                                dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0
                                ret_msg = 'set n_new_workers=0 by panda in slave mode'
                                tmp_log.debug(ret_msg)
                                continue
                        else:
                            n_new_workers_def = None
                        # define num of new workers based on static site config
                        n_new_workers = 0
                        if n_queue >= n_queue_limit_per_rt > 0:
                            # enough queued workers
                            ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format(
                                n_queue, n_queue_limit_per_rt)
                            tmp_log.debug(ret_msg)
                            pass
                        elif (n_queue + n_ready + n_running) >= max_workers > 0:
                            # enough workers in the system
                            ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format(
                                n_queue, n_ready, n_running)
                            ret_msg += '>= max_workers({0})'.format(max_workers)
                            tmp_log.debug(ret_msg)
                            pass
                        else:
                            max_queued_workers = None
                            if n_queue_limit_per_rt > 0:
                                # there is a limit set for the queue
                                max_queued_workers = n_queue_limit_per_rt
                            # Reset the maxQueueWorkers according to particular
                            if n_new_workers_def is not None:
                                # don't surpass limits given centrally
                                maxQueuedWorkers_slave = n_new_workers_def + n_queue
                                if max_queued_workers is not None:
                                    max_queued_workers = min(maxQueuedWorkers_slave,
                                                             max_queued_workers)
                                else:
                                    max_queued_workers = maxQueuedWorkers_slave
                            elif queue_config.mapType == 'NoJob':
                                # for pull mode, limit to activated jobs
                                # limit the queue to the number of activated jobs to avoid empty pilots
                                try:
                                    n_activated = max(job_stats[queue_name]['activated'], 1)  # avoid no activity queues
                                    queue_limit = max_queued_workers
                                    max_queued_workers = min(n_activated, max_queued_workers)
                                    tmp_log.debug('limiting max_queued_workers to min(n_activated={0}, queue_limit={1})'
                                                  .format(n_activated, queue_limit))
                                except KeyError:
                                    tmp_log.warning('n_activated not defined, defaulting to configured queue limits')
                                    pass
                            if max_queued_workers is None:
                                # no value found, use default value
                                max_queued_workers = 1
                            # new workers: top up the queue to max_queued_workers
                            n_new_workers = max(max_queued_workers - n_queue, 0)
                            tmp_log.debug('setting n_new_workers to {0} in max_queued_workers calculation'
                                          .format(n_new_workers))
                            if max_workers > 0:
                                n_new_workers = min(n_new_workers,
                                                    max(max_workers - n_queue - n_ready - n_running, 0))
                                tmp_log.debug('setting n_new_workers to {0} to respect max_workers'
                                              .format(n_new_workers))
                        if queue_config.maxNewWorkersPerCycle > 0:
                            n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle)
                            tmp_log.debug('setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle'
                                          .format(n_new_workers))
                        if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                            n_new_workers = min(n_new_workers, self.maxNewWorkers)
                            tmp_log.debug('setting n_new_workers to {0} in order to respect universal maxNewWorkers'
                                          .format(n_new_workers))
                        dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers
                # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers
                if queue_config is None:
                    max_new_workers_per_cycle = 0
                    ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config'
                    tmp_log.debug(ret_msg)
                else:
                    max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle
                if len(dyn_num_workers[queue_name]) > 1:
                    # sum of new workers over all real job/resource types
                    total_new_workers_rts = 0
                    for _jt in dyn_num_workers[queue_name]:
                        for _rt in dyn_num_workers[queue_name][_jt]:
                            if _jt != 'ANY' and _rt != 'ANY':
                                total_new_workers_rts = total_new_workers_rts + dyn_num_workers[
                                    queue_name][_jt][_rt]['nNewWorkers']
                    # queue-level cap: both the queued-worker limit and the total-worker limit
                    n_new_workers_max_agg = min(
                        max(n_queue_limit - n_queue_total, 0),
                        max(max_workers - n_queue_total - n_ready_total - n_running_total, 0))
                    if max_new_workers_per_cycle >= 0:
                        n_new_workers_max_agg = min(n_new_workers_max_agg,
                                                    max_new_workers_per_cycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        n_new_workers_max_agg = min(n_new_workers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > n_new_workers_max_agg:
                        if n_new_workers_max_agg == 0:
                            for job_type in dyn_num_workers[queue_name]:
                                for resource_type in dyn_num_workers[queue_name][job_type]:
                                    dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0
                            tmp_log.debug('No n_new_workers since n_new_workers_max_agg=0 for UCORE')
                        else:
                            tmp_log.debug('n_new_workers_max_agg={0} for UCORE'.format(
                                n_new_workers_max_agg))
                            # distribute n_new_workers_max_agg proportionally over the
                            # real resource types, largest remainders first
                            _d = dyn_num_workers[queue_name].copy()
                            del _d['ANY']  # TODO: needs to be recalculated
                            simple_rt_nw_list = []
                            for job_type in _d:  # jt: job type
                                for resource_type in _d[job_type]:  # rt: resource type
                                    simple_rt_nw_list.append([
                                        (resource_type, job_type),
                                        _d[job_type][resource_type].get('nNewWorkers', 0),
                                        0])
                            _countdown = n_new_workers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                (resource_type, job_type), n_new_workers_orig, _r = _rt_list
                                # integer share of the aggregate budget; remainder kept for later
                                n_new_workers, remainder = divmod(
                                    n_new_workers_orig * n_new_workers_max_agg,
                                    total_new_workers_rts)
                                dyn_num_workers[queue_name][job_type].setdefault(
                                    resource_type, {'nReady': 0, 'nRunning': 0,
                                                    'nQueue': 0, 'nNewWorkers': 0})
                                dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers
                                _rt_list[2] = remainder
                                _countdown -= n_new_workers
                            # hand out the leftover budget by largest remainder
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for (resource_type, job_type), n_new_workers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                    for job_type in dyn_num_workers[queue_name]:
                        for resource_type in dyn_num_workers[queue_name][job_type]:
                            if job_type == 'ANY' or resource_type == 'ANY':
                                continue
                            n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers']
                            tmp_log.debug('setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE'
                                          .format(n_new_workers, job_type, resource_type))
                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queue_name])
                # report the decision to the APF monitor
                self.apf_mon.update_label(queue_name, apf_msg, apf_data)
            # dump
            tmp_log.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            err_msg = core_utils.dump_error_message(tmp_log)
            return None
class ServiceMonitor(AgentBase):
    """Agent that periodically records harvester service health metrics.

    Sums memory/CPU usage over the harvester master process and all of its
    children (via psutil), measures disk usage of configured volumes with
    ``df``, and stores the figures in the DB as a ServiceMetricSpec.
    """

    # constructor
    def __init__(self, pid_file, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()
        # an explicit pid_file argument wins over the configured one
        if pid_file is not None:
            self.pid_file = pid_file
        else:
            try:
                self.pid_file = harvester_config.service_monitor.pidfile
            except Exception:
                self.pid_file = None
        self.pid = self.get_master_pid()
        self.master_process = psutil.Process(self.pid)
        self.children = self.master_process.children(recursive=True)
        self.cpu_count = multiprocessing.cpu_count()

    def get_master_pid(self):
        """
        Gets the master pid from the lock file
        :return: pid as int, or None when the file cannot be read/parsed
        """
        try:
            # context manager guarantees the handle is closed even when int() raises
            with open(self.pid_file, 'r') as fh:
                pid = int(fh.readline())
        except Exception:
            _logger.error('Could not read pidfile "{0}"'.format(self.pid_file))
            pid = None
        return pid

    def refresh_children_list(self, children):
        """Merge a freshly obtained children list with the stored one.

        Reuses the stored psutil.Process object when the pid is already known,
        so that interval-based counters (cpu_percent) keep their state; brand
        new children are taken as-is.
        :param children: current list of psutil child processes
        :return: refreshed list, also stored on self.children
        """
        children_refreshed = []
        for child_current in children:
            pid_current = child_current.pid
            found = False
            for child_stored in self.children:
                pid_stored = child_stored.pid
                if pid_stored == pid_current:
                    found = True
                    break
            if found:
                children_refreshed.append(child_stored)
            else:
                children_refreshed.append(child_current)
        self.children = children_refreshed
        return children_refreshed

    def get_memory_n_cpu(self):
        """
        sum memory of whole process tree starting from master pid
        :return: (rss in MiB, memory percentage, cpu percentage), all None on error
        """
        try:
            master_process = self.master_process
            rss = master_process.memory_info()[0]
            memory_pc = master_process.memory_percent()
            cpu_pc = master_process.cpu_percent()
            children = self.refresh_children_list(master_process.children(recursive=True))
            for child in children:
                rss += child.memory_info()[0]
                memory_pc += child.memory_percent()
                cpu_pc += child.cpu_percent()
            # convert bytes to MiB
            rss_mib = rss / float(2 ** 20)
            # normalize cpu percentage by cpu count
            cpu_pc = cpu_pc * 1.0 / self.cpu_count
        except Exception:
            _logger.error('Excepted with: {0}'.format(traceback.format_exc()))
            rss_mib = None
            memory_pc = None
            cpu_pc = None
        return rss_mib, memory_pc, cpu_pc

    def volume_use(self, volume_name):
        """Return the percentage of disk space used on a volume.

        Runs ``df -Pkh /<volume_name>`` and parses the usage column from the
        matching output line.
        :param volume_name: volume name as configured (appended to "/")
        :return: used percentage as float, or None when it cannot be parsed
        """
        command = "df -Pkh /" + volume_name
        used_amount = 0
        tmp_array = command.split()
        output = subprocess.Popen(tmp_array, stdout=subprocess.PIPE).communicate()[0].decode("utf-8")

        for line in output.split('\n'):
            if re.search(volume_name, line):
                # guard against lines that mention the volume but carry no "NN%"
                # (the unguarded .group(1) used to raise AttributeError there)
                tmp_match = re.search(r"(\d+)%", line)
                if tmp_match:
                    used_amount = tmp_match.group(1)

        try:
            used_amount_float = float(used_amount)
        except ValueError:
            used_amount_float = None
            _logger.error('Could not convert used amount {0} to float for volume {1}'.format(used_amount, volume_name))

        return used_amount_float

    # main loop
    def run(self):
        """Collect metrics, store them in the DB, and sleep, until terminated."""
        while True:
            _logger.debug('Running service monitor')
            service_metrics = {}
            # get memory usage
            rss_mib, memory_pc, cpu_pc = self.get_memory_n_cpu()
            service_metrics['rss_mib'] = rss_mib
            service_metrics['memory_pc'] = memory_pc
            service_metrics['cpu_pc'] = cpu_pc
            _logger.debug('Memory usage: {0} MiB/{1}%, CPU usage: {2}'.format(rss_mib, memory_pc, cpu_pc))
            # get volume usage
            try:
                volumes = harvester_config.service_monitor.disk_volumes.split(',')
            except Exception:
                volumes = []
            for volume in volumes:
                # renamed from "volume_use" to avoid shadowing the method name
                volume_pc = self.volume_use(volume)
                _logger.debug('Disk usage of {0}: {1} %'.format(volume, volume_pc))
                service_metrics['volume_{0}_pc'.format(volume)] = volume_pc
            service_metrics_spec = ServiceMetricSpec(service_metrics)
            self.db_proxy.insert_service_metrics(service_metrics_spec)
            # check if being terminated
            try:
                sleep_time = harvester_config.service_monitor.sleepTime
            except Exception:
                sleep_time = 300
            if self.terminated(sleep_time, randomize=False):
                return
class Submitter(AgentBase):
    """Agent that creates and submits workers for queues with pending jobs.

    Each cycle locks one site in the DB, asks the WorkerAdjuster how many new
    workers every (queue, resource type) pair needs, builds job chunks
    according to the queue's mapping type, turns them into workers via the
    WorkerMaker, feeds jobs through the messenger, and hands the workers to
    the queue's submitter plugin. Newly submitted workers are optionally
    enqueued into the monitor FIFO.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        """
        :param queue_config_mapper: mapper giving access to per-queue configuration
        :param single_mode: run a single cycle only (handled by AgentBase)
        """
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        """Main submission loop; exits only when the agent is terminated."""
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        # a full site cycle may legitimately take longer than the per-queue lock
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
                                                                             harvester_config.submitter.lookupTime,
                                                                             harvester_config.submitter.lockInterval,
                                                                             lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))
                # get commands from the panda server addressed to this site
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal
                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)
                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):
                            tmpLog = self.make_logger(_logger,
                                                      'id={0} queue={1} rtype={2}'.format(lockedBy,
                                                                                          queueName,
                                                                                          resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                # nReady workers are recycled, so they count towards the target
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']
                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue
                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready (dynamic-sizing worker makers only)
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                            resource_type,
                                                                                            workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            # fall back to a static cap on queued+running workers
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('No left static workers, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                             nWorkers,
                                                                                             resource_type,
                                                                                             maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue
                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good) chunks: release or fail their jobs
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK chunks: prepare WorkSpecs for submission
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots during the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                # NOTE(review): nJobsToReFill is reset to None before this
                                                # slice, so okJobs[None:] is the WHOLE list, not just the
                                                # leftover jobs — looks suspicious, confirm intent
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        # NOTE(review): jobSpec here is a leftover loop variable from the
                                        # chunk loops above — may be stale or unbound; verify
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'.format(jobSpec.computingSite))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'.format(jobSpec.computingSite))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'.format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'.format(workSpec.workerID,
                                                                                                tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                 + sw.get_elapsed_time())
                                    # collect successful jobs first so failed duplicates can be skipped below
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late biding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList
                                             if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                            getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                    harvester_config.monitor.checkInterval),
                                            getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                    harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue),
                                                         time.time() + check_delay)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(
                        queueLockInterval) + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    # postpone the site's next submission cycle
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)
            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return

    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        """Submit workers, passing ready/running ones through untouched.

        :param submitter_core: submitter plugin instance
        :param workspec_list: WorkSpecs to submit
        :return: (reordered WorkSpec list, per-worker status list, per-worker message list);
                 ready/running workers come first and get (True, '') entries
        """
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                # already has a batch job; no need to submit again
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)

        # submit the workers to the monitoring
        self.apfmon.create_workers(workersToSubmit)

        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
"""Debugging helper: inject a single event-status update for a worker.

Usage: <script> <workerID> <eventRangeID> <eventStatus>
Writes the update JSON into the worker's access point where the
shared-file messenger picks it up.
"""
import os
import sys
import json

from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestermessenger import shared_file_messenger

workerID = int(sys.argv[1])
eventID = sys.argv[2]
status = sys.argv[3]

proxy = DBProxy()
workSpec = proxy.get_worker_with_id(workerID)
jobSpec = proxy.get_jobs_with_worker_id(workerID, None)[0]
accessPoint = workSpec.get_access_point()
# make sure the access point exists; only swallow OS-level errors such as
# "directory already exists" (the old bare except hid every failure)
try:
    os.makedirs(accessPoint)
except OSError:
    pass
node = {}
node['eventRangeID'] = eventID
node['eventStatus'] = status
# with-statement guarantees the file is closed/flushed even if json.dump raises
with open(os.path.join(accessPoint, shared_file_messenger.jsonEventsUpdateFileName), 'w') as f:
    json.dump([node], f)
    def __init__(self, **kwarg):
        """Throttler plugin constructor.

        :param kwarg: plugin configuration attributes forwarded to PluginBase.
            logicType is set before PluginBase.__init__ — presumably so a
            'logicType' entry in the plugin config can override the default;
            confirm against PluginBase behavior.
        """
        # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        # pooled DB proxy used to evaluate throttling rules
        self.dbProxy = DBProxy()
stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(preparatorCore.__class__.__name__) tmpLog.debug(msgStr) msgStr = "Initial queueConfig.preparator = {}".format( initial_queueConfig_preparator) tmpLog.debug(msgStr) msgStr = "Modified queueConfig.preparator = {}".format( modified_queueConfig_preparator) tmpLog.debug(msgStr) scope = 'panda' proxy = DBProxy() communicator = CommunicatorPool() cacher = Cacher(communicator, single_mode=True) cacher.run() tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__)) tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath) # get all jobs in table in a preparing substate tmpLog.debug('try to get all jobs in a preparing substate') jobSpec_list = proxy.get_jobs_in_sub_status('preparing', 2000, None, None, None, None, None, None) tmpLog.debug('got {0} jobs'.format(len(jobSpec_list))) # loop over all found jobs if len(jobSpec_list) > 0:
"""Debugging helper: fetch event ranges for a worker's first job and feed
them through the shared-file messenger.

Usage: <script> <workerID>
"""
import os
import sys

from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
from pandaharvester.harvestermessenger import shared_file_messenger

workerID = int(sys.argv[1])

proxy = DBProxy()
workSpec = proxy.get_worker_with_id(workerID)
jobSpec = proxy.get_jobs_with_worker_id(workerID, None)[0]
accessPoint = workSpec.get_access_point()
# make sure the access point exists; only swallow OS-level errors such as
# "directory already exists" (the old bare except hid every failure)
try:
    os.makedirs(accessPoint)
except OSError:
    pass
# request parameters for the event-range call, taken from the job
node = {}
node['pandaID'] = jobSpec.PandaID
node['jobsetID'] = jobSpec.jobParams['jobsetID']
node['taskID'] = jobSpec.taskID
a = CommunicatorPool()
tmpStat, tmpVal = a.getEventRanges(node)
mess = shared_file_messenger.SharedFileMessenger()
mess.feed_events(workSpec, tmpVal)
"""Debugging helper: create an empty job-request file in a worker's access
point so the shared-file messenger sees a pending job request.

Usage: <script> <workerID>
"""
import os
import sys

from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestermessenger import shared_file_messenger

workerID = int(sys.argv[1])

proxy = DBProxy()
workSpec = proxy.get_worker_with_id(workerID)
accessPoint = workSpec.get_access_point()
# make sure the access point exists; only swallow OS-level errors such as
# "directory already exists" (the old bare except hid every failure)
try:
    os.makedirs(accessPoint)
except OSError:
    pass
# an empty file is enough: only its existence matters
with open(os.path.join(accessPoint, shared_file_messenger.jsonJobRequestFileName), 'w'):
    pass
except Exception: pass for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) queueConfigMapper = QueueConfigMapper() proxy = DBProxy() proxy.make_tables(queueConfigMapper) job = JobSpec() job.PandaID = 1 job.modificationTime = datetime.datetime.now() proxy.insert_jobs([job]) newJob = proxy.get_job(1) a = CommunicatorPool() a.get_jobs('siteName', 'nodeName', 'prodSourceLabel', 'computingElement', 1, {})
class ARCSubmitter(PluginBase):
    '''Submitter for ARC CE'''

    def __init__(self, **kwarg):
        '''Set up DB connection and credentials'''
        PluginBase.__init__(self, **kwarg)

        self.dbproxy = DBProxy()
        self.schedulerid = harvester_config.master.harvester_id

        # Credential dictionary role: proxy file
        self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)],
                              list(harvester_config.credmanager.outCertFile)))
        self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)

    def _run_submit(self, thr):
        '''Run a thread to do the submission.

        :param thr: SubmitThr instance
        :return: the submitted ARC job object
        :raises Exception: on submission timeout or failure
        '''
        try:
            thr.start()
        except Exception:
            # best-effort start; a failed start is detected below via thr.job
            pass

        # Be careful to wait longer than submission timeout
        thr.join(thr.userconfig.Timeout() + 60.0)
        # is_alive() replaces isAlive(), which was removed in Python 3.9
        if thr.is_alive():
            # abort due to timeout and try again
            raise Exception("Submission timeout")
        if thr.job is None:
            raise Exception("Submission failed")
        return thr.job

    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit.

        :param xrsl: job description in XRSL format
        :param arcces: list of (endpoint, queue) tuples to try
        :param userconfig: arc.UserConfig with credentials
        :param log: logger
        :return: submitted ARC job
        :raises Exception: when no queue is available or the description is bad
        '''
        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.ogf.glue.emies.resourceinfo')]
            else:
                aris = 'ldap://'+aris.Host()+'/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris,
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.ldapng')]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info("Target {0} does not have ComputingService ID defined, skipping".format(
                        target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                        and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug("Rejecting target interface {0} because not EMI-ES".format(
                        target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                targethost = re.sub(':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug('Rejecting target host {0} as it does not match {1}'.format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug('Rejecting target queue {0} as it does not match {1}'.format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)

    def _set_logdir(self, site):
        '''Return the per-day, per-site relative log directory.'''
        date = time.strftime('%Y-%m-%d')
        return os.path.join(date, site)

    # submit workers
    def submit_workers(self, workspec_list):
        '''Submit one ARC job per job attached to each worker.

        :param workspec_list: list of WorkSpecs to submit
        :return: list of (bool, message) tuples, one per attempted job
        '''
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:
            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log
            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():
                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    # stray "%" removed from the old message "No CEs defined for %{0}"
                    retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
                pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(jobspec.jobParams,
                                    jobspec.computingSite,
                                    pandaqueues[jobspec.computingSite],
                                    logfileurl,
                                    self.schedulerid,
                                    osmap,
                                    '/tmp',  # tmpdir, TODO common tmp dir
                                    None,  # jobSpec.eventranges, # TODO event ranges
                                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))

                # Set the files to be downloaded at the end of the job
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'

                # Set certificate
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    tmplog.error(traceback.format_exc())
                    result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
class Sweeper(AgentBase):
    """Agent that kills cancelled/stale workers and cleans up finished ones.

    Each cycle: (1) fetch workers flagged for killing and invoke the queue's
    sweeper plugin on them; (2) fetch workers whose terminal status exceeded
    its retention timeout, sweep them, and delete them from the DB.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        """
        :param queue_config_mapper: mapper giving access to per-queue configuration
        :param single_mode: run a single cycle only (handled by AgentBase)
        """
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Kill and clean up workers until the agent is terminated."""
        lockedBy = 'sweeper-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers
            for queueName, workSpecs in iteritems(workersToKill):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName):
                    mainLog.error('queue config for {0} not found'.format(queueName))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpLog.debug('start killing')
                    tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                    tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
            mainLog.debug('done kill')
            # timeout for missed workers; getattr with a default instead of the
            # old bare "except:" which silently swallowed every error
            keepMissed = getattr(harvester_config.sweeper, 'keepMissed', 24)
            # get workers for cleanup, keyed by terminal status -> retention hours
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            for queueName, workSpecs in iteritems(workersForCleanup):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName):
                    mainLog.error('queue config for {0} not found'.format(queueName))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    tmpLog.debug('start cleanup')
                    tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                    tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                    if tmpStat:
                        # delete from DB only after a successful sweep
                        self.dbProxy.delete_worker(workSpec.workerID)
            mainLog.debug('done cleanup')
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
    def __init__(self):
        """Set up the plugin factory and a pooled DB proxy used by this component."""
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
def __init__(self, **kwarg):
    """Plugin constructor.

    kwarg: plugin configuration attributes, handled by PluginBase.__init__.
    """
    # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
    # NOTE(review): default is set before PluginBase.__init__, presumably so a
    # 'logicType' entry in kwarg can override it — confirm against PluginBase
    self.logicType = 'OR'
    PluginBase.__init__(self, **kwarg)
    self.dbProxy = DBProxy()
# NOTE(review): fragment of a standalone preparator debug script. It is truncated
# at both ends in this chunk: the leading "if" belongs to a logger-redirection loop
# that starts outside this view, and the trailing "else :" branch continues outside
# this view. Code is kept as-is; only comments were added.
# skip the db_proxy logger; mirror every other panda logger to stdout
if loggerName.split('.')[-1] in ['db_proxy']:
    continue
stdoutHandler = logging.StreamHandler(sys.stdout)
stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
loggerObj.addHandler(stdoutHandler)
msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.preparator = {}".format(initial_queueConfig_preparator)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator)
tmpLog.debug(msgStr)
scope = 'panda'
proxy = DBProxy()
communicator = CommunicatorPool()
# refresh the local cache once before querying jobs
cacher = Cacher(communicator, single_mode=True)
cacher.run()
tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath)
# get all jobs in table in a preparing substate
#tmpLog.debug('try to get all jobs in a preparing substate')
#jobSpec_list = proxy.get_jobs_in_sub_status('preparing',2000,None,None,None,None,None,None)
# get all jobs
if job_id > 0 :
    tmpLog.debug('try to get job ID - {}'.format(job_id))
    jobSpec_list = [proxy.get_job(job_id)]
else :
# Debug helper: mirror harvester loggers to stdout, then dump the raw contents
# of the job and file tables from the local harvester DB.
for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        # keep db_proxy quiet; everything else goes to stdout
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)
pp = pprint.PrettyPrinter(indent=4)
queueConfigMapper = QueueConfigMapper()
proxy = DBProxy()
# fetch all rows of both tables; commit releases the read locks
sqlJ = "SELECT * FROM job_table"
resultsJobcur = proxy.execute(sqlJ)
resultsJob = resultsJobcur.fetchall()
proxy.commit()
sqlF = "SELECT * FROM file_table"
resultsFilescur = proxy.execute(sqlF)
resultsFiles = resultsFilescur.fetchall()
proxy.commit()
# FIX: Python 2 print statements converted to print() calls, consistent with the
# rest of the file; also guard against an empty job_table before indexing row 0.
if resultsJob:
    print("job_table - ")
    print(resultsJob[0].keys())
# NOTE(review): fragment of a second preparator debug script, truncated at both
# ends in this chunk (leading "if" belongs to an out-of-view logger loop; the final
# tmpLog.debug call is cut mid-expression). Code is kept as-is; only comments added.
# skip the db_proxy logger; mirror every other panda logger to stdout
if loggerName.split('.')[-1] in ['db_proxy']:
    continue
stdoutHandler = logging.StreamHandler(sys.stdout)
stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
loggerObj.addHandler(stdoutHandler)
msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.preparator = {}".format(initial_queueConfig_preparator)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator)
tmpLog.debug(msgStr)
scope = 'panda'
proxy = DBProxy()
communicator = CommunicatorPool()
# refresh the local cache once before querying jobs
cacher = Cacher(communicator, single_mode=True)
cacher.run()
tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath)
# get all jobs in table in a preparing substate
tmpLog.debug('try to get all jobs in a preparing substate')
jobSpec_list = proxy.get_jobs_in_sub_status('preparing',2000,None,None,None,None,None,None)
tmpLog.debug('got {0} jobs'.format(len(jobSpec_list)))
# loop over all found jobs
if len(jobSpec_list) > 0 :
    for jobSpec in jobSpec_list:
        tmpLog.debug(' PandaID = %d status = %s subStatus = %s lockedBy = %s' %
class Monitor(AgentBase):
    """Agent that checks worker status through the monitor plugins and
    propagates the results to the local DB, optionally scheduling re-checks
    through a FIFO (MonitorFIFO) when it is enabled.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()
        # used to suppress force-enqueue right after startup
        self.startTimestamp = time.time()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        """Alternate between DB-driven cycles (every monitor.sleepTime) and,
        when the FIFO is enabled, FIFO-driven cycles that re-check workers
        whose FIFO score has come due.
        """
        lockedBy = 'monitor-{0}'.format(self.get_pid())
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        # optional FIFO tuning knobs with defaults
        try:
            fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli
        except AttributeError:
            fifoSleepTimeMilli = 5000
        try:
            fifoCheckDuration = harvester_config.monitor.fifoCheckDuration
        except AttributeError:
            fifoCheckDuration = 30
        try:
            fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk
        except AttributeError:
            fifoMaxWorkersPerChunk = 500
        try:
            fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue
        except AttributeError:
            fifoProtectiveDequeue = True
        last_DB_cycle_timestamp = 0
        monitor_fifo = self.monitor_fifo
        sleepTime = (fifoSleepTimeMilli / 1000.0) \
            if monitor_fifo.enabled else harvester_config.monitor.sleepTime
        adjusted_sleepTime = sleepTime
        if monitor_fifo.enabled:
            monitor_fifo.restore()
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('start a monitor cycle')
            # DB cycle: at most once per monitor.sleepTime, and never in
            # single mode when the FIFO is enabled
            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \
                    not (monitor_fifo.enabled and self.singleMode):
                # run with workers from DB
                sw_db = core_utils.get_stopwatch()
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(harvester_config.monitor.maxWorkers,
                                                                      harvester_config.monitor.checkInterval,
                                                                      harvester_config.monitor.lockInterval,
                                                                      lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID)
                        if monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    # score = time at which the chunk becomes due
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put((queueName, workSpecsToEnqueue), score)
                                    mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error('failed to put object from FIFO: {0}'.format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    # negative-leaning score puts the chunk near the head
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score)
                                    mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error('failed to put object from FIFO head: {0}'.format(errStr))
                last_DB_cycle_timestamp = time.time()
                if sw_db.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                    mainLog.warning('a single DB cycle was longer than lockInterval ' + sw_db.get_elapsed_time())
                else:
                    mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time())
                mainLog.debug('ended run with DB')
            elif monitor_fifo.enabled:
                # run with workers from FIFO
                sw = core_utils.get_stopwatch()
                n_loops = 0
                n_loops_hit = 0
                last_fifo_cycle_timestamp = time.time()
                to_break = False
                obj_dequeued_id_list = []
                # per-queue accumulators: [workSpecs, max timestamp, max checkInterval]
                obj_to_enqueue_dict = collections.defaultdict(lambda: [[], 0, 0])
                obj_to_enqueue_to_head_dict = collections.defaultdict(lambda: [[], 0, 0])
                remaining_obj_to_enqueue_dict = {}
                remaining_obj_to_enqueue_to_head_dict = {}
                n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0
                # drain the FIFO for at most fifoCheckDuration seconds
                while time.time() < last_fifo_cycle_timestamp + fifoCheckDuration:
                    sw.reset()
                    n_loops += 1
                    retVal, overhead_time = monitor_fifo.to_check_workers()
                    if overhead_time is not None:
                        n_chunk_peeked_stat += 1
                        sum_overhead_time_stat += overhead_time
                    if retVal:
                        # check fifo size
                        fifo_size = monitor_fifo.size()
                        mainLog.debug('FIFO size is {0}'.format(fifo_size))
                        mainLog.debug('starting run with FIFO')
                        try:
                            obj_gotten = monitor_fifo.get(timeout=1, protective=fifoProtectiveDequeue)
                        except Exception as errStr:
                            mainLog.error('failed to get object from FIFO: {0}'.format(errStr))
                        else:
                            if obj_gotten is not None:
                                sw_fifo = core_utils.get_stopwatch()
                                if fifoProtectiveDequeue:
                                    # remember protected ids so they can be released at cycle end
                                    obj_dequeued_id_list.append(obj_gotten.id)
                                queueName, workSpecsList = obj_gotten.item
                                mainLog.debug('got a chunk of {0} workers of {1} from FIFO'.format(len(workSpecsList), queueName) + sw.get_elapsed_time())
                                sw.reset()
                                configID = None
                                for workSpecs in workSpecsList:
                                    if configID is None and len(workSpecs) > 0:
                                        configID = workSpecs[0].configID
                                    for workSpec in workSpecs:
                                        # rebuild pandaid_list from the attached job specs if missing
                                        if workSpec.pandaid_list is None:
                                            _jobspec_list = workSpec.get_jobspec_list()
                                            if _jobspec_list is not None:
                                                workSpec.pandaid_list = [j.PandaID for j in workSpec.get_jobspec_list()]
                                            else:
                                                workSpec.pandaid_list = []
                                            workSpec.force_update('pandaid_list')
                                retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList,
                                                                 from_fifo=True, config_id=configID)
                                if retVal is not None:
                                    workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                    try:
                                        # accumulate until the per-chunk cap; overflow goes to
                                        # remaining_* and stops the drain loop
                                        if len(obj_to_enqueue_dict[queueName][0]) + len(workSpecsToEnqueue) <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_dict[queueName][0].extend(workSpecsToEnqueue)
                                            obj_to_enqueue_dict[queueName][1] = max(obj_to_enqueue_dict[queueName][1], timeNow_timestamp)
                                            obj_to_enqueue_dict[queueName][2] = max(obj_to_enqueue_dict[queueName][2], fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_dict[queueName] = [workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval]
                                    except Exception as errStr:
                                        mainLog.error('failed to gather workers for FIFO: {0}'.format(errStr))
                                        to_break = True
                                    try:
                                        if len(obj_to_enqueue_to_head_dict[queueName][0]) + len(workSpecsToEnqueueToHead) <= fifoMaxWorkersPerChunk:
                                            obj_to_enqueue_to_head_dict[queueName][0].extend(workSpecsToEnqueueToHead)
                                            obj_to_enqueue_to_head_dict[queueName][1] = max(obj_to_enqueue_to_head_dict[queueName][1], timeNow_timestamp)
                                            obj_to_enqueue_to_head_dict[queueName][2] = max(obj_to_enqueue_to_head_dict[queueName][2], fifoCheckInterval)
                                        else:
                                            to_break = True
                                            remaining_obj_to_enqueue_to_head_dict[queueName] = [workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval]
                                    except Exception as errStr:
                                        mainLog.error('failed to gather workers for FIFO head: {0}'.format(errStr))
                                        to_break = True
                                    mainLog.debug('checked {0} workers from FIFO'.format(len(workSpecsList)) + sw.get_elapsed_time())
                                else:
                                    mainLog.debug('monitor_agent_core returned None. Skipped putting to FIFO')
                                if sw_fifo.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                                    mainLog.warning('a single FIFO cycle was longer than lockInterval ' + sw_fifo.get_elapsed_time())
                                else:
                                    mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time())
                                n_loops_hit += 1
                                if to_break:
                                    break
                            else:
                                mainLog.debug('got nothing in FIFO')
                    else:
                        mainLog.debug('workers in FIFO too young to check. Skipped')
                        if self.singleMode:
                            break
                        if overhead_time is not None:
                            time.sleep(max(-overhead_time * random.uniform(0.1, 1), adjusted_sleepTime))
                        else:
                            time.sleep(max(fifoCheckDuration * random.uniform(0.1, 1), adjusted_sleepTime))
                mainLog.debug('run {0} loops, including {1} FIFO cycles'.format(n_loops, n_loops_hit))
                # enqueue to fifo
                sw.reset()
                n_chunk_put = 0
                mainLog.debug('putting worker chunks to FIFO')
                for _dct in (obj_to_enqueue_dict, remaining_obj_to_enqueue_dict):
                    for queueName, obj_to_enqueue in iteritems(_dct):
                        try:
                            workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue
                            if workSpecsToEnqueue:
                                score = fifoCheckInterval + timeNow_timestamp
                                monitor_fifo.put((queueName, workSpecsToEnqueue), score)
                                n_chunk_put += 1
                                mainLog.info('put a chunk of {0} workers of {1} to FIFO with score {2}'.format(len(workSpecsToEnqueue), queueName, score))
                        except Exception as errStr:
                            mainLog.error('failed to put object from FIFO: {0}'.format(errStr))
                mainLog.debug('putting worker chunks to FIFO head')
                for _dct in (obj_to_enqueue_to_head_dict, remaining_obj_to_enqueue_to_head_dict):
                    for queueName, obj_to_enqueue_to_head in iteritems(_dct):
                        try:
                            workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head
                            if workSpecsToEnqueueToHead:
                                # offset by 2**32 so head chunks sort before normal ones
                                score = fifoCheckInterval + timeNow_timestamp - 2**32
                                monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score)
                                n_chunk_put += 1
                                mainLog.info('put a chunk of {0} workers of {1} to FIFO with score {2}'.format(len(workSpecsToEnqueueToHead), queueName, score))
                        except Exception as errStr:
                            mainLog.error('failed to put object from FIFO head: {0}'.format(errStr))
                # release protective dequeued objects
                if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0:
                    monitor_fifo.release(ids=obj_dequeued_id_list)
                mainLog.debug('put {0} worker chunks into FIFO'.format(n_chunk_put) + sw.get_elapsed_time())
                # adjust adjusted_sleepTime: shrink when peeking overhead dominates,
                # relax back toward sleepTime otherwise
                if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime:
                    speedup_factor = (sum_overhead_time_stat - sleepTime) / (n_chunk_peeked_stat * harvester_config.monitor.checkInterval)
                    speedup_factor = max(speedup_factor, 0)
                    adjusted_sleepTime = adjusted_sleepTime / (1. + speedup_factor)
                elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0:
                    adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2
                mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format(adjusted_sleepTime))
                # end run with fifo
                mainLog.debug('ended run with FIFO')
            # time the cycle
            mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(adjusted_sleepTime):
                mainLog.debug('terminated')
                return

    # core of monitor agent to check workers in workSpecsList of queueName
    def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False, config_id=None):
        """Check all workers of one queue and persist the results.

        Returns None when the queue config is missing; otherwise a tuple
        (workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp,
        fifoCheckInterval) for the caller to feed the FIFO.
        """
        tmpQueLog = self.make_logger(_logger, 'id={0} queue={1}'.format(lockedBy, queueName), method_name='run')
        # check queue
        if not self.queueConfigMapper.has_queue(queueName, config_id):
            tmpQueLog.error('config not found')
            return None
        # get queue
        queueConfig = self.queueConfigMapper.get_queue(queueName, config_id)
        try:
            apfmon_status_updates = self.queueConfigMapper.queueConfig[queueName].monitor['apfmon_status_updates']
        except Exception:
            apfmon_status_updates = False
        tmpQueLog.debug('apfmon_status_updates: {0}'.format(apfmon_status_updates))
        # get plugins
        monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
        # workspec chunk of active workers
        workSpecsToEnqueue_dict = {}
        workSpecsToEnqueueToHead_dict = {}
        timeNow_timestamp = time.time()
        # get fifoCheckInterval for PQ and other fifo attributes
        try:
            fifoCheckInterval = monCore.fifoCheckInterval
        except Exception:
            if hasattr(harvester_config.monitor, 'fifoCheckInterval'):
                fifoCheckInterval = harvester_config.monitor.fifoCheckInterval
            else:
                fifoCheckInterval = harvester_config.monitor.checkInterval
        try:
            forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval
        except AttributeError:
            forceEnqueueInterval = 3600
        try:
            fifoMaxPreemptInterval = harvester_config.monitor.fifoMaxPreemptInterval
        except AttributeError:
            fifoMaxPreemptInterval = 60
        # check workers
        allWorkers = [item for sublist in workSpecsList for item in sublist]
        tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
        tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog, from_fifo)
        if tmpStat:
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            # NOTE(review): iWorker is only incremented, never read
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = dict()
                isCheckedList = []
                mapType = workSpecs[0].mapType
                # loop over workSpecs
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format(lockedBy, workSpec.workerID),
                                              method_name='run')
                    # unpack the per-worker result from check_workers
                    tmpOut = tmpRetMap[workSpec.workerID]
                    oldStatus = tmpOut['oldStatus']
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    isChecked = tmpOut['isChecked']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(tmpStr.format(newStatus, monStatus, diagMessage,
                                               workSpec.is_post_processed(), str(filesToStageOut)))
                    iWorker += 1
                    # check status
                    # NOTE(review): an unknown status aborts the whole method (bare return)
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        return
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    workSpec.set_dialog_message(diagMessage)
                    if isChecked:
                        workSpec.checkTime = datetime.datetime.utcnow()
                    isCheckedList.append(isChecked)
                    if monStatus == WorkSpec.ST_failed:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_GENERALERROR, diagMessage)
                    elif monStatus == WorkSpec.ST_cancelled:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL, diagMessage)
                    if monStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]:
                        workSpec.set_work_params({'finalMonStatus': monStatus})
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID,
                                                                        None, only_running=True,
                                                                        slim=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList[workSpec.workerID] = filesToStageOut
                    # apfmon status update
                    if apfmon_status_updates and newStatus != oldStatus:
                        tmpQueLog.debug('apfmon_status_updates: {0} newStatus: {1} monStatus: {2} oldStatus: {3} workSpecStatus: {4}'.format(apfmon_status_updates, newStatus, monStatus, oldStatus, workSpec.status))
                        self.apfmon.update_worker(workSpec, monStatus)
                # lock workers for fifo
                if from_fifo:
                    # collect some attributes to be updated when workers are locked
                    worker_id_list = dict()
                    for workSpec, isChecked in zip(workSpecs, isCheckedList):
                        attrs = dict()
                        if isChecked:
                            attrs['checkTime'] = workSpec.checkTime
                            workSpec.force_not_update('checkTime')
                        if workSpec.has_updated_attributes():
                            attrs['lockedBy'] = lockedBy
                            workSpec.lockedBy = lockedBy
                            workSpec.force_not_update('lockedBy')
                        else:
                            attrs['lockedBy'] = None
                        worker_id_list[workSpec.workerID] = attrs
                    temRetLockWorker = self.dbProxy.lock_workers(worker_id_list,
                                                                 harvester_config.monitor.lockInterval)
                    # skip if not locked
                    if not temRetLockWorker:
                        continue
                # update jobs and workers
                if jobSpecs is not None and len(jobSpecs) > 0:
                    tmpQueLog.debug('updating {0} jobs with {1} workers'.format(len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(mapType, jobSpecs, workSpecs,
                                                                  filesToStageOutList, eventsToUpdateList)
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format(lockedBy, workSpec.workerID),
                                                  method_name='run')
                        if from_fifo:
                            tmpLog.info('failed to update the DB. Maybe locked by other thread running with DB')
                        else:
                            if workSpec.status in [WorkSpec.ST_finished, WorkSpec.ST_failed,
                                                   WorkSpec.ST_cancelled, WorkSpec.ST_missed]:
                                tmpLog.info('worker already in final status. Skipped')
                            else:
                                tmpLog.error('failed to update the DB. lockInterval may be too short')
                else:
                    if jobSpecs is not None:
                        for jobSpec in jobSpecs:
                            tmpLog = self.make_logger(_logger, 'id={0} PandaID={1}'.format(lockedBy, jobSpec.PandaID),
                                                      method_name='run')
                            tmpLog.debug('new status={0} subStatus={1} status_in_metadata={2}'.format(
                                jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes()))
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        try:
                            messenger.acknowledge_events_files(workSpec)
                        except Exception:
                            core_utils.dump_error_message(tmpQueLog)
                            tmpQueLog.error('failed to send ACK to workerID={0}'.format(workSpec.workerID))
                # active workers for fifo
                if self.monitor_fifo.enabled and workSpecs:
                    # decide, based on the first worker of the chunk, whether the
                    # chunk goes back into the FIFO (or its head for post-processing)
                    workSpec = workSpecs[0]
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \
                            and workSpec.mapType != WorkSpec.MT_MultiWorkers \
                            and workSpec.workAttributes is not None:
                        timeNow = datetime.datetime.utcnow()
                        timeNow_timestamp = time.time()
                        # get lastCheckAt
                        _bool, lastCheckAt = workSpec.get_work_params('lastCheckAt')
                        try:
                            last_check_period = timeNow_timestamp - lastCheckAt
                        except TypeError:
                            # lastCheckAt is None: force a period beyond the threshold
                            last_check_period = forceEnqueueInterval + 1.0
                        # get lastForceEnqueueAt
                        _bool, lastForceEnqueueAt = workSpec.get_work_params('lastForceEnqueueAt')
                        if not (_bool and lastForceEnqueueAt is not None):
                            lastForceEnqueueAt = 0
                        # notification
                        intolerable_delay = max(forceEnqueueInterval * 2,
                                                harvester_config.monitor.checkInterval * 4)
                        if _bool and lastCheckAt is not None and last_check_period > harvester_config.monitor.checkInterval \
                                and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp:
                            if last_check_period > intolerable_delay:
                                tmpQueLog.error('last check period of workerID={0} is {1} sec, intolerably longer than monitor checkInterval. Will NOT enquque worker by force. Please check why monitor checks worker slowly'.format(workSpec.workerID, last_check_period))
                            else:
                                tmpQueLog.warning('last check period of workerID={0} is {1} sec, longer than monitor checkInterval'.format(workSpec.workerID, last_check_period))
                        # prepartion to enqueue fifo
                        if (from_fifo) \
                            or (not from_fifo
                                and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp
                                and last_check_period > forceEnqueueInterval
                                and last_check_period < intolerable_delay
                                and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval):
                            if not from_fifo:
                                # in DB cycle
                                tmpQueLog.warning('last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force'.format(workSpec.workerID, last_check_period))
                                workSpec.set_work_params({'lastForceEnqueueAt': timeNow_timestamp})
                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                            workSpec.lockedBy = None
                            workSpec.force_update('lockedBy')
                            if monStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]:
                                # for post-processing: prefer the FIFO head within
                                # fifoMaxPreemptInterval of the first preempt attempt
                                _bool, startFifoPreemptAt = workSpec.get_work_params('startFifoPreemptAt')
                                if not _bool or startFifoPreemptAt is None:
                                    startFifoPreemptAt = timeNow_timestamp
                                    workSpec.set_work_params({'startFifoPreemptAt': startFifoPreemptAt})
                                tmpQueLog.debug('workerID={0} , startFifoPreemptAt: {1}'.format(workSpec.workerID, startFifoPreemptAt))
                                if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval:
                                    workSpecsToEnqueueToHead_dict[workSpec.workerID] = workSpecs
                                else:
                                    workSpec.set_work_params({'startFifoPreemptAt': timeNow_timestamp})
                                    workSpec.modificationTime = timeNow
                                    workSpec.force_update('modificationTime')
                                    workSpecsToEnqueue_dict[workSpec.workerID] = workSpecs
                            else:
                                workSpec.modificationTime = timeNow
                                workSpec.force_update('modificationTime')
                                workSpecsToEnqueue_dict[workSpec.workerID] = workSpecs
        else:
            tmpQueLog.error('failed to check workers')
        workSpecsToEnqueue = list(workSpecsToEnqueue_dict.values())
        workSpecsToEnqueueToHead = list(workSpecsToEnqueueToHead_dict.values())
        retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval
        tmpQueLog.debug('done')
        return retVal

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, from_fifo):
        """Check worker status via the monitor plugin and the messenger.

        Returns (True, retMap) keyed by workerID on success, (False, None)
        on any unexpected exception. Workers carrying 'finalMonStatus' are
        post-processed instead of being sent to the plugin.
        """
        # check timeout value
        try:
            checkTimeout = mon_core.checkTimeout
        except Exception:
            try:
                checkTimeout = harvester_config.monitor.checkTimeout
            except Exception:
                checkTimeout = None
        try:
            workerQueueTimeLimit = harvester_config.monitor.workerQueueTimeLimit
        except AttributeError:
            workerQueueTimeLimit = 172800
        workersToCheck = []
        thingsToPostProcess = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            if workSpec.has_work_params('finalMonStatus'):
                # to post-process
                _bool, finalMonStatus = workSpec.get_work_params('finalMonStatus')
                _thing = (workSpec, (finalMonStatus, ''))
                thingsToPostProcess.append(_thing)
            else:
                # job-level late binding
                if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                    # check if job is requested
                    jobRequested = messenger.job_requested(workSpec)
                    if jobRequested:
                        # set ready when job is requested
                        workStatus = WorkSpec.ST_ready
                    else:
                        workStatus = workSpec.status
                elif workSpec.nJobsToReFill in [0, None]:
                    # check if job is requested to refill free slots
                    jobRequested = messenger.job_requested(workSpec)
                    if jobRequested:
                        nJobsToReFill = jobRequested
                    workersToCheck.append(workSpec)
                else:
                    workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {'oldStatus': workSpec.status,
                                         'newStatus': workStatus,
                                         'monStatus': workStatus,
                                         'workAttributes': workAttributes,
                                         'filesToStageOut': filesToStageOut,
                                         'eventsRequestParams': eventsRequestParams,
                                         'eventsToUpdate': eventsToUpdate,
                                         'diagMessage': '',
                                         'pandaIDs': pandaIDs,
                                         'nJobsToReFill': nJobsToReFill,
                                         'isChecked': True}
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            if workersToCheck:
                tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
                if not tmpStat:
                    tmp_log.error('failed to check workers with: {0}'.format(tmpOut))
                    workersToCheck = []
                    tmpOut = []
                else:
                    tmp_log.debug('checked')
            else:
                tmp_log.debug('Nothing to be checked with plugin')
                tmpOut = []
            timeNow = datetime.datetime.utcnow()
            for workSpec, (newStatus, diagMessage) in itertools.chain(
                    zip(workersToCheck, tmpOut), thingsToPostProcess):
                workerID = workSpec.workerID
                tmp_log.debug('Going to check workerID={0}'.format(workerID))
                pandaIDs = []
                if workerID in retMap:
                    # failed to check status
                    if newStatus is None:
                        tmp_log.warning('Failed to check workerID={0} with {1}'.format(workerID, diagMessage))
                        retMap[workerID]['isChecked'] = False
                        # set status
                        if workSpec.checkTime is not None and checkTimeout is not None and \
                                timeNow - workSpec.checkTime > datetime.timedelta(seconds=checkTimeout):
                            # kill due to timeout
                            tmp_log.debug('kill workerID={0} due to consecutive check failures'.format(workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                            newStatus = WorkSpec.ST_cancelled
                            diagMessage = 'Killed by Harvester due to consecutive worker check failures. ' + diagMessage
                            workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                        else:
                            # use original status
                            newStatus = workSpec.status
                    # request kill
                    if messenger.kill_requested(workSpec):
                        tmp_log.debug('kill workerID={0} as requested'.format(workerID))
                        self.dbProxy.kill_worker(workSpec.workerID)
                    # stuck queuing for too long
                    if workSpec.status == WorkSpec.ST_submitted \
                            and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit):
                        tmp_log.debug('kill workerID={0} due to queuing longer than {1} seconds'.format(
                            workerID, workerQueueTimeLimit))
                        self.dbProxy.kill_worker(workSpec.workerID)
                        diagMessage = 'Killed by Harvester due to worker queuing too long' + diagMessage
                        workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                    # expired heartbeat - only when requested in the configuration
                    try:
                        # check if the queue configuration requires checking for worker heartbeat
                        worker_heartbeat_limit = int(queue_config.messenger['worker_heartbeat'])
                    except (AttributeError, KeyError):
                        worker_heartbeat_limit = None
                    tmp_log.debug('workerID={0} heartbeat limit is configured to {1}'.format(
                        workerID, worker_heartbeat_limit))
                    if worker_heartbeat_limit:
                        if messenger.is_alive(workSpec, worker_heartbeat_limit):
                            tmp_log.debug('heartbeat for workerID={0} is valid'.format(workerID))
                        else:
                            tmp_log.debug('heartbeat for workerID={0} expired: sending kill request'.format(workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                            diagMessage = 'Killed by Harvester due to worker heartbeat expired. ' + diagMessage
                            workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                    # get work attributes
                    workAttributes = messenger.get_work_attributes(workSpec)
                    retMap[workerID]['workAttributes'] = workAttributes
                    # get output files
                    filesToStageOut = messenger.get_files_to_stage_out(workSpec)
                    retMap[workerID]['filesToStageOut'] = filesToStageOut
                    # get events to update
                    if workSpec.eventsRequest in [WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents]:
                        eventsToUpdate = messenger.events_to_update(workSpec)
                        retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                    # request events
                    if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                        eventsRequestParams = messenger.events_requested(workSpec)
                        retMap[workerID]['eventsRequestParams'] = eventsRequestParams
                    # get PandaIDs for pull model
                    if workSpec.mapType == WorkSpec.MT_NoJob:
                        pandaIDs = messenger.get_panda_ids(workSpec)
                        retMap[workerID]['pandaIDs'] = pandaIDs
                    # keep original new status
                    retMap[workerID]['monStatus'] = newStatus
                    # set running or idle while there are events to update or files to stage out
                    if newStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]:
                        if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                len(retMap[workerID]['eventsToUpdate']) > 0:
                            if workSpec.status == WorkSpec.ST_running:
                                newStatus = WorkSpec.ST_running
                            else:
                                newStatus = WorkSpec.ST_idle
                        elif not workSpec.is_post_processed():
                            if not queue_config.is_no_heartbeat_status(newStatus):
                                # post processing unless heartbeat is suppressed
                                jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID,
                                                                                None, True,
                                                                                only_running=True,
                                                                                slim=True)
                                # post processing
                                messenger.post_processing(workSpec, jobSpecs, workSpec.mapType)
                            workSpec.post_processed()
                            if workSpec.status == WorkSpec.ST_running:
                                newStatus = WorkSpec.ST_running
                            else:
                                newStatus = WorkSpec.ST_idle
                        # reset modification time to immediately trigger subsequent lookup
                        if not self.monitor_fifo.enabled:
                            workSpec.trigger_next_lookup()
                    retMap[workerID]['newStatus'] = newStatus
                    retMap[workerID]['diagMessage'] = diagMessage
                else:
                    tmp_log.debug('workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None
class Propagator(AgentBase):
    """Agent that propagates local job/worker state to the central PanDA server.

    Each cycle: pushes job updates (honoring heartbeat suppression), pushes
    worker updates (including log-file uploads), processes received commands,
    periodically reports worker stats and service metrics, and forwards
    dialog messages.
    """

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        # timestamps throttling the periodic bulk-stats / metrics reports
        self._last_stats_update = None
        self._last_metrics_update = None

    # main loop
    def run(self):
        """Run the propagation cycle until terminated."""
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting jobs to propagate')
            sw = core_utils.get_stopwatch()
            jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                          harvester_config.propagator.lockInterval,
                                                          harvester_config.propagator.updateInterval,
                                                          self.get_pid())
            mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time()))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            # cache of computingSite -> statuses with suppressed heartbeat
            hbSuppressMap = dict()
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                       tmpJobSpec.configID)
                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                            not tmpJobSpec.not_suppress_heartbeat():
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            # skipped jobs get a synthetic OK result so zip() below stays aligned
                            jobListToSkip.append(tmpJobSpec)
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                sw.reset()
                retList += self.communicator.check_jobs(jobListToCheck)
                mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time()))
                sw.reset()
                retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid())
                mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate),
                                                                         sw.get_elapsed_time()))
                # logging
                # NOTE: retList order matches jobListToSkip + jobListToCheck + jobListToUpdate
                for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                  tmpJobSpec.status))
                        else:
                            mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                        tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                            tmpJobSpec.modificationTime = datetime.datetime.utcnow()
                        elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done():
                            # trigger next propagation to update remaining events
                            tmpJobSpec.trigger_propagation()
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                                if tmpEvStat:
                                    if tmpEvRet is not None:
                                        tmpJobSpec.nRemainingEvents = tmpEvRet
                                    if tmpEvRet == 0:
                                        mainLog.debug('kill PandaID={0} due to no event'.format(
                                            tmpJobSpec.PandaID))
                                        tmpRet['command'] = 'tobekilled'
                        # got kill command
                        if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                            nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                            if nWorkers == 0:
                                # no workers; mark the job killed directly
                                tmpJobSpec.status = 'cancelled'
                                tmpJobSpec.subStatus = 'killed'
                                tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                           PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL])
                                tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                tmpJobSpec.trigger_propagation()
                        self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()})
                    else:
                        mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                       tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            sw.reset()
            workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                              harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time()))
            # update workers in central database
            sw.reset()
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                   tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                    logOffset, logSize)
                                    if tmpStat:
                                        tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error('failed to update workerID={0} status={1}'.format(
                                tmpWorkSpec.workerID, tmpWorkSpec.status))
            mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers, sw.get_elapsed_time()))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                    # get worker stats; site name is encoded after the colon in the command string
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error('failed to get worker stats for {0}'.format(siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                        if tmpRet:
                            mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(
                                siteName, tmpStr))
            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:
                # get active UPS queues. PanDA server needs to know about them and which harvester instance
                # is taking care of them
                active_ups_queues = self.queueConfigMapper.get_active_ups_queues()
                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues)
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name))
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(
                                site_name, tmp_str))
            if not self._last_metrics_update \
                    or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(
                        seconds=METRICS_PERIOD):
                # get latest metrics from DB
                service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update)
                if not service_metrics_list:
                    mainLog.error('failed to get service metrics')
                    self._last_metrics_update = datetime.datetime.utcnow()
                else:
                    tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list)
                    if tmp_ret:
                        mainLog.debug('update of service metrics OK')
                        self._last_metrics_update = datetime.datetime.utcnow()
                    else:
                        mainLog.error('failed to update service metrics err={0}'.format(tmp_str))
            # send dialog messages
            mainLog.debug('getting dialog messages to propagate')
            try:
                maxDialogs = harvester_config.propagator.maxDialogs
            except Exception:
                # config attribute is optional; fall back to a fixed default
                maxDialogs = 50
            diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs,
                                                                 harvester_config.propagator.lockInterval)
            mainLog.debug('got {0} dialogs'.format(len(diagSpecs)))
            if len(diagSpecs) > 0:
                tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs)
                if tmpStat:
                    diagIDs = [diagSpec.diagID for diagSpec in diagSpecs]
                    self.dbProxy.delete_dialog_messages(diagIDs)
                    mainLog.debug('sent {0} dialogs'.format(len(diagSpecs)))
                else:
                    mainLog.error('failed to send dialogs err={0}'.format(tmpStr))
            if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval:
                mainLog.warning('a single cycle was longer than lockInterval. done' +
                                sw_main.get_elapsed_time())
            else:
                mainLog.debug('done' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
class CredManager(AgentBase):
    """Agent that keeps credentials (e.g. grid proxies) valid.

    Plugin cores are collected both from the harvester config (traditional
    attributes and pluginConfigs JSON) and from per-queue config, then each
    core is asked to check and, if needed, renew its credential.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queue_config_mapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # plugin cores
        self.exeCores = []
        self.queue_exe_cores = []
        # get plugin from harvester config
        self.get_cores_from_harvester_config()
        # update plugin cores from queue config
        self.update_cores_from_queue_config()

    # get list
    def get_list(self, data):
        """Return *data* unchanged if it is a list, else wrap it in one."""
        if isinstance(data, list):
            return data
        else:
            return [data]

    # get plugin cores from harvester config
    def get_cores_from_harvester_config(self):
        """Build self.exeCores from harvester_config.credmanager attributes."""
        # get module and class names
        if hasattr(harvester_config.credmanager, 'moduleName'):
            moduleNames = self.get_list(harvester_config.credmanager.moduleName)
        else:
            moduleNames = []
        if hasattr(harvester_config.credmanager, 'className'):
            classNames = self.get_list(harvester_config.credmanager.className)
        else:
            classNames = []
        # file names of original certificates; 'certFile' is the legacy attribute name
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.inCertFile)
        elif hasattr(harvester_config.credmanager, 'certFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        else:
            inCertFiles = []
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        if hasattr(harvester_config.credmanager, 'voms'):
            vomses = self.get_list(harvester_config.credmanager.voms)
        else:
            vomses = []
        # direct and merged plugin configuration in json
        if hasattr(harvester_config.credmanager, 'pluginConfigs'):
            pluginConfigs = harvester_config.credmanager.pluginConfigs
        else:
            pluginConfigs = []
        # from traditional attributes
        # NOTE: zip() truncates to the shortest list, so mismatched config lists
        # silently drop trailing entries
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {}
            pluginPar['module'] = moduleName
            pluginPar['name'] = className
            pluginPar['inCertFile'] = inCertFile
            pluginPar['outCertFile'] = outCertFile
            pluginPar['voms'] = voms
            try:
                exeCore = self.pluginFactory.get_plugin(pluginPar)
                self.exeCores.append(exeCore)
            except Exception:
                _logger.error(
                    'failed to launch credmanager with traditional attributes for {0}'
                    .format(pluginPar))
                core_utils.dump_error_message(_logger)
        # from pluginConfigs
        for pc in pluginConfigs:
            try:
                setup_maps = pc['configs']
                for setup_name, setup_map in setup_maps.items():
                    try:
                        pluginPar = {}
                        pluginPar['module'] = pc['module']
                        pluginPar['name'] = pc['name']
                        pluginPar['setup_name'] = setup_name
                        pluginPar.update(setup_map)
                        exeCore = self.pluginFactory.get_plugin(pluginPar)
                        self.exeCores.append(exeCore)
                    except Exception:
                        _logger.error(
                            'failed to launch credmanager in pluginConfigs for {0}'
                            .format(pluginPar))
                        core_utils.dump_error_message(_logger)
            except Exception:
                _logger.error('failed to parse pluginConfigs {0}'.format(pc))
                core_utils.dump_error_message(_logger)

    # update plugin cores from queue config
    def update_cores_from_queue_config(self):
        """Rebuild self.queue_exe_cores from per-queue 'credmanagers' config,
        expanding ${...} placeholders in string values."""
        self.queue_exe_cores = []
        for queue_name, queue_config in self.queue_config_mapper.get_all_queues().items():
            if queue_config.queueStatus == 'offline' \
                    or not hasattr(queue_config, 'credmanagers') \
                    or not isinstance(queue_config.credmanagers, list):
                continue
            for cm_setup in queue_config.credmanagers:
                try:
                    pluginPar = {}
                    pluginPar['module'] = cm_setup['module']
                    pluginPar['name'] = cm_setup['name']
                    pluginPar['setup_name'] = queue_name
                    for k, v in cm_setup.items():
                        # NOTE(review): 'pass' here does not skip the key, so
                        # 'module'/'name' are re-assigned below (harmlessly, with
                        # the same values) — 'continue' was possibly intended; confirm
                        if k in ('module', 'name'):
                            pass
                        if isinstance(v, str) and '$' in v:
                            # replace placeholders
                            value = v
                            # NOTE(review): non-raw regex string; consider r'\$\{...\}'
                            patts = re.findall('\$\{([a-zA-Z\d_.]+)\}', v)
                            for patt in patts:
                                tmp_ph = '${' + patt + '}'
                                tmp_val = None
                                if patt == 'harvesterID':
                                    tmp_val = harvester_config.master.harvester_id
                                elif patt == 'queueName':
                                    tmp_val = queue_name
                                elif patt.startswith('common.'):
                                    # values from common blocks
                                    attr = patt.replace('common.', '')
                                    if hasattr(queue_config, 'common') and attr in queue_config.common:
                                        tmp_val = queue_config.common[attr]
                                if tmp_val is not None:
                                    value = value.replace(tmp_ph, tmp_val)
                            # fill in
                            pluginPar[k] = value
                        else:
                            # fill in
                            pluginPar[k] = v
                    exe_core = self.pluginFactory.get_plugin(pluginPar)
                    self.queue_exe_cores.append(exe_core)
                except Exception:
                    _logger.error(
                        'failed to launch about queue={0} for {1}'.format(
                            queue_name, pluginPar))
                    core_utils.dump_error_message(_logger)

    # main loop
    def run(self):
        """Periodically refresh queue-derived cores and run one credential pass."""
        while True:
            # update plugin cores from queue config
            self.update_cores_from_queue_config()
            # execute
            self.execute()  # this is the main run
            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime, randomize=False):
                return

    # main
    def execute(self):
        """Check every plugin core's credential and renew invalid ones."""
        # get lock; only one process runs the credential pass at a time
        locked = self.dbProxy.get_process_lock(
            'credmanager', self.get_pid(),
            harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue
            # make logger
            credmanager_name = ''
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile, exeCore.outCertFile)
            mainLog = self.make_logger(_logger, '{0} {1}'.format(
                exeCore.__class__.__name__, credmanager_name), method_name='execute')
            try:
                # check credential
                mainLog.debug('check credential')
                isValid = exeCore.check_credential()
                if isValid:
                    mainLog.debug('valid')
                elif not isValid:
                    # renew it if necessary
                    mainLog.debug('invalid')
                    mainLog.debug('renew credential')
                    tmpStat, tmpOut = exeCore.renew_credential()
                    if not tmpStat:
                        mainLog.error('failed : {0}'.format(tmpOut))
                        continue
            except Exception:
                core_utils.dump_error_message(mainLog)
            mainLog.debug('done')

    # monit main
    def execute_monit(self):
        """Collect credential lifetimes per output cert file for monitoring.

        Returns a dict mapping outCertFile -> lifetime as reported by each core.
        """
        self.update_cores_from_queue_config()
        metrics = {}
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue
            # make logger
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile, exeCore.outCertFile)
            subLog = self.make_logger(_logger, '{0} {1}'.format(
                exeCore.__class__.__name__, credmanager_name), method_name='execute_monit')
            try:
                # check credential
                subLog.debug('check credential lifetime')
                lifetime = exeCore.check_credential_lifetime()
                if lifetime is not None:
                    metrics[exeCore.outCertFile] = lifetime
            except Exception:
                core_utils.dump_error_message(subLog)
            subLog.debug('done')
        return metrics
class WorkerAdjuster(object):
    """Decides how many new workers to submit per queue and resource type,
    applying queue status, throttlers, per-RT and aggregate limits."""

    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # cache of queueName -> throttler plugin (or None)
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queueConfigMapper)
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            # optional global cap; None means unlimited
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        """Return a deep copy of *static_num_workers* with 'nNewWorkers' filled
        per queue/resource type, or None on error.

        static_num_workers: dict queueName -> resource_type -> counters
                            (nQueue/nReady/nRunning/nNewWorkers).
        site_name: used only for logging context.
        """
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name),
                                        method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data
            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data
            # define num of new workers
            for queueName in static_num_workers:
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                workerLimits_dict = self.dbProxy.get_worker_limits(queueName)
                maxWorkers = workerLimits_dict.get('maxWorkers', 0)
                nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0)
                nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT']
                nQueue_total, nReady_total, nRunning_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))
                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline',
                                                                                     'standby',
                                                                                     'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers since queue status = {0}'.format(
                            queueStat[queueName]['status'])
                        continue
                    # protection against not-up-to-date queue config
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers because of missing queueConfig'
                        continue
                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler
                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__,
                                                                           tmpMsg)
                            tmpLog.debug(retMsg)
                            continue
                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    # 'ANY' is the aggregate pseudo-type; exclude it from totals
                    if resource_type != 'ANY':
                        nQueue_total += nQueue
                        nReady_total += nReady
                        nRunning_total += nRunning
                    if queueConfig.runMode == 'slave':
                        # in slave mode PanDA dictates the number of new workers
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None
                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimitPerRT > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(
                            nQueue, nQueueLimitPerRT)
                        tmpLog.debug(retMsg)
                        pass
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                            nQueue, nReady, nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                        pass
                    else:
                        maxQueuedWorkers = None
                        if nQueueLimitPerRT > 0:
                            # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimitPerRT
                        # Reset the maxQueueWorkers according to particular
                        if nNewWorkersDef is not None:
                            # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave
                        elif queueConfig.mapType == 'NoJob':
                            # for pull mode, limit to activated jobs
                            # limit the queue to the number of activated jobs to avoid empty pilots
                            try:
                                n_activated = max(job_stats[queueName]['activated'], 1)  # avoid no activity queues
                                queue_limit = maxQueuedWorkers
                                maxQueuedWorkers = min(n_activated, maxQueuedWorkers)
                                tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'.
                                             format(n_activated, queue_limit))
                            except KeyError:
                                tmpLog.warning('n_activated not defined, defaulting to configured queue limits')
                                pass
                        if maxQueuedWorkers is None:
                            # no value found, use default value
                            maxQueuedWorkers = 1
                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'
                                     .format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'
                                         .format(nNewWorkers))
                        if queueConfig.maxNewWorkersPerCycle > 0:
                            nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                            tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'
                                         .format(nNewWorkers))
                        if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                            nNewWorkers = min(nNewWorkers, self.maxNewWorkers)
                            tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers'
                                         .format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
                if queueConfig is None:
                    maxNewWorkersPerCycle = 0
                    retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig'
                    tmpLog.debug(retMsg)
                else:
                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
                if len(dyn_num_workers[queueName]) > 1:
                    total_new_workers_rts = sum(
                        dyn_num_workers[queueName][_rt]['nNewWorkers']
                        if _rt != 'ANY' else 0
                        for _rt in dyn_num_workers[queueName]
                    )
                    nNewWorkers_max_agg = min(
                        max(nQueueLimit - nQueue_total, 0),
                        max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0),
                    )
                    if maxNewWorkersPerCycle >= 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > nNewWorkers_max_agg:
                        if nNewWorkers_max_agg == 0:
                            for resource_type in dyn_num_workers[queueName]:
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
                        else:
                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
                            # scale each RT proportionally (largest-remainder method):
                            # first floor-divide, then hand out the remaining slots to
                            # the RTs with the largest remainders
                            _d = dyn_num_workers[queueName].copy()
                            del _d['ANY']
                            simple_rt_nw_list = [
                                [_rt, _d[_rt].get('nNewWorkers', 0), 0]
                                for _rt in _d
                            ]
                            _countdown = nNewWorkers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                resource_type, nNewWorkers_orig, _r = _rt_list
                                nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg,
                                                                total_new_workers_rts)
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                                _rt_list[2] = remainder
                                _countdown -= nNewWorkers
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                    for resource_type in dyn_num_workers[queueName]:
                        if resource_type == 'ANY':
                            continue
                        nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
                        tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'
                                     .format(nNewWorkers, resource_type))
                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queueName])
                self.apf_mon.update_label(queueName, apf_msg, apf_data)
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            errMsg = core_utils.dump_error_message(tmpLog)
            return None
os.remove(harvester_config.db.database_filename) except Exception: pass for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) queueConfigMapper = QueueConfigMapper() proxy = DBProxy() proxy.make_tables(queueConfigMapper) job = JobSpec() job.PandaID = 1 job.modificationTime = datetime.datetime.now() proxy.insert_jobs([job]) newJob = proxy.get_job(1) a = CommunicatorPool() a.get_jobs('siteName', 'nodeName', 'prodSourceLabel', 'computingElement', 1, {})
class CommandManager(AgentBase):
    """Agent that pulls commands from the PanDA server, caches them in the
    internal DB, and acknowledges processed ones."""

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self.nodeName = socket.gethostname()
        # timestamp of last is_alive() heartbeat; None until first sent
        self.lastHeartbeat = None

    # set single mode
    def set_single_mode(self, single_mode):
        self.singleMode = single_mode

    def convert_to_command_specs(self, commands):
        """
        Generates a list of CommandSpec objects
        """
        command_specs = []
        for command in commands:
            command_spec = CommandSpec()
            command_spec.convert_command_json(command)
            # resolve receiver from the command prefix; commands without a
            # known receiver are dropped
            for comStr, receiver in iteritems(CommandSpec.receiver_map):
                if command_spec.command.startswith(comStr):
                    command_spec.receiver = receiver
                    break
            if command_spec.receiver is not None:
                command_specs.append(command_spec)
        return command_specs

    def run(self):
        """
        main
        """
        main_log = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
        bulk_size = harvester_config.commandmanager.commands_bulk_size
        locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                harvester_config.commandmanager.sleepTime)
        if locked:
            # send command list to be received
            siteNames = set()
            commandList = []
            for queueName, queueConfig in iteritems(self.queueConfigMapper.get_active_queues()):
                if queueConfig is None or queueConfig.runMode != 'slave':
                    continue
                # one command for all queues in one site
                if queueConfig.siteName not in siteNames:
                    commandItem = {'command': CommandSpec.COM_reportWorkerStats,
                                   'computingSite': queueConfig.siteName,
                                   'resourceType': queueConfig.resourceType
                                   }
                    commandList.append(commandItem)
                siteNames.add(queueConfig.siteName)
                # one command for each queue
                commandItem = {'command': CommandSpec.COM_setNWorkers,
                               'computingSite': queueConfig.siteName,
                               'resourceType': queueConfig.resourceType
                               }
                commandList.append(commandItem)
            data = {'startTime': datetime.datetime.utcnow(),
                    'sw_version': panda_pkg_info.release_version,
                    'commit_stamp': commit_timestamp.timestamp}
            if len(commandList) > 0:
                main_log.debug('sending command list to receive')
                data['commands'] = commandList
            self.communicator.is_alive(data)

        # main loop
        while True:
            # get lock
            locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                    harvester_config.commandmanager.sleepTime)
            if locked or self.singleMode:
                main_log.debug('polling commands loop')
                # send heartbeat at most every 10 minutes
                if self.lastHeartbeat is None \
                        or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    self.lastHeartbeat = datetime.datetime.utcnow()
                    self.communicator.is_alive({})
                continuous_loop = True
                # as long as there are commands, retrieve them
                while continuous_loop:
                    # get commands from panda server for this harvester instance
                    commands = self.communicator.get_commands(bulk_size)
                    main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size))
                    command_specs = self.convert_to_command_specs(commands)
                    # cache commands in internal DB
                    self.db_proxy.store_commands(command_specs)
                    main_log.debug('cached {0} commands in internal DB'.format(len(command_specs)))
                    # retrieve processed commands from harvester cache
                    command_ids_ack = self.db_proxy.get_commands_ack()
                    for shard in core_utils.create_shards(command_ids_ack, bulk_size):
                        # post acknowledgements to panda server
                        self.communicator.ack_commands(shard)
                        main_log.debug('acknowledged {0} commands to panda server'.format(len(shard)))
                        # clean acknowledged commands
                        self.db_proxy.clean_commands_by_id(shard)
                    # clean commands that have been processed and do not need acknowledgement
                    self.db_proxy.clean_processed_commands()
                    # if we didn't collect the full bulk, give panda server a break
                    if len(commands) < bulk_size:
                        continuous_loop = False
            # check if being terminated
            if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False):
                main_log.debug('terminated')
                return
class EventFeeder(AgentBase):
    """Agent that fetches event ranges from the PanDA server and feeds them to
    workers through the queue's messenger plugin, with per-worker locking."""

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Feed events to workers until terminated."""
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(
                harvester_config.eventfeeder.maxWorkers,
                harvester_config.eventfeeder.lockInterval,
                lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # optional flag passed through to get_event_ranges
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams,
                                                                         scattered)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again (the remote call above may have taken long)
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(
                            len(eventList), pandaID, nRanges))
                        # disable multi workers when fewer events than requested came back
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
def __init__(self, queue_config_mapper, single_mode=False):
    """Initialize the agent with DB access, queue configuration, and plugins.

    :param queue_config_mapper: mapper providing per-queue configuration
    :param single_mode: when True the agent runs a single cycle (passed to AgentBase)
    """
    # initialize the base agent first
    AgentBase.__init__(self, single_mode)
    # keep the queue configuration source
    self.queueConfigMapper = queue_config_mapper
    # factory for per-queue plugins
    self.pluginFactory = PluginFactory()
    # handle to the local harvester database
    self.dbProxy = DBProxy()
# Debug helper: mirror harvester loggers to stdout, then dump the job/file tables.
for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    # only touch harvester's own loggers
    if loggerName.startswith('panda.log'):
        # NOTE(review): loggerDict may hold PlaceHolder objects without handlers
        # for intermediate names -- the handlers check below assumes real loggers
        if len(loggerObj.handlers) == 0:
            continue
        # skip db_proxy: too chatty for interactive debugging
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        # reuse the file handler's formatter so console output matches the log files
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)
pp = pprint.PrettyPrinter(indent=4)
queueConfigMapper = QueueConfigMapper()
proxy = DBProxy()
# fetch all rows of the job table
sqlJ = "SELECT * FROM job_table"
resultsJobcur = proxy.execute(sqlJ)
resultsJob = resultsJobcur.fetchall()
proxy.commit()
# fetch all rows of the file table
sqlF = "SELECT * FROM file_table"
resultsFilescur = proxy.execute(sqlF)
resultsFiles = resultsFilescur.fetchall()
proxy.commit()
# bug fix: original used Python 2 print statements ("print x"), which are
# syntax errors under Python 3; converted to print() calls
# NOTE(review): assumes job_table is non-empty -- resultsJob[0] raises IndexError otherwise
print("job_table - ")
print(resultsJob[0].keys())
class Sweeper(AgentBase):
    """Agent that kills workers, cleans up terminated workers, and purges old jobs.

    Each cycle has three stages: (1) kill workers flagged for killing,
    (2) clean up workers whose retention timeout expired (sweep + messenger
    cleanup + DB deletion), (3) delete old and orphaned job records.
    """

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main agent loop; exits only when the agent is terminated."""
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                             harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
            # loop over all workers, grouped by queue and configID
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method first; plugins without kill_workers raise AttributeError
                        tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                                tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method succeeded: report per-worker results
                        n_killed = 0
                        for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                                workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                    mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed; defaults (hours, presumably -- confirm against config docs) when unset
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup: retention timeout per terminal status
            statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                                'failed': harvester_config.sweeper.keepFailed,
                                'cancelled': harvester_config.sweeper.keepCancelled,
                                'missed': keepMissed,
                                'pending': keepPending
                                }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                     statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(queueName, configID):
                        mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated (kill is best-effort here)
                    mainLog.debug('making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug('made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker (plugin-side resource cleanup)
                            tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                            tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                            tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmpStat, mc_tmpOut))
                            # delete the DB record only if the sweep itself succeeded
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            # keep jobs slightly longer than the longest worker retention
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
class WorkerMaker(object):
    """Facade over worker-maker plugins: creates workers for job chunks and
    delegates sizing/limit queries to the per-queue plugin."""

    # constructor
    def __init__(self):
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()

    # get plugin
    def get_plugin(self, queue_config):
        """Return the worker-maker plugin configured for the queue."""
        return self.pluginFactory.get_plugin(queue_config.workerMaker)

    # make workers
    def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
        """Make one worker per job chunk.

        The first n_ready chunks are matched to pre-existing 'ready' workers
        from the DB; the rest get freshly made workers from the plugin.
        Returns (okChunks, ngChunks): (workSpec, jobChunk) pairs that got a
        worker, and chunks that failed. On any exception returns ([], jobchunk_list).
        """
        tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type),
                                        method_name='make_workers')
        tmpLog.debug('start')
        try:
            # get plugin
            if maker is None:
                maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
            if maker is None:
                # not found
                tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
                return [], jobchunk_list
            # get ready workers
            readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
            # loop over all chunks
            okChunks = []
            ngChunks = []
            for iChunk, jobChunk in enumerate(jobchunk_list):
                # make a worker
                if iChunk >= n_ready:
                    workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
                else:
                    # use ready worker
                    if iChunk < len(readyWorkers):
                        workSpec = readyWorkers[iChunk]
                    else:
                        workSpec = None
                # failed
                if workSpec is None:
                    ngChunks.append(jobChunk)
                    continue
                # set workerID only for freshly made workers (ready ones already have one)
                if workSpec.workerID is None:
                    workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                    workSpec.configID = queue_config.configID
                    workSpec.isNew = True
                okChunks.append((workSpec, jobChunk))
            # dump
            tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks), len(ngChunks)))
            return okChunks, ngChunks
        except Exception:
            # dump error
            core_utils.dump_error_message(tmpLog)
            return [], jobchunk_list

    # get number of jobs per worker
    def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None):
        """Delegate to the plugin; queue_config/resource_type select the plugin only."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_jobs_per_worker(n_workers)

    # get number of workers per job
    def get_num_workers_per_job(self, queue_config, n_workers, resource_type, maker=None):
        """Delegate to the plugin; queue_config/resource_type select the plugin only."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_num_workers_per_job(n_workers)

    # check number of ready resources
    def num_ready_resources(self, queue_config, resource_type, maker=None):
        """Delegate the ready-resource count to the plugin."""
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.num_ready_resources()

    # get upper limit on the cumulative total of workers per job
    def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_in_total()

    # get upper limit on the number of new workers per job in a cycle
    def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None):
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        return maker.get_max_workers_per_job_per_cycle()
def __init__(self, queue_config_mapper):
    """Initialize with queue configuration access and an empty throttler cache.

    :param queue_config_mapper: mapper providing per-queue configuration
    """
    # per-queue throttler instances, filled lazily
    self.throttlerMap = {}
    # handle to the local harvester database
    self.dbProxy = DBProxy()
    # factory for per-queue plugins
    self.pluginFactory = PluginFactory()
    # keep the queue configuration source
    self.queueConfigMapper = queue_config_mapper
class SimpleThrottler(PluginBase):
    """Throttler plugin deciding whether submission to a queue should pause,
    based on counts of recently missed workers at site/queue/CE level.

    Rules come from the plugin config attribute ``rulesForMissed``
    (each with 'level', 'timeWindow' in minutes, and 'maxMissed').
    """

    # constructor
    def __init__(self, **kwarg):
        # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        self.dbProxy = DBProxy()

    # check if to be throttled
    def to_be_throttled(self, queue_config):
        """Return (throttled_flag, reason_string) for the queue."""
        tmpLog = self.make_logger(baseLogger, 'computingSite={0}'.format(queue_config.queueName),
                                  method_name='to_be_throttled')
        tmpLog.debug('start')
        # set default return value (OR: not throttled until a rule fires; AND: throttled until one fails)
        if self.logicType == 'OR':
            retVal = False, "no rule was satisfied"
        else:
            retVal = True, "all rules were satisfied"
        # loop over all rules, converting each to DB query criteria
        criteriaList = []
        maxMissedList = []
        timeNow = datetime.datetime.utcnow()
        for rule in self.rulesForMissed:
            # convert rule to criteria
            if rule['level'] == 'site':
                criteria = dict()
                criteria['siteName'] = queue_config.siteName
                criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'pq':
                criteria = dict()
                criteria['computingSite'] = queue_config.queueName
                criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'ce':
                # one criteria entry per computing element of the queue
                elmName = 'computingElements'
                if elmName not in queue_config.submitter:
                    tmpLog.debug('skipped since {0} is undefined in submitter config'.format(elmName))
                    continue
                for ce in queue_config.submitter[elmName]:
                    criteria = dict()
                    criteria['computingElement'] = ce
                    criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                    criteriaList.append(criteria)
                    maxMissedList.append(rule['maxMissed'])
        # loop over all criteria; short-circuits on the first decisive rule
        for criteria, maxMissed in zip(criteriaList, maxMissedList):
            nMissed = self.dbProxy.get_num_missed_workers(queue_config.queueName, criteria)
            if nMissed > maxMissed:
                if self.logicType == 'OR':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} > maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria))
                    retVal = True, tmpMsg
                    break
            else:
                if self.logicType == 'AND':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} <= maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria))
                    retVal = False, tmpMsg
                    break
        tmpLog.debug('ret={0} : {1}'.format(*retVal))
        return retVal
class JobFetcher(AgentBase):
    """Agent that fetches new jobs from the communicator and inserts them
    into the local DB as JobSpec/FileSpec records."""

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.nodeName = socket.gethostname()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main agent loop; exits only when the agent is terminated."""
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting number of jobs to be fetched')
            # get number of jobs to be fetched per queue
            nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues,
                                                               harvester_config.jobfetcher.lookupTime)
            mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
            # loop over all queues
            for queueName, nJobs in iteritems(nJobsPerQueue):
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    continue
                tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run')
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # upper limit
                if nJobs > harvester_config.jobfetcher.maxJobs:
                    nJobs = harvester_config.jobfetcher.maxJobs
                # get jobs: prodSourceLabel chosen randomly by configured permille weights,
                # falling back to the queue's default label
                default_prodSourceLabel = queueConfig.get_source_label()
                pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {})
                choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel)
                prodSourceLabel = random.choice(choice_list)
                tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(nJobs, prodSourceLabel))
                sw = core_utils.get_stopwatch()
                siteName = queueConfig.siteName
                jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, prodSourceLabel,
                                                          self.nodeName, nJobs, queueConfig.getJobCriteria)
                tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time()))
                # convert to JobSpec
                if len(jobs) > 0:
                    # get extractor plugin (optional; provides auxiliary input files)
                    if hasattr(queueConfig, 'extractor'):
                        extractorCore = self.pluginFactory.get_plugin(queueConfig.extractor)
                    else:
                        extractorCore = None
                    jobSpecs = []
                    # fileStatMap caches per-LFN status counts across jobs in this batch
                    fileStatMap = dict()
                    sw_startconvert = core_utils.get_stopwatch()
                    for job in jobs:
                        timeNow = datetime.datetime.utcnow()
                        jobSpec = JobSpec()
                        jobSpec.convert_job_json(job)
                        jobSpec.computingSite = queueName
                        jobSpec.status = 'starting'
                        jobSpec.subStatus = 'fetched'
                        jobSpec.creationTime = timeNow
                        jobSpec.stateChangeTime = timeNow
                        jobSpec.configID = queueConfig.configID
                        jobSpec.set_one_attribute('schedulerID',
                                                  'harvester-{0}'.format(harvester_config.master.harvester_id))
                        if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                            jobSpec.zipPerMB = queueConfig.zipPerMB
                        # regular input files plus extractor-provided aux inputs
                        fileGroupDictList = [jobSpec.get_input_file_attributes()]
                        if extractorCore is not None:
                            fileGroupDictList.append(extractorCore.get_aux_inputs(jobSpec))
                        for fileGroupDict in fileGroupDictList:
                            for tmpLFN, fileAttrs in iteritems(fileGroupDict):
                                # check file status
                                if tmpLFN not in fileStatMap:
                                    fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, 'input',
                                                                                       queueConfig.ddmEndpointIn,
                                                                                       'starting')
                                # make file spec
                                fileSpec = FileSpec()
                                fileSpec.PandaID = jobSpec.PandaID
                                fileSpec.taskID = jobSpec.taskID
                                fileSpec.lfn = tmpLFN
                                fileSpec.endpoint = queueConfig.ddmEndpointIn
                                fileSpec.scope = fileAttrs['scope']
                                # set preparing to skip stage-in if the file is (being) taken care of by another job
                                if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \
                                        or 'to_prepare' in fileStatMap[tmpLFN]:
                                    fileSpec.status = 'preparing'
                                else:
                                    fileSpec.status = 'to_prepare'
                                if fileSpec.status not in fileStatMap[tmpLFN]:
                                    fileStatMap[tmpLFN][fileSpec.status] = 0
                                fileStatMap[tmpLFN][fileSpec.status] += 1
                                if 'INTERNAL_FileType' in fileAttrs:
                                    fileSpec.fileType = fileAttrs['INTERNAL_FileType']
                                    jobSpec.auxInput = JobSpec.AUX_hasAuxInput
                                else:
                                    fileSpec.fileType = 'input'
                                if 'INTERNAL_URL' in fileAttrs:
                                    fileSpec.url = fileAttrs['INTERNAL_URL']
                                jobSpec.add_in_file(fileSpec)
                        jobSpec.trigger_propagation()
                        jobSpecs.append(jobSpec)
                    # insert to DB
                    tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs), sw_startconvert.get_elapsed_time()))
                    sw_insertdb = core_utils.get_stopwatch()
                    self.dbProxy.insert_jobs(jobSpecs)
                    tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time()))
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.jobfetcher.sleepTime):
                mainLog.debug('terminated')
                return
class EventFeeder(AgentBase):
    """Agent that feeds event ranges to running workers.

    Variant that also passes the worker's access point to the communicator
    when requesting event ranges.
    """

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.communicator = communicator
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main agent loop; exits only when the agent is terminated."""
        # unique lock owner ID for this agent process
        lockedBy = 'eventfeeder-{0}'.format(self.get_pid())
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting workers to feed events')
            workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers,
                                                                        harvester_config.eventfeeder.lockInterval,
                                                                        lockedBy)
            mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
            # loop over all workers
            for queueName, workSpecList in iteritems(workSpecsPerQueue):
                tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run')
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    tmpQueLog.error('config not found')
                    continue
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # optional per-queue flag changing how event ranges are requested
                if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents:
                    scattered = True
                else:
                    scattered = False
                # get plugin
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                # loop over all workers
                for workSpec in workSpecList:
                    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                                    method_name='run')
                    # lock worker again -- NOTE(review): presumably guards against a
                    # concurrent eventfeeder taking the same worker; confirm in DBProxy
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped since locked by another')
                        continue
                    # get events
                    tmpLog.debug('get events')
                    tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams, scattered,
                                                                         workSpec.get_access_point())
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to get events with {0}'.format(events))
                        continue
                    # lock worker again (the remote fetch may have taken longer than the lock interval)
                    lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy)
                    if not lockedFlag:
                        tmpLog.debug('skipped before feeding since locked by another')
                        continue
                    tmpStat = messenger.feed_events(workSpec, events)
                    # failed
                    if tmpStat is False:
                        tmpLog.error('failed to feed events')
                        continue
                    # dump
                    for pandaID, eventList in iteritems(events):
                        try:
                            nRanges = workSpec.eventsRequestParams[pandaID]['nRanges']
                        except Exception:
                            nRanges = None
                        tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(
                            len(eventList), pandaID, nRanges))
                        # disable multi workers when fewer events than requested came back
                        if workSpec.mapType == WorkSpec.MT_MultiWorkers:
                            if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges):
                                tmpStat = self.dbProxy.disable_multi_workers(pandaID)
                                if tmpStat == 1:
                                    tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID)
                                    tmpLog.debug(tmpStr)
                    # update worker: clear the event request so it is not fed again
                    workSpec.eventsRequest = WorkSpec.EV_useEvents
                    workSpec.eventsRequestParams = None
                    workSpec.eventFeedTime = None
                    workSpec.eventFeedLock = None
                    # update local database (criteria ensures we still hold the lock)
                    tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy})
                    tmpLog.debug('done with {0}'.format(tmpStat))
                tmpQueLog.debug('done')
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.eventfeeder.sleepTime):
                mainLog.debug('terminated')
                return
class SimpleThrottler(PluginBase):
    """Throttler plugin deciding whether submission to a queue should pause,
    based on counts of recently missed workers at site/queue/CE level.

    Rules come from the plugin config attribute ``rulesForMissed``
    (each with 'level', 'timeWindow' in minutes, and 'maxMissed').
    """

    # constructor
    def __init__(self, **kwarg):
        # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
        self.logicType = 'OR'
        PluginBase.__init__(self, **kwarg)
        self.dbProxy = DBProxy()

    # check if to be throttled
    def to_be_throttled(self, queue_config):
        """Return (throttled_flag, reason_string) for the queue."""
        tmpLog = self.make_logger(baseLogger, 'computingSite={0}'.format(queue_config.queueName),
                                  method_name='to_be_throttled')
        tmpLog.debug('start')
        # set default return value (OR: not throttled until a rule fires; AND: throttled until one fails)
        if self.logicType == 'OR':
            retVal = False, "no rule was satisfied"
        else:
            retVal = True, "all rules were satisfied"
        # loop over all rules, converting each to DB query criteria
        criteriaList = []
        maxMissedList = []
        timeNow = datetime.datetime.utcnow()
        for rule in self.rulesForMissed:
            # convert rule to criteria
            if rule['level'] == 'site':
                criteria = dict()
                criteria['siteName'] = queue_config.siteName
                criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'pq':
                criteria = dict()
                criteria['computingSite'] = queue_config.queueName
                criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                criteriaList.append(criteria)
                maxMissedList.append(rule['maxMissed'])
            elif rule['level'] == 'ce':
                # one criteria entry per computing element of the queue
                elmName = 'computingElements'
                if elmName not in queue_config.submitter:
                    tmpLog.debug('skipped since {0} is undefined in submitter config'.format(elmName))
                    continue
                for ce in queue_config.submitter[elmName]:
                    criteria = dict()
                    criteria['computingElement'] = ce
                    criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow'])
                    criteriaList.append(criteria)
                    maxMissedList.append(rule['maxMissed'])
        # loop over all criteria; short-circuits on the first decisive rule
        for criteria, maxMissed in zip(criteriaList, maxMissedList):
            nMissed = self.dbProxy.get_num_missed_workers(queue_config.queueName, criteria)
            if nMissed > maxMissed:
                if self.logicType == 'OR':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} > maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria))
                    retVal = True, tmpMsg
                    break
            else:
                if self.logicType == 'AND':
                    tmpMsg = 'logic={0} and '.format(self.logicType)
                    tmpMsg += 'nMissed={0} <= maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria))
                    retVal = False, tmpMsg
                    break
        tmpLog.debug('ret={0} : {1}'.format(*retVal))
        return retVal
class JobFetcher(AgentBase):
    """Agent that fetches new jobs from the communicator and inserts them
    into the local DB (older variant without extractor/aux-input support)."""

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.nodeName = socket.gethostname()
        self.queueConfigMapper = queue_config_mapper

    # main loop
    def run(self):
        """Main agent loop; exits only when the agent is terminated."""
        while True:
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting number of jobs to be fetched')
            # get number of jobs to be fetched per queue
            nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues,
                                                               harvester_config.jobfetcher.lookupTime)
            mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
            # loop over all queues
            for queueName, nJobs in iteritems(nJobsPerQueue):
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    continue
                tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run')
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                # upper limit
                if nJobs > harvester_config.jobfetcher.maxJobs:
                    nJobs = harvester_config.jobfetcher.maxJobs
                # get jobs
                tmpLog.debug('getting {0} jobs'.format(nJobs))
                sw = core_utils.get_stopwatch()
                siteName = queueConfig.siteName
                jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, queueConfig.get_source_label(),
                                                          self.nodeName, nJobs, queueConfig.getJobCriteria)
                tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time()))
                # convert to JobSpec
                if len(jobs) > 0:
                    jobSpecs = []
                    # fileStatMap caches per-LFN status counts across jobs in this batch
                    fileStatMap = dict()
                    sw_startconvert = core_utils.get_stopwatch()
                    for job in jobs:
                        timeNow = datetime.datetime.utcnow()
                        jobSpec = JobSpec()
                        jobSpec.convert_job_json(job)
                        jobSpec.computingSite = queueName
                        jobSpec.status = 'starting'
                        jobSpec.subStatus = 'fetched'
                        jobSpec.creationTime = timeNow
                        jobSpec.stateChangeTime = timeNow
                        jobSpec.configID = queueConfig.configID
                        jobSpec.set_one_attribute('schedulerID',
                                                  'harvester-{0}'.format(harvester_config.master.harvester_id))
                        if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                            jobSpec.zipPerMB = queueConfig.zipPerMB
                        for tmpLFN, fileAttrs in iteritems(jobSpec.get_input_file_attributes()):
                            # check file status
                            if tmpLFN not in fileStatMap:
                                fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, 'input',
                                                                                   queueConfig.ddmEndpointIn,
                                                                                   'starting')
                            # make file spec
                            fileSpec = FileSpec()
                            fileSpec.PandaID = jobSpec.PandaID
                            fileSpec.taskID = jobSpec.taskID
                            fileSpec.lfn = tmpLFN
                            fileSpec.endpoint = queueConfig.ddmEndpointIn
                            fileSpec.scope = fileAttrs['scope']
                            # set preparing to skip stage-in if the file is (being) taken care of by another job
                            if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \
                                    or 'to_prepare' in fileStatMap[tmpLFN]:
                                fileSpec.status = 'preparing'
                            else:
                                fileSpec.status = 'to_prepare'
                            if fileSpec.status not in fileStatMap[tmpLFN]:
                                fileStatMap[tmpLFN][fileSpec.status] = 0
                            fileStatMap[tmpLFN][fileSpec.status] += 1
                            fileSpec.fileType = 'input'
                            jobSpec.add_in_file(fileSpec)
                        jobSpec.trigger_propagation()
                        jobSpecs.append(jobSpec)
                    # insert to DB
                    tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs), sw_startconvert.get_elapsed_time()))
                    sw_insertdb = core_utils.get_stopwatch()
                    self.dbProxy.insert_jobs(jobSpecs)
                    tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time()))
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.jobfetcher.sleepTime):
                mainLog.debug('terminated')
                return
class Stager(AgentBase):
    """Agent that drives output stage-out in three stages per cycle:
    check ongoing transfers, trigger new stage-out, and zip output files.
    Each job is re-locked in the DB before any plugin action."""

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        """Main agent loop; exits only when the agent is terminated."""
        lockedBy = 'stager-{0}'.format(self.get_pid())
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('try to get jobs to check')
            # get jobs to check preparation
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
            except Exception:
                maxFilesPerJob = None
            jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck,
                                                              harvester_config.stager.checkInterval,
                                                              harvester_config.stager.lockInterval,
                                                              lockedBy, 'transferring',
                                                              JobSpec.HO_hasTransfer,
                                                              max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
            # loop over all jobs
            for jobSpec in jobsToCheck:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
                try:
                    tmpLog.debug('start checking')
                    # configID is only honored when dynamic plugin change is enabled
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again so no other stager thread works on it
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                    # check result: True=done, False=fatal, None=still transferring
                    if tmpStat is True:
                        # succeeded
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr))
                        # update job: fail all not-yet-finished output files
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # on-going
                        tmpLog.debug('try to check later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to trigger stage-out
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
            except Exception:
                maxFilesPerJob = None
            jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger,
                                                                harvester_config.stager.triggerInterval,
                                                                harvester_config.stager.lockInterval,
                                                                lockedBy, 'to_transfer',
                                                                JobSpec.HO_hasOutput,
                                                                JobSpec.HO_hasZipOutput,
                                                                max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
            # loop over all jobs
            for jobSpec in jobsToTrigger:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
                try:
                    tmpLog.debug('try to trigger stage-out')
                    # configID is only honored when dynamic plugin change is enabled
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again so no other stager thread works on it
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger stage-out
                    tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                    # check result: True=triggered, False=fatal, None=retry later
                    if tmpStat is True:
                        # succeeded
                        jobSpec.all_files_triggered_to_stage_out()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus))
                    elif tmpStat is False:
                        # fatal error
                        tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr))
                        # update job: fail all not-yet-finished output files
                        for fileSpec in jobSpec.outFiles:
                            if fileSpec.status != 'finished':
                                fileSpec.status = 'failed'
                        errStr = 'stage-out failed with {0}'.format(tmpStr)
                        jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED, errStr)
                        jobSpec.trigger_propagation()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy)
                        tmpLog.debug('updated new subStatus={0}'.format(newSubStatus))
                    else:
                        # temporary error
                        tmpLog.debug('try to trigger later since {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            # get jobs to zip output
            try:
                maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
            except Exception:
                maxFilesPerJob = None
            try:
                zipInterval = harvester_config.stager.zipInterval
            except Exception:
                # fall back to the trigger interval when no dedicated zip interval is configured
                zipInterval = harvester_config.stager.triggerInterval
            jobsToZip = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToZip,
                                                            zipInterval,
                                                            harvester_config.stager.lockInterval,
                                                            lockedBy, 'to_transfer',
                                                            JobSpec.HO_hasZipOutput,
                                                            JobSpec.HO_hasOutput,
                                                            max_files_per_job=maxFilesPerJob)
            mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
            # loop over all jobs
            for jobSpec in jobsToZip:
                tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), method_name='run')
                try:
                    tmpLog.debug('try to zip output')
                    # configID is only honored when dynamic plugin change is enabled
                    configID = jobSpec.configID
                    if not core_utils.dynamic_plugin_change():
                        configID = None
                    # get queue
                    if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID):
                        tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID)
                    # get plugin
                    stagerCore = self.pluginFactory.get_plugin(queueConfig.stager)
                    if stagerCore is None:
                        # not found
                        tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite))
                        continue
                    # lock job again so no other stager thread works on it
                    lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                    if not lockedAgain:
                        tmpLog.debug('skip since locked by another thread')
                        continue
                    # trigger preparation
                    tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                    # succeeded
                    if tmpStat is True:
                        # update job
                        jobSpec.all_files_zipped()
                        newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy)
                        tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus))
                    else:
                        # failed
                        tmpLog.debug('failed to zip with {0}'.format(tmpStr))
                except Exception:
                    core_utils.dump_error_message(tmpLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.stager.sleepTime):
                mainLog.debug('terminated')
                return