def __init__(self, queue_config_mapper, single_mode=False):
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    self.startTimestamp = time.time()
    self.monitor_fifo = MonitorFIFO()
    if self.monitor_fifo.enabled:
        self.monitor_event_fifo = MonitorEventFIFO()
    else:
        self.monitor_event_fifo = None
    self.apfmon = Apfmon(self.queueConfigMapper)
    self.eventBasedMonCoreList = []
    if getattr(harvester_config.monitor, 'eventBasedEnable', False):
        for pluginConf in harvester_config.monitor.eventBasedPlugins:
            pluginFactory = PluginFactory()
            self.eventBasedMonCoreList.append(pluginFactory.get_plugin(pluginConf))
class WorkerAdjuster(object): # constructor def __init__(self, queue_config_mapper): self.queue_configMapper = queue_config_mapper self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() self.throttlerMap = dict() self.apf_mon = Apfmon(self.queue_configMapper) try: self.maxNewWorkers = harvester_config.submitter.maxNewWorkers except AttributeError: self.maxNewWorkers = None # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name): tmp_log = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') tmp_log.debug('start') tmp_log.debug('static_num_workers: {0}'.format(static_num_workers)) dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status queue_stat = self.dbProxy.get_cache("panda_queues.json", None) if queue_stat is None: queue_stat = dict() else: queue_stat = queue_stat.data # get job statistics job_stats = self.dbProxy.get_cache("job_statistics.json", None) if job_stats is None: job_stats = dict() else: job_stats = job_stats.data # define num of new workers for queue_name in static_num_workers: # get queue queue_config = self.queue_configMapper.get_queue(queue_name) worker_limits_dict = self.dbProxy.get_worker_limits(queue_name) max_workers = worker_limits_dict.get('maxWorkers', 0) n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0) n_queue_limit_per_rt = worker_limits_dict[ 'nQueueLimitWorkerPerRT'] n_queue_total, n_ready_total, n_running_total = 0, 0, 0 apf_msg = None apf_data = None for job_type, jt_values in iteritems( static_num_workers[queue_name]): for resource_type, tmp_val in iteritems(jt_values): tmp_log.debug( 'Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}' .format(queue_name, job_type, resource_type, tmp_val)) # set 0 to num of new workers when the queue is disabled if queue_name in queue_stat and queue_stat[queue_name][ 'status'] in [ 'offline', 'standby', 'maintenance' ]: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 since status={0}'.format( queue_stat[queue_name]['status']) tmp_log.debug(ret_msg) apf_msg = 'Not submitting workers since queue status = {0}'.format( queue_stat[queue_name]['status']) continue # protection against not-up-to-date queue config if queue_config is None: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 due to missing queue_config' tmp_log.debug(ret_msg) apf_msg = 'Not submitting workers because of missing queue_config' continue # get throttler if queue_name not in self.throttlerMap: if hasattr(queue_config, 'throttler'): throttler = self.pluginFactory.get_plugin( queue_config.throttler) else: throttler = None self.throttlerMap[queue_name] = throttler # check throttler throttler = self.throttlerMap[queue_name] if throttler is not None: to_throttle, tmp_msg = throttler.to_be_throttled( queue_config) if to_throttle: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 by {0}:{1}'.format( throttler.__class__.__name__, tmp_msg) tmp_log.debug(ret_msg) continue # check stats n_queue = tmp_val['nQueue'] n_ready = tmp_val['nReady'] n_running = tmp_val['nRunning'] if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None: n_queue_total += n_queue n_ready_total += n_ready n_running_total += n_running if queue_config.runMode == 'slave': n_new_workers_def = tmp_val['nNewWorkers'] if n_new_workers_def == 0: 
dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 ret_msg = 'set n_new_workers=0 by panda in slave mode' tmp_log.debug(ret_msg) continue else: n_new_workers_def = None # define num of new workers based on static site config n_new_workers = 0 if n_queue >= n_queue_limit_per_rt > 0: # enough queued workers ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format( n_queue, n_queue_limit_per_rt) tmp_log.debug(ret_msg) pass elif (n_queue + n_ready + n_running) >= max_workers > 0: # enough workers in the system ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format( n_queue, n_ready, n_running) ret_msg += '>= max_workers({0})'.format( max_workers) tmp_log.debug(ret_msg) pass else: max_queued_workers = None if n_queue_limit_per_rt > 0: # there is a limit set for the queue max_queued_workers = n_queue_limit_per_rt # Reset the maxQueueWorkers according to particular if n_new_workers_def is not None: # don't surpass limits given centrally maxQueuedWorkers_slave = n_new_workers_def + n_queue if max_queued_workers is not None: max_queued_workers = min( maxQueuedWorkers_slave, max_queued_workers) else: max_queued_workers = maxQueuedWorkers_slave elif queue_config.mapType == 'NoJob': # for pull mode, limit to activated jobs # limit the queue to the number of activated jobs to avoid empty pilots try: n_activated = max( job_stats[queue_name]['activated'], 1) # avoid no activity queues queue_limit = max_queued_workers max_queued_workers = min( n_activated, max_queued_workers) tmp_log.debug( 'limiting max_queued_workers to min(n_activated={0}, queue_limit={1})' .format(n_activated, queue_limit)) except KeyError: tmp_log.warning( 'n_activated not defined, defaulting to configured queue limits' ) pass if max_queued_workers is None: # no value found, use default value max_queued_workers = 1 # new workers n_new_workers = max(max_queued_workers - n_queue, 0) tmp_log.debug( 'setting n_new_workers to {0} in max_queued_workers calculation' .format(n_new_workers)) if max_workers > 0: n_new_workers = min( n_new_workers, max( max_workers - n_queue - n_ready - n_running, 0)) tmp_log.debug( 'setting n_new_workers to {0} to respect max_workers' .format(n_new_workers)) if queue_config.maxNewWorkersPerCycle > 0: n_new_workers = min( n_new_workers, queue_config.maxNewWorkersPerCycle) tmp_log.debug( 'setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle' .format(n_new_workers)) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: n_new_workers = min(n_new_workers, self.maxNewWorkers) tmp_log.debug( 'setting n_new_workers to {0} in order to respect universal maxNewWorkers' .format(n_new_workers)) dyn_num_workers[queue_name][job_type][resource_type][ 'nNewWorkers'] = n_new_workers # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: max_new_workers_per_cycle = 0 ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config' tmp_log.debug(ret_msg) else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle if len(dyn_num_workers[queue_name]) > 1: total_new_workers_rts = 0 for _jt in dyn_num_workers[queue_name]: for _rt in dyn_num_workers[queue_name][_jt]: if _jt != 'ANY' and _rt != 'ANY': total_new_workers_rts = total_new_workers_rts + dyn_num_workers[ queue_name][_jt][_rt]['nNewWorkers'] n_new_workers_max_agg = min( max(n_queue_limit - n_queue_total, 0), max( max_workers - n_queue_total - n_ready_total - 
n_running_total, 0)) if max_new_workers_per_cycle >= 0: n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: n_new_workers_max_agg = min(n_new_workers_max_agg, self.maxNewWorkers) # exceeded max, to adjust if total_new_workers_rts > n_new_workers_max_agg: if n_new_workers_max_agg == 0: for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[ queue_name][job_type]: dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] = 0 tmp_log.debug( 'No n_new_workers since n_new_workers_max_agg=0 for UCORE' ) else: tmp_log.debug( 'n_new_workers_max_agg={0} for UCORE'.format( n_new_workers_max_agg)) _d = dyn_num_workers[queue_name].copy() del _d['ANY'] # TODO: needs to be recalculated simple_rt_nw_list = [] for job_type in _d: # jt: job type for resource_type in _d[ job_type]: # rt: resource type simple_rt_nw_list.append([ (resource_type, job_type), _d[job_type][resource_type].get( 'nNewWorkers', 0), 0 ]) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: (resource_type, job_type), n_new_workers_orig, _r = _rt_list n_new_workers, remainder = divmod( n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) dyn_num_workers[queue_name][ job_type].setdefault( resource_type, { 'nReady': 0, 'nRunning': 0, 'nQueue': 0, 'nNewWorkers': 0 }) dyn_num_workers[queue_name][job_type][ resource_type][ 'nNewWorkers'] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) for ( resource_type, job_type ), n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break dyn_num_workers[queue_name][job_type][ resource_type]['nNewWorkers'] += 1 _countdown -= 1 for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[queue_name][ job_type]: if job_type == 'ANY' or resource_type == 'ANY': continue n_new_workers = dyn_num_workers[queue_name][ job_type][resource_type]['nNewWorkers'] tmp_log.debug( 'setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE' .format(n_new_workers, job_type, resource_type)) if not apf_msg: apf_data = copy.deepcopy(dyn_num_workers[queue_name]) self.apf_mon.update_label(queue_name, apf_msg, apf_data) # dump tmp_log.debug('defined {0}'.format(str(dyn_num_workers))) return dyn_num_workers except Exception: # dump error err_msg = core_utils.dump_error_message(tmp_log) return None
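The UCORE aggregation above rescales the per-resource-type nNewWorkers with divmod and then hands out the leftover units by remainder. Below is a minimal, standalone sketch of that largest-remainder scheme; the function and argument names (redistribute, requests, cap) are illustrative only and not part of the Harvester API.

# Sketch: scale each request down so the total fits a cap, then give the
# leftover units to the entries with the largest remainders.
def redistribute(requests, cap):
    """requests: dict of name -> requested count; cap: total allowed."""
    total = sum(requests.values())
    if total <= cap:
        return dict(requests)
    scaled = {}
    remainders = {}
    leftover = cap
    for name, n_req in requests.items():
        n_scaled, rem = divmod(n_req * cap, total)
        scaled[name] = n_scaled
        remainders[name] = rem
        leftover -= n_scaled
    # distribute the remaining units, largest remainder first
    for name in sorted(requests, key=lambda k: remainders[k], reverse=True):
        if leftover <= 0:
            break
        scaled[name] += 1
        leftover -= 1
    return scaled

# e.g. redistribute({'SCORE': 30, 'MCORE': 10}, 25) -> {'SCORE': 19, 'MCORE': 6}

The real code additionally breaks remainder ties by the original request size and keeps the 'ANY' aggregate entry out of the redistribution.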
class Monitor(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() self.pluginFactory = PluginFactory() self.startTimestamp = time.time() self.monitor_fifo = MonitorFIFO() self.apfmon = Apfmon(self.queueConfigMapper) # main loop def run(self): lockedBy = 'monitor-{0}'.format(self.get_pid()) # init messengers for queueConfig in self.queueConfigMapper.get_all_queues().values(): # just import for module initialization self.pluginFactory.get_plugin(queueConfig.messenger) # main try: fifoSleepTimeMilli = harvester_config.monitor.fifoSleepTimeMilli except AttributeError: fifoSleepTimeMilli = 5000 try: fifoCheckDuration = harvester_config.monitor.fifoCheckDuration except AttributeError: fifoCheckDuration = 30 try: fifoMaxWorkersPerChunk = harvester_config.monitor.fifoMaxWorkersPerChunk except AttributeError: fifoMaxWorkersPerChunk = 500 try: fifoProtectiveDequeue = harvester_config.monitor.fifoProtectiveDequeue except AttributeError: fifoProtectiveDequeue = True last_DB_cycle_timestamp = 0 monitor_fifo = self.monitor_fifo sleepTime = (fifoSleepTimeMilli / 1000.0) \ if monitor_fifo.enabled else harvester_config.monitor.sleepTime adjusted_sleepTime = sleepTime if monitor_fifo.enabled: monitor_fifo.restore() while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('start a monitor cycle') if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \ not (monitor_fifo.enabled and self.singleMode): # run with workers from DB sw_db = core_utils.get_stopwatch() mainLog.debug('starting run with DB') mainLog.debug('getting workers to monitor') workSpecsPerQueue = self.dbProxy.get_workers_to_update( harvester_config.monitor.maxWorkers, harvester_config.monitor.checkInterval, harvester_config.monitor.lockInterval, lockedBy) mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) # loop over all workers for queueName, configIdWorkSpecs in iteritems( workSpecsPerQueue): for configID, workSpecsList in iteritems( configIdWorkSpecs): retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID) if monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: mainLog.debug('putting workers to FIFO') try: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'. 
format(errStr)) if workSpecsToEnqueueToHead: mainLog.debug('putting workers to FIFO head') try: score = fifoCheckInterval - timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) mainLog.info( 'put workers of {0} to FIFO with score {1}' .format(queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}' .format(errStr)) last_DB_cycle_timestamp = time.time() if sw_db.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single DB cycle was longer than lockInterval ' + sw_db.get_elapsed_time()) else: mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time()) mainLog.debug('ended run with DB') elif monitor_fifo.enabled: # run with workers from FIFO sw = core_utils.get_stopwatch() n_loops = 0 n_loops_hit = 0 last_fifo_cycle_timestamp = time.time() to_break = False obj_dequeued_id_list = [] obj_to_enqueue_dict = collections.defaultdict( lambda: [[], 0, 0]) obj_to_enqueue_to_head_dict = collections.defaultdict( lambda: [[], 0, 0]) remaining_obj_to_enqueue_dict = {} remaining_obj_to_enqueue_to_head_dict = {} n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0 while time.time( ) < last_fifo_cycle_timestamp + fifoCheckDuration: sw.reset() n_loops += 1 retVal, overhead_time = monitor_fifo.to_check_workers() if overhead_time is not None: n_chunk_peeked_stat += 1 sum_overhead_time_stat += overhead_time if retVal: # check fifo size fifo_size = monitor_fifo.size() mainLog.debug('FIFO size is {0}'.format(fifo_size)) mainLog.debug('starting run with FIFO') try: obj_gotten = monitor_fifo.get( timeout=1, protective=fifoProtectiveDequeue) except Exception as errStr: mainLog.error( 'failed to get object from FIFO: {0}'.format( errStr)) else: if obj_gotten is not None: sw_fifo = core_utils.get_stopwatch() if fifoProtectiveDequeue: obj_dequeued_id_list.append(obj_gotten.id) queueName, workSpecsList = obj_gotten.item mainLog.debug( 'got a chunk of {0} workers of {1} from FIFO' .format(len(workSpecsList), queueName) + sw.get_elapsed_time()) sw.reset() configID = None for workSpecs in workSpecsList: if configID is None and len(workSpecs) > 0: configID = workSpecs[0].configID for workSpec in workSpecs: if workSpec.pandaid_list is None: _jobspec_list = workSpec.get_jobspec_list( ) if _jobspec_list is not None: workSpec.pandaid_list = [ j.PandaID for j in workSpec. 
get_jobspec_list() ] else: workSpec.pandaid_list = [] workSpec.force_update( 'pandaid_list') retVal = self.monitor_agent_core( lockedBy, queueName, workSpecsList, from_fifo=True, config_id=configID) if retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal try: if len(obj_to_enqueue_dict[queueName] [0]) + len( workSpecsToEnqueue ) <= fifoMaxWorkersPerChunk: obj_to_enqueue_dict[queueName][ 0].extend(workSpecsToEnqueue) obj_to_enqueue_dict[queueName][ 1] = max( obj_to_enqueue_dict[ queueName][1], timeNow_timestamp) obj_to_enqueue_dict[queueName][ 2] = max( obj_to_enqueue_dict[ queueName][2], fifoCheckInterval) else: to_break = True remaining_obj_to_enqueue_dict[ queueName] = [ workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval ] except Exception as errStr: mainLog.error( 'failed to gather workers for FIFO: {0}' .format(errStr)) to_break = True try: if len(obj_to_enqueue_to_head_dict[ queueName][0]) + len( workSpecsToEnqueueToHead ) <= fifoMaxWorkersPerChunk: obj_to_enqueue_to_head_dict[ queueName][0].extend( workSpecsToEnqueueToHead) obj_to_enqueue_to_head_dict[ queueName][1] = max( obj_to_enqueue_to_head_dict[ queueName][1], timeNow_timestamp) obj_to_enqueue_to_head_dict[ queueName][2] = max( obj_to_enqueue_to_head_dict[ queueName][2], fifoCheckInterval) else: to_break = True remaining_obj_to_enqueue_to_head_dict[ queueName] = [ workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval ] except Exception as errStr: mainLog.error( 'failed to gather workers for FIFO head: {0}' .format(errStr)) to_break = True mainLog.debug( 'checked {0} workers from FIFO'.format( len(workSpecsList)) + sw.get_elapsed_time()) else: mainLog.debug( 'monitor_agent_core returned None. Skipped putting to FIFO' ) if sw_fifo.get_elapsed_time_in_sec( ) > harvester_config.monitor.lockInterval: mainLog.warning( 'a single FIFO cycle was longer than lockInterval ' + sw_fifo.get_elapsed_time()) else: mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time()) n_loops_hit += 1 if to_break: break else: mainLog.debug('got nothing in FIFO') else: mainLog.debug( 'workers in FIFO too young to check. 
Skipped') if self.singleMode: break if overhead_time is not None: time.sleep( max(-overhead_time * random.uniform(0.1, 1), adjusted_sleepTime)) else: time.sleep( max(fifoCheckDuration * random.uniform(0.1, 1), adjusted_sleepTime)) mainLog.debug( 'run {0} loops, including {1} FIFO cycles'.format( n_loops, n_loops_hit)) # enqueue to fifo sw.reset() n_chunk_put = 0 mainLog.debug('putting worker chunks to FIFO') for _dct in (obj_to_enqueue_dict, remaining_obj_to_enqueue_dict): for queueName, obj_to_enqueue in iteritems(_dct): try: workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue if workSpecsToEnqueue: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put( (queueName, workSpecsToEnqueue), score) n_chunk_put += 1 mainLog.info( 'put a chunk of {0} workers of {1} to FIFO with score {2}' .format(len(workSpecsToEnqueue), queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO: {0}'.format( errStr)) mainLog.debug('putting worker chunks to FIFO head') for _dct in (obj_to_enqueue_to_head_dict, remaining_obj_to_enqueue_to_head_dict): for queueName, obj_to_enqueue_to_head in iteritems(_dct): try: workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head if workSpecsToEnqueueToHead: score = fifoCheckInterval + timeNow_timestamp - 2**32 monitor_fifo.put( (queueName, workSpecsToEnqueueToHead), score) n_chunk_put += 1 mainLog.info( 'put a chunk of {0} workers of {1} to FIFO with score {2}' .format(len(workSpecsToEnqueueToHead), queueName, score)) except Exception as errStr: mainLog.error( 'failed to put object from FIFO head: {0}'. format(errStr)) # release protective dequeued objects if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0: monitor_fifo.release(ids=obj_dequeued_id_list) mainLog.debug( 'put {0} worker chunks into FIFO'.format(n_chunk_put) + sw.get_elapsed_time()) # adjust adjusted_sleepTime if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime: speedup_factor = (sum_overhead_time_stat - sleepTime) / ( n_chunk_peeked_stat * harvester_config.monitor.checkInterval) speedup_factor = max(speedup_factor, 0) adjusted_sleepTime = adjusted_sleepTime / (1. 
+ speedup_factor) elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0: adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2 mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format( adjusted_sleepTime)) # end run with fifo mainLog.debug('ended run with FIFO') # time the cycle mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(adjusted_sleepTime): mainLog.debug('terminated') return # core of monitor agent to check workers in workSpecsList of queueName def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False, config_id=None): tmpQueLog = self.make_logger(_logger, 'id={0} queue={1}'.format( lockedBy, queueName), method_name='run') # check queue if not self.queueConfigMapper.has_queue(queueName, config_id): tmpQueLog.error('config not found') return None # get queue queueConfig = self.queueConfigMapper.get_queue(queueName, config_id) try: apfmon_status_updates = self.queueConfigMapper.queueConfig[ queueName].monitor['apfmon_status_updates'] except Exception: apfmon_status_updates = False tmpQueLog.debug( 'apfmon_status_updates: {0}'.format(apfmon_status_updates)) # get plugins monCore = self.pluginFactory.get_plugin(queueConfig.monitor) messenger = self.pluginFactory.get_plugin(queueConfig.messenger) # workspec chunk of active workers workSpecsToEnqueue_dict = {} workSpecsToEnqueueToHead_dict = {} timeNow_timestamp = time.time() # get fifoCheckInterval for PQ and other fifo attributes try: fifoCheckInterval = monCore.fifoCheckInterval except Exception: if hasattr(harvester_config.monitor, 'fifoCheckInterval'): fifoCheckInterval = harvester_config.monitor.fifoCheckInterval else: fifoCheckInterval = harvester_config.monitor.checkInterval try: forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval except AttributeError: forceEnqueueInterval = 3600 try: fifoMaxPreemptInterval = harvester_config.monitor.fifoMaxPreemptInterval except AttributeError: fifoMaxPreemptInterval = 60 # check workers allWorkers = [item for sublist in workSpecsList for item in sublist] tmpQueLog.debug('checking {0} workers'.format(len(allWorkers))) tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog, from_fifo) if tmpStat: # loop over all worker chunks tmpQueLog.debug('update jobs and workers') iWorker = 0 for workSpecs in workSpecsList: jobSpecs = None pandaIDsList = [] eventsToUpdateList = [] filesToStageOutList = dict() isCheckedList = [] mapType = workSpecs[0].mapType # loop over workSpecs for workSpec in workSpecs: tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format( lockedBy, workSpec.workerID), method_name='run') tmpOut = tmpRetMap[workSpec.workerID] oldStatus = tmpOut['oldStatus'] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] diagMessage = tmpOut['diagMessage'] workAttributes = tmpOut['workAttributes'] eventsToUpdate = tmpOut['eventsToUpdate'] filesToStageOut = tmpOut['filesToStageOut'] eventsRequestParams = tmpOut['eventsRequestParams'] nJobsToReFill = tmpOut['nJobsToReFill'] pandaIDs = tmpOut['pandaIDs'] isChecked = tmpOut['isChecked'] tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} ' tmpStr += 'postProcessed={3} files={4}' tmpLog.debug( tmpStr.format(newStatus, monStatus, diagMessage, workSpec.is_post_processed(), str(filesToStageOut))) iWorker += 1 # check status if newStatus not in WorkSpec.ST_LIST: tmpLog.error('unknown status={0}'.format(newStatus)) return # update worker workSpec.set_status(newStatus) 
workSpec.set_work_attributes(workAttributes) workSpec.set_dialog_message(diagMessage) if isChecked: workSpec.checkTime = datetime.datetime.utcnow() isCheckedList.append(isChecked) if monStatus == WorkSpec.ST_failed: if not workSpec.has_pilot_error(): workSpec.set_pilot_error( PilotErrors.ERR_GENERALERROR, diagMessage) elif monStatus == WorkSpec.ST_cancelled: if not workSpec.has_pilot_error(): workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL, diagMessage) if monStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: workSpec.set_work_params({'finalMonStatus': monStatus}) # request events if eventsRequestParams != {}: workSpec.eventsRequest = WorkSpec.EV_requestEvents workSpec.eventsRequestParams = eventsRequestParams # jobs to refill if nJobsToReFill is not None: workSpec.nJobsToReFill = nJobsToReFill # get associated jobs for the worker chunk if workSpec.hasJob == 1 and jobSpecs is None: jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, only_running=True, slim=True) # pandaIDs for push pandaIDsList.append(pandaIDs) if len(eventsToUpdate) > 0: eventsToUpdateList.append(eventsToUpdate) if len(filesToStageOut) > 0: filesToStageOutList[ workSpec.workerID] = filesToStageOut # apfmon status update if apfmon_status_updates and newStatus != oldStatus: tmpQueLog.debug( 'apfmon_status_updates: {0} newStatus: {1} monStatus: {2} oldStatus: {3} workSpecStatus: {4}' .format(apfmon_status_updates, newStatus, monStatus, oldStatus, workSpec.status)) self.apfmon.update_worker(workSpec, monStatus) # lock workers for fifo if from_fifo: # collect some attributes to be updated when workers are locked worker_id_list = dict() for workSpec, isChecked in zip(workSpecs, isCheckedList): attrs = dict() if isChecked: attrs['checkTime'] = workSpec.checkTime workSpec.force_not_update('checkTime') if workSpec.has_updated_attributes(): attrs['lockedBy'] = lockedBy workSpec.lockedBy = lockedBy workSpec.force_not_update('lockedBy') else: attrs['lockedBy'] = None worker_id_list[workSpec.workerID] = attrs temRetLockWorker = self.dbProxy.lock_workers( worker_id_list, harvester_config.monitor.lockInterval) # skip if not locked if not temRetLockWorker: continue # update jobs and workers if jobSpecs is not None and len(jobSpecs) > 0: tmpQueLog.debug( 'updating {0} jobs with {1} workers'.format( len(jobSpecs), len(workSpecs))) core_utils.update_job_attributes_with_workers( mapType, jobSpecs, workSpecs, filesToStageOutList, eventsToUpdateList) # update local database tmpRet = self.dbProxy.update_jobs_workers( jobSpecs, workSpecs, lockedBy, pandaIDsList) if not tmpRet: for workSpec in workSpecs: tmpLog = self.make_logger(_logger, 'id={0} workerID={1}'.format( lockedBy, workSpec.workerID), method_name='run') if from_fifo: tmpLog.info( 'failed to update the DB. Maybe locked by other thread running with DB' ) else: if workSpec.status in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled, WorkSpec.ST_missed ]: tmpLog.info( 'worker already in final status. Skipped') else: tmpLog.error( 'failed to update the DB. 
lockInterval may be too short' ) else: if jobSpecs is not None: for jobSpec in jobSpecs: tmpLog = self.make_logger( _logger, 'id={0} PandaID={1}'.format( lockedBy, jobSpec.PandaID), method_name='run') tmpLog.debug( 'new status={0} subStatus={1} status_in_metadata={2}' .format( jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes())) # send ACK to workers for events and files if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0: for workSpec in workSpecs: try: messenger.acknowledge_events_files(workSpec) except Exception: core_utils.dump_error_message(tmpQueLog) tmpQueLog.error( 'failed to send ACK to workerID={0}'.format( workSpec.workerID)) # active workers for fifo if self.monitor_fifo.enabled and workSpecs: workSpec = workSpecs[0] tmpOut = tmpRetMap[workSpec.workerID] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \ and workSpec.mapType != WorkSpec.MT_MultiWorkers \ and workSpec.workAttributes is not None: timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() # get lastCheckAt _bool, lastCheckAt = workSpec.get_work_params( 'lastCheckAt') try: last_check_period = timeNow_timestamp - lastCheckAt except TypeError: last_check_period = forceEnqueueInterval + 1.0 # get lastForceEnqueueAt _bool, lastForceEnqueueAt = workSpec.get_work_params( 'lastForceEnqueueAt') if not (_bool and lastForceEnqueueAt is not None): lastForceEnqueueAt = 0 # notification intolerable_delay = max( forceEnqueueInterval * 2, harvester_config.monitor.checkInterval * 4) if _bool and lastCheckAt is not None and last_check_period > harvester_config.monitor.checkInterval \ and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp: if last_check_period > intolerable_delay: tmpQueLog.error( 'last check period of workerID={0} is {1} sec, intolerably longer than monitor checkInterval. Will NOT enquque worker by force. Please check why monitor checks worker slowly' .format(workSpec.workerID, last_check_period)) else: tmpQueLog.warning( 'last check period of workerID={0} is {1} sec, longer than monitor checkInterval' .format(workSpec.workerID, last_check_period)) # prepartion to enqueue fifo if (from_fifo) \ or (not from_fifo and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp and last_check_period > forceEnqueueInterval and last_check_period < intolerable_delay and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval): if not from_fifo: # in DB cycle tmpQueLog.warning( 'last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force' .format(workSpec.workerID, last_check_period)) workSpec.set_work_params( {'lastForceEnqueueAt': timeNow_timestamp}) workSpec.set_work_params( {'lastCheckAt': timeNow_timestamp}) workSpec.lockedBy = None workSpec.force_update('lockedBy') if monStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: # for post-processing _bool, startFifoPreemptAt = workSpec.get_work_params( 'startFifoPreemptAt') if not _bool or startFifoPreemptAt is None: startFifoPreemptAt = timeNow_timestamp workSpec.set_work_params({ 'startFifoPreemptAt': startFifoPreemptAt }) tmpQueLog.debug( 'workerID={0} , startFifoPreemptAt: {1}'. 
format(workSpec.workerID, startFifoPreemptAt)) if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval: workSpecsToEnqueueToHead_dict[ workSpec.workerID] = workSpecs else: workSpec.set_work_params({ 'startFifoPreemptAt': timeNow_timestamp }) workSpec.modificationTime = timeNow workSpec.force_update('modificationTime') workSpecsToEnqueue_dict[ workSpec.workerID] = workSpecs else: workSpec.modificationTime = timeNow workSpec.force_update('modificationTime') workSpecsToEnqueue_dict[ workSpec.workerID] = workSpecs else: tmpQueLog.error('failed to check workers') workSpecsToEnqueue = list(workSpecsToEnqueue_dict.values()) workSpecsToEnqueueToHead = list(workSpecsToEnqueueToHead_dict.values()) retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval tmpQueLog.debug('done') return retVal # wrapper for checkWorkers def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, from_fifo): # check timeout value try: checkTimeout = mon_core.checkTimeout except Exception: try: checkTimeout = harvester_config.monitor.checkTimeout except Exception: checkTimeout = None try: workerQueueTimeLimit = harvester_config.monitor.workerQueueTimeLimit except AttributeError: workerQueueTimeLimit = 172800 workersToCheck = [] thingsToPostProcess = [] retMap = dict() for workSpec in all_workers: eventsRequestParams = {} eventsToUpdate = [] pandaIDs = [] workStatus = None workAttributes = None filesToStageOut = [] nJobsToReFill = None if workSpec.has_work_params('finalMonStatus'): # to post-process _bool, finalMonStatus = workSpec.get_work_params( 'finalMonStatus') _thing = (workSpec, (finalMonStatus, '')) thingsToPostProcess.append(_thing) else: # job-level late binding if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob: # check if job is requested jobRequested = messenger.job_requested(workSpec) if jobRequested: # set ready when job is requested workStatus = WorkSpec.ST_ready else: workStatus = workSpec.status elif workSpec.nJobsToReFill in [0, None]: # check if job is requested to refill free slots jobRequested = messenger.job_requested(workSpec) if jobRequested: nJobsToReFill = jobRequested workersToCheck.append(workSpec) else: workersToCheck.append(workSpec) # add retMap[workSpec.workerID] = { 'oldStatus': workSpec.status, 'newStatus': workStatus, 'monStatus': workStatus, 'workAttributes': workAttributes, 'filesToStageOut': filesToStageOut, 'eventsRequestParams': eventsRequestParams, 'eventsToUpdate': eventsToUpdate, 'diagMessage': '', 'pandaIDs': pandaIDs, 'nJobsToReFill': nJobsToReFill, 'isChecked': True } # check workers tmp_log.debug('checking workers with plugin') try: if workersToCheck: tmpStat, tmpOut = mon_core.check_workers(workersToCheck) if not tmpStat: tmp_log.error( 'failed to check workers with: {0}'.format(tmpOut)) workersToCheck = [] tmpOut = [] else: tmp_log.debug('checked') else: tmp_log.debug('Nothing to be checked with plugin') tmpOut = [] timeNow = datetime.datetime.utcnow() for workSpec, (newStatus, diagMessage) in itertools.chain( zip(workersToCheck, tmpOut), thingsToPostProcess): workerID = workSpec.workerID tmp_log.debug('Going to check workerID={0}'.format(workerID)) pandaIDs = [] if workerID in retMap: # failed to check status if newStatus is None: tmp_log.warning( 'Failed to check workerID={0} with {1}'.format( workerID, diagMessage)) retMap[workerID]['isChecked'] = False # set status if workSpec.checkTime is not None and checkTimeout is not None and \ timeNow - workSpec.checkTime > 
datetime.timedelta(seconds=checkTimeout): # kill due to timeout tmp_log.debug( 'kill workerID={0} due to consecutive check failures' .format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) newStatus = WorkSpec.ST_cancelled diagMessage = 'Killed by Harvester due to consecutive worker check failures. ' + diagMessage workSpec.set_pilot_error( PilotErrors.ERR_FAILEDBYSERVER, diagMessage) else: # use original status newStatus = workSpec.status # request kill if messenger.kill_requested(workSpec): tmp_log.debug( 'kill workerID={0} as requested'.format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) # stuck queuing for too long if workSpec.status == WorkSpec.ST_submitted \ and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit): tmp_log.debug( 'kill workerID={0} due to queuing longer than {1} seconds' .format(workerID, workerQueueTimeLimit)) self.dbProxy.kill_worker(workSpec.workerID) diagMessage = 'Killed by Harvester due to worker queuing too long' + diagMessage workSpec.set_pilot_error( PilotErrors.ERR_FAILEDBYSERVER, diagMessage) # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat worker_heartbeat_limit = int( queue_config.messenger['worker_heartbeat']) except (AttributeError, KeyError): worker_heartbeat_limit = None tmp_log.debug( 'workerID={0} heartbeat limit is configured to {1}'. format(workerID, worker_heartbeat_limit)) if worker_heartbeat_limit: if messenger.is_alive(workSpec, worker_heartbeat_limit): tmp_log.debug( 'heartbeat for workerID={0} is valid'.format( workerID)) else: tmp_log.debug( 'heartbeat for workerID={0} expired: sending kill request' .format(workerID)) self.dbProxy.kill_worker(workSpec.workerID) diagMessage = 'Killed by Harvester due to worker heartbeat expired. 
' + diagMessage workSpec.set_pilot_error( PilotErrors.ERR_FAILEDBYSERVER, diagMessage) # get work attributes workAttributes = messenger.get_work_attributes(workSpec) retMap[workerID]['workAttributes'] = workAttributes # get output files filesToStageOut = messenger.get_files_to_stage_out( workSpec) retMap[workerID]['filesToStageOut'] = filesToStageOut # get events to update if workSpec.eventsRequest in [ WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents ]: eventsToUpdate = messenger.events_to_update(workSpec) retMap[workerID]['eventsToUpdate'] = eventsToUpdate # request events if workSpec.eventsRequest == WorkSpec.EV_useEvents: eventsRequestParams = messenger.events_requested( workSpec) retMap[workerID][ 'eventsRequestParams'] = eventsRequestParams # get PandaIDs for pull model if workSpec.mapType == WorkSpec.MT_NoJob: pandaIDs = messenger.get_panda_ids(workSpec) retMap[workerID]['pandaIDs'] = pandaIDs # keep original new status retMap[workerID]['monStatus'] = newStatus # set running or idle while there are events to update or files to stage out if newStatus in [ WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled ]: if len(retMap[workerID]['filesToStageOut']) > 0 or \ len(retMap[workerID]['eventsToUpdate']) > 0: if workSpec.status == WorkSpec.ST_running: newStatus = WorkSpec.ST_running else: newStatus = WorkSpec.ST_idle elif not workSpec.is_post_processed(): if not queue_config.is_no_heartbeat_status( newStatus): # post processing unless heartbeat is suppressed jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, True, only_running=True, slim=True) # post processing messenger.post_processing( workSpec, jobSpecs, workSpec.mapType) workSpec.post_processed() if workSpec.status == WorkSpec.ST_running: newStatus = WorkSpec.ST_running else: newStatus = WorkSpec.ST_idle # reset modification time to immediately trigger subsequent lookup if not self.monitor_fifo.enabled: workSpec.trigger_next_lookup() retMap[workerID]['newStatus'] = newStatus retMap[workerID]['diagMessage'] = diagMessage else: tmp_log.debug( 'workerID={0} not in retMap'.format(workerID)) return True, retMap except Exception: core_utils.dump_error_message(tmp_log) return False, None
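The monitor enqueues worker chunks with score = fifoCheckInterval + timeNow_timestamp, and forces preempted chunks to the head by subtracting 2**32 from the score. The toy model below illustrates that convention under the assumption that the FIFO behaves like a priority queue ordered by ascending score; the class and method names here are hypothetical, not the MonitorFIFO implementation.

import heapq
import time

class ScoredFifo:
    """Toy score-ordered queue: lowest score is checked first."""
    def __init__(self):
        self._heap = []

    def put(self, item, score):
        heapq.heappush(self._heap, (score, item))

    def peek_due(self, now=None):
        # return the head item if its score has come due, else None
        now = time.time() if now is None else now
        if self._heap and self._heap[0][0] <= now:
            return self._heap[0][1]
        return None

    def get(self):
        return heapq.heappop(self._heap)[1] if self._heap else None

fifo = ScoredFifo()
fifo.put(('QUEUE_A', ['worker1']), time.time() + 300)          # re-check in ~5 min
fifo.put(('QUEUE_B', ['worker2']), time.time() + 300 - 2**32)  # forced to the head
assert fifo.get()[0] == 'QUEUE_B'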
def start(self):
    # thread list
    thrList = []
    # Credential Manager
    from pandaharvester.harvesterbody.cred_manager import CredManager
    thr = CredManager(single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.execute()
    thr.start()
    thrList.append(thr)
    # Command manager
    from pandaharvester.harvesterbody.command_manager import CommandManager
    thr = CommandManager(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.start()
    thrList.append(thr)
    # Cacher
    from pandaharvester.harvesterbody.cacher import Cacher
    thr = Cacher(self.communicatorPool, single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.execute(force_update=True, skip_lock=True)
    thr.start()
    thrList.append(thr)
    # Watcher
    from pandaharvester.harvesterbody.watcher import Watcher
    thr = Watcher(single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.start()
    thrList.append(thr)
    # Job Fetcher
    from pandaharvester.harvesterbody.job_fetcher import JobFetcher
    nThr = harvester_config.jobfetcher.nThreads
    for iThr in range(nThr):
        thr = JobFetcher(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Propagator
    from pandaharvester.harvesterbody.propagator import Propagator
    nThr = harvester_config.propagator.nThreads
    for iThr in range(nThr):
        thr = Propagator(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Monitor
    from pandaharvester.harvesterbody.monitor import Monitor
    nThr = harvester_config.monitor.nThreads
    for iThr in range(nThr):
        thr = Monitor(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Preparator
    from pandaharvester.harvesterbody.preparator import Preparator
    nThr = harvester_config.preparator.nThreads
    for iThr in range(nThr):
        thr = Preparator(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Submitter
    from pandaharvester.harvesterbody.submitter import Submitter
    nThr = harvester_config.submitter.nThreads
    for iThr in range(nThr):
        thr = Submitter(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Stager
    from pandaharvester.harvesterbody.stager import Stager
    nThr = harvester_config.stager.nThreads
    for iThr in range(nThr):
        thr = Stager(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # EventFeeder
    from pandaharvester.harvesterbody.event_feeder import EventFeeder
    nThr = harvester_config.eventfeeder.nThreads
    for iThr in range(nThr):
        thr = EventFeeder(self.communicatorPool, self.queueConfigMapper,
                          single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Sweeper
    from pandaharvester.harvesterbody.sweeper import Sweeper
    nThr = harvester_config.sweeper.nThreads
    for iThr in range(nThr):
        thr = Sweeper(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Service monitor
    try:
        sm_active = harvester_config.service_monitor.active
    except:
        sm_active = False
    if sm_active:
        from pandaharvester.harvesterbody.service_monitor import ServiceMonitor
        thr = ServiceMonitor(options.pid, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Report itself to APF Mon
    apf_mon = Apfmon(self.queueConfigMapper)
    apf_mon.create_factory()
    apf_mon.create_labels()
    ##################
    # loop on stop event to be interruptable since thr.join blocks signal capture in python 2.7
    while True:
        if self.singleMode or not self.daemonMode:
            break
        self.stopEvent.wait(1)
        if self.stopEvent.is_set():
            break
    ##################
    # join
    if self.daemonMode:
        for thr in thrList:
            thr.join()
class WorkerAdjuster(object):
    # constructor
    def __init__(self, queue_config_mapper):
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        self.throttlerMap = dict()
        self.apf_mon = Apfmon(self.queueConfigMapper)
        try:
            self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
        except AttributeError:
            self.maxNewWorkers = None

    # define number of workers to submit based on various information
    def define_num_workers(self, static_num_workers, site_name):
        tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name),
                                        method_name='define_num_workers')
        tmpLog.debug('start')
        tmpLog.debug('static_num_workers: {0}'.format(static_num_workers))
        dyn_num_workers = copy.deepcopy(static_num_workers)
        try:
            # get queue status
            queueStat = self.dbProxy.get_cache("panda_queues.json", None)
            if queueStat is None:
                queueStat = dict()
            else:
                queueStat = queueStat.data
            # get job statistics
            job_stats = self.dbProxy.get_cache("job_statistics.json", None)
            if job_stats is None:
                job_stats = dict()
            else:
                job_stats = job_stats.data
            # define num of new workers
            for queueName in static_num_workers:
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                workerLimits_dict = self.dbProxy.get_worker_limits(queueName)
                maxWorkers = workerLimits_dict.get('maxWorkers', 0)
                nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0)
                nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT']
                nQueue_total, nReady_total, nRunning_total = 0, 0, 0
                apf_msg = None
                apf_data = None
                for resource_type, tmpVal in iteritems(static_num_workers[queueName]):
                    tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'.
                                 format(queueName, resource_type, tmpVal))
                    # set 0 to num of new workers when the queue is disabled
                    if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby', 'maintenance']:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status'])
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status'])
                        continue
                    # protection against not-up-to-date queue config
                    if queueConfig is None:
                        dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                        retMsg = 'set nNewWorkers=0 due to missing queueConfig'
                        tmpLog.debug(retMsg)
                        apf_msg = 'Not submitting workers because of missing queueConfig'
                        continue
                    # get throttler
                    if queueName not in self.throttlerMap:
                        if hasattr(queueConfig, 'throttler'):
                            throttler = self.pluginFactory.get_plugin(queueConfig.throttler)
                        else:
                            throttler = None
                        self.throttlerMap[queueName] = throttler
                    # check throttler
                    throttler = self.throttlerMap[queueName]
                    if throttler is not None:
                        toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                        if toThrottle:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg)
                            tmpLog.debug(retMsg)
                            continue
                    # check stats
                    nQueue = tmpVal['nQueue']
                    nReady = tmpVal['nReady']
                    nRunning = tmpVal['nRunning']
                    if resource_type != 'ANY':
                        nQueue_total += nQueue
                        nReady_total += nReady
                        nRunning_total += nRunning
                    if queueConfig.runMode == 'slave':
                        nNewWorkersDef = tmpVal['nNewWorkers']
                        if nNewWorkersDef == 0:
                            dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            retMsg = 'set nNewWorkers=0 by panda in slave mode'
                            tmpLog.debug(retMsg)
                            continue
                    else:
                        nNewWorkersDef = None
                    # define num of new workers based on static site config
                    nNewWorkers = 0
                    if nQueue >= nQueueLimitPerRT > 0:
                        # enough queued workers
                        retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT)
                        tmpLog.debug(retMsg)
                        pass
                    elif (nQueue + nReady + nRunning) >= maxWorkers > 0:
                        # enough workers in the system
                        retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue, nReady, nRunning)
                        retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                        tmpLog.debug(retMsg)
                        pass
                    else:
                        maxQueuedWorkers = None
                        if nQueueLimitPerRT > 0:
                            # there is a limit set for the queue
                            maxQueuedWorkers = nQueueLimitPerRT
                        # Reset the maxQueueWorkers according to particular
                        if nNewWorkersDef is not None:
                            # don't surpass limits given centrally
                            maxQueuedWorkers_slave = nNewWorkersDef + nQueue
                            if maxQueuedWorkers is not None:
                                maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers)
                            else:
                                maxQueuedWorkers = maxQueuedWorkers_slave
                        elif queueConfig.mapType == 'NoJob':
                            # for pull mode, limit to activated jobs
                            # limit the queue to the number of activated jobs to avoid empty pilots
                            try:
                                n_activated = max(job_stats[queueName]['activated'], 1)  # avoid no activity queues
                                queue_limit = maxQueuedWorkers
                                maxQueuedWorkers = min(n_activated, maxQueuedWorkers)
                                tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'.
                                             format(n_activated, queue_limit))
                            except KeyError:
                                tmpLog.warning('n_activated not defined, defaulting to configured queue limits')
                                pass
                        if maxQueuedWorkers is None:
                            # no value found, use default value
                            maxQueuedWorkers = 1
                        # new workers
                        nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                        tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation'.format(nNewWorkers))
                        if maxWorkers > 0:
                            nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))
                            tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers'.format(nNewWorkers))
                        if queueConfig.maxNewWorkersPerCycle > 0:
                            nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle)
                            tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle'.format(nNewWorkers))
                        if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                            nNewWorkers = min(nNewWorkers, self.maxNewWorkers)
                            tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers'.format(nNewWorkers))
                    dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers

                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
                if queueConfig is None:
                    maxNewWorkersPerCycle = 0
                    retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig'
                    tmpLog.debug(retMsg)
                else:
                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
                if len(dyn_num_workers[queueName]) > 1:
                    total_new_workers_rts = sum(
                        dyn_num_workers[queueName][_rt]['nNewWorkers'] if _rt != 'ANY' else 0
                        for _rt in dyn_num_workers[queueName])
                    nNewWorkers_max_agg = min(
                        max(nQueueLimit - nQueue_total, 0),
                        max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0))
                    if maxNewWorkersPerCycle >= 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
                    if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
                    # exceeded max, to adjust
                    if total_new_workers_rts > nNewWorkers_max_agg:
                        if nNewWorkers_max_agg == 0:
                            for resource_type in dyn_num_workers[queueName]:
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
                        else:
                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
                            _d = dyn_num_workers[queueName].copy()
                            del _d['ANY']
                            simple_rt_nw_list = [[_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d]
                            _countdown = nNewWorkers_max_agg
                            for _rt_list in simple_rt_nw_list:
                                resource_type, nNewWorkers_orig, _r = _rt_list
                                nNewWorkers, remainder = divmod(nNewWorkers_orig * nNewWorkers_max_agg,
                                                                total_new_workers_rts)
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
                                _rt_list[2] = remainder
                                _countdown -= nNewWorkers
                            _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                            sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
                                if _countdown <= 0:
                                    break
                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
                                _countdown -= 1
                    for resource_type in dyn_num_workers[queueName]:
                        if resource_type == 'ANY':
                            continue
                        nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
                        tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'.
                                     format(nNewWorkers, resource_type))
                if not apf_msg:
                    apf_data = copy.deepcopy(dyn_num_workers[queueName])
                self.apf_mon.update_label(queueName, apf_msg, apf_data)
            # dump
            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
            return dyn_num_workers
        except Exception:
            # dump error
            errMsg = core_utils.dump_error_message(tmpLog)
            return None
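The per-resource-type branch above is essentially a chain of caps applied to a single candidate value. A condensed, illustrative version of that chain is sketched below; the function and argument names are hypothetical (not Harvester API) and a value of 0 means "no limit", mirroring how the configuration attributes are treated above.

# Sketch: compute new workers for one resource type, then clip by each limit in turn.
def cap_new_workers(n_queue, n_ready, n_running, max_queued_workers,
                    max_workers=0, max_per_cycle=0, global_max=0):
    n_new = max(max_queued_workers - n_queue, 0)  # fill the queue up to its limit
    if max_workers > 0:
        n_new = min(n_new, max(max_workers - n_queue - n_ready - n_running, 0))
    if max_per_cycle > 0:
        n_new = min(n_new, max_per_cycle)
    if global_max > 0:
        n_new = min(n_new, global_max)
    return n_new

# e.g. cap_new_workers(n_queue=5, n_ready=0, n_running=40, max_queued_workers=20,
#                      max_workers=50, max_per_cycle=10) -> 5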
def start(self):
    # thread list
    thrList = []
    # Credential Manager
    from pandaharvester.harvesterbody.cred_manager import CredManager
    thr = CredManager(self.queueConfigMapper, single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.execute()
    thr.start()
    thrList.append(thr)
    # Command manager
    from pandaharvester.harvesterbody.command_manager import CommandManager
    thr = CommandManager(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.start()
    thrList.append(thr)
    # Cacher
    from pandaharvester.harvesterbody.cacher import Cacher
    thr = Cacher(self.communicatorPool, single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.execute(force_update=True, skip_lock=True)
    thr.start()
    thrList.append(thr)
    # Watcher
    from pandaharvester.harvesterbody.watcher import Watcher
    thr = Watcher(single_mode=self.singleMode)
    thr.set_stop_event(self.stopEvent)
    thr.start()
    thrList.append(thr)
    # Job Fetcher
    from pandaharvester.harvesterbody.job_fetcher import JobFetcher
    nThr = harvester_config.jobfetcher.nThreads
    for iThr in range(nThr):
        thr = JobFetcher(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Propagator
    from pandaharvester.harvesterbody.propagator import Propagator
    nThr = harvester_config.propagator.nThreads
    for iThr in range(nThr):
        thr = Propagator(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Monitor
    from pandaharvester.harvesterbody.monitor import Monitor
    nThr = harvester_config.monitor.nThreads
    for iThr in range(nThr):
        thr = Monitor(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Preparator
    from pandaharvester.harvesterbody.preparator import Preparator
    nThr = harvester_config.preparator.nThreads
    for iThr in range(nThr):
        thr = Preparator(self.communicatorPool, self.queueConfigMapper,
                         single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Submitter
    from pandaharvester.harvesterbody.submitter import Submitter
    nThr = harvester_config.submitter.nThreads
    for iThr in range(nThr):
        thr = Submitter(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Stager
    from pandaharvester.harvesterbody.stager import Stager
    nThr = harvester_config.stager.nThreads
    for iThr in range(nThr):
        thr = Stager(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # EventFeeder
    from pandaharvester.harvesterbody.event_feeder import EventFeeder
    nThr = harvester_config.eventfeeder.nThreads
    for iThr in range(nThr):
        thr = EventFeeder(self.communicatorPool, self.queueConfigMapper,
                          single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Sweeper
    from pandaharvester.harvesterbody.sweeper import Sweeper
    nThr = harvester_config.sweeper.nThreads
    for iThr in range(nThr):
        thr = Sweeper(self.queueConfigMapper, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Service monitor
    try:
        sm_active = harvester_config.service_monitor.active
    except Exception:
        sm_active = False
    if sm_active:
        from pandaharvester.harvesterbody.service_monitor import ServiceMonitor
        thr = ServiceMonitor(options.pid, single_mode=self.singleMode)
        thr.set_stop_event(self.stopEvent)
        thr.start()
        thrList.append(thr)
    # Report itself to APF Mon
    apf_mon = Apfmon(self.queueConfigMapper)
    apf_mon.create_factory()
    apf_mon.create_labels()
    ##################
    # loop on stop event to be interruptable since thr.join blocks signal capture in python 2.7
    while True:
        if self.singleMode or not self.daemonMode:
            break
        self.stopEvent.wait(1)
        if self.stopEvent.is_set():
            break
    ##################
    # join
    if self.daemonMode:
        for thr in thrList:
            thr.join()
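Every agent in start() is wired the same way: instantiate, attach the shared stop event, start the thread, and keep a handle for the final join. A hypothetical helper capturing that repeated pattern (not part of the Harvester code base) could look like this; it is a sketch of the design, not a proposed API.

# Sketch: spawn n_threads instances of an agent class with the shared stop event.
def launch_agents(agent_cls, n_threads, thr_list, stop_event, *args, **kwargs):
    for _ in range(n_threads):
        thr = agent_cls(*args, **kwargs)
        thr.set_stop_event(stop_event)
        thr.start()
        thr_list.append(thr)

# e.g. launch_agents(Monitor, harvester_config.monitor.nThreads, thrList,
#                    self.stopEvent, self.queueConfigMapper,
#                    single_mode=self.singleMode)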
class Submitter(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.workerMaker = WorkerMaker()
        self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
        self.pluginFactory = PluginFactory()
        self.monitor_fifo = MonitorFIFO()
        self.apfmon = Apfmon(self.queueConfigMapper)

    # main loop
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
                                    harvester_config.submitter.lockInterval)
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
            mainLog.debug('getting queues to submit workers')
            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
                                                                             harvester_config.submitter.lookupTime,
                                                                             harvester_config.submitter.lockInterval,
                                                                             lockedBy, queueLockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))
                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal
                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName)
                if n_workers_per_queue_and_rt is None:
                    mainLog.error('WorkerAdjuster failed to define the number of workers')
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]):
                            tmpLog = self.make_logger(_logger,
                                                      'id={0} queue={1} rtype={2}'.format(lockedBy, queueName,
                                                                                          resource_type),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                                nReady = tmpVal['nReady']
                                # check queue
                                if not self.queueConfigMapper.has_queue(queueName):
                                    tmpLog.error('config not found')
                                    continue
                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug('skipped since no new worker is needed based on current stats')
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(queueName)
                                workerMakerCore = self.workerMaker.get_plugin(queueConfig)
                                # check if resource is ready
                                if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(queueConfig,
                                                                                             resource_type,
                                                                                             workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' % numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore, 'staticWorkers'):
                                            nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' %
                                                         (workerMakerCore.staticWorkers, nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug('no static workers left, skip')
                                                continue
                                            else:
                                                nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers)
                                                tmpLog.debug('staticWorkers: %s, nWorkers: %s' %
                                                             (workerMakerCore.staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug('skip since no resources are ready')
                                            continue
                                    else:
                                        nWorkers = min(nWorkers, numReadyResources)
                                # post action of worker maker
                                if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy, queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig,
                                                                                              nWorkers,
                                                                                              resource_type,
                                                                                              maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig, resource_type, maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, None, nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.checkInterval,
                                        harvester_config.submitter.lockInterval,
                                        lockedBy,
                                        max_workers_per_job_in_total=maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType))
                                    continue
                                tmpLog.debug('got {0} job chunks'.format(len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig,
                                                                                   nReady, resource_type,
                                                                                   maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug('successfully made {0} workers'.format(len(okChunks)))
                                else:
                                    tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks),
                                                                                                     len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (= not good) chunks: worker creation failed for these jobs
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy,
                                                                              'subStatus': 'prepared'})
                                # OK chunks
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [None, 0]:
                                                workSpec.set_jobspec_list(okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill])
                                                # release the jobs that did not fit into the refill
                                                for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                    pandaIDs.add(jobSpec.PandaID)
                                                workSpec.nJobsToReFill = None
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger['accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level()
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error('submitter plugin for {0} not found'.format(queueName))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error('messenger plugin for {0} not found'.format(queueName))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error('failed to send jobs to workerID={0}'.format(
                                                    workSpec.workerID))
                                            else:
                                                tmpLog.debug('sent jobs to workerID={0} with {1}'.format(
                                                    workSpec.workerID, tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info('submitting {0} workers'.format(len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
                                                                                               workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.format(len(workSpecList))
                                                 + sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[iWorker]
                                            jobList = workSpec.get_jobspec_list()
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname()
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID, jobSpec.submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(tmpStr.format(workSpec.workerID,
                                                                                  jobSpec.PandaID,
                                                                                  workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(tmpStr.format(workSpec.workerID,
                                                                                   jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(tmpStr.format(jobSpec.PandaID,
                                                                               workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList
                                             if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        check_delay = min(
                                            getattr(harvester_config.monitor, 'eventBasedCheckInterval',
                                                    harvester_config.monitor.checkInterval),
                                            getattr(harvester_config.monitor, 'fifoCheckInterval',
                                                    harvester_config.monitor.checkInterval))
                                        monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay)
                                        mainLog.debug('put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                # release the site
                self.dbProxy.release_site(siteName, lockedBy)
                if sw_main.get_elapsed_time_in_sec() > queueLockInterval:
                    mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(
                        queueLockInterval) + sw_main.get_elapsed_time())
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName)
            # time the cycle
            mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return
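    # Note on the monitor FIFO enqueue above: newly submitted or running workers are pushed
    # with a score of now + check_delay, where check_delay is the smaller of the optional
    # eventBasedCheckInterval and fifoCheckInterval settings, each falling back to
    # harvester_config.monitor.checkInterval when absent. A rough sketch of the same
    # computation with illustrative literal values (cfg and fifo are hypothetical names,
    # and the numbers are not Harvester defaults):
    #
    #     check_interval = 600                                                     # monitor.checkInterval
    #     event_based = getattr(cfg, 'eventBasedCheckInterval', check_interval)    # e.g. 300
    #     fifo_based = getattr(cfg, 'fifoCheckInterval', check_interval)           # e.g. 900
    #     check_delay = min(event_based, fifo_based)                               # -> 300
    #     fifo.put((queue_name, [[work_spec]]), time.time() + check_delay)
    #
    # i.e. a worker becomes eligible for its first monitor check only after check_delay seconds.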
    # wrapper for submitWorkers to skip ready workers
    def submit_workers(self, submitter_core, workspec_list):
        retList = []
        strList = []
        newSpecList = []
        workersToSubmit = []
        for workSpec in workspec_list:
            if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]:
                newSpecList.append(workSpec)
                retList.append(True)
                strList.append('')
            else:
                workersToSubmit.append(workSpec)
        tmpRetList = submitter_core.submit_workers(workersToSubmit)

        # report the new workers to the APF monitoring
        self.apfmon.create_workers(workersToSubmit)

        for tmpRet, tmpStr in tmpRetList:
            retList.append(tmpRet)
            strList.append(tmpStr)
        newSpecList += workersToSubmit
        return newSpecList, retList, strList
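As the loop above implies, the submitter plugin's submit_workers(workspec_list) is expected to return one (status, diagnostic message) pair per submitted worker, in input order, while workers already in ready or running state are passed through as pre-approved successes. A minimal standalone sketch of that contract; DummyWorkSpec, DummySubmitter and split_and_submit are illustrative stand-ins, not Harvester classes:

class DummyWorkSpec(object):
    def __init__(self, worker_id, status):
        self.workerID = worker_id
        self.status = status

class DummySubmitter(object):
    # returns one (status, diagnostic message) tuple per input worker, in order
    def submit_workers(self, workspec_list):
        return [(True, '') for _ in workspec_list]

def split_and_submit(submitter, workspec_list, skip_statuses=('ready', 'running')):
    ret_list, str_list, new_spec_list, to_submit = [], [], [], []
    for work_spec in workspec_list:
        if work_spec.status in skip_statuses:
            # already handed a payload; report success without resubmitting
            new_spec_list.append(work_spec)
            ret_list.append(True)
            str_list.append('')
        else:
            to_submit.append(work_spec)
    for tmp_ret, tmp_str in submitter.submit_workers(to_submit):
        ret_list.append(tmp_ret)
        str_list.append(tmp_str)
    new_spec_list += to_submit
    return new_spec_list, ret_list, str_list

specs = [DummyWorkSpec(1, 'ready'), DummyWorkSpec(2, 'new')]
print(split_and_submit(DummySubmitter(), specs))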