def do_loop_turn(self): """Receiver daemon main loop :return: None """ # Begin to clean modules self.check_and_del_zombie_modules() # Maybe the arbiter pushed a new configuration... if self.watch_for_new_conf(timeout=0.05): logger.info("I got a new configuration...") # Manage the new configuration self.setup_new_conf() # Maybe external modules raised 'objects' # we should get them _t0 = time.time() self.get_objects_from_from_queues() statsmgr.timer('core.get-objects-from-queues', time.time() - _t0) # Get external commands from the arbiters... _t0 = time.time() self.get_external_commands_from_arbiters() statsmgr.timer('external-commands.got.time', time.time() - _t0) statsmgr.gauge('external-commands.got.count', len(self.unprocessed_external_commands)) _t0 = time.time() self.push_external_commands_to_schedulers() statsmgr.timer('external-commands.pushed.time', time.time() - _t0) # Say to modules it's a new tick :) _t0 = time.time() self.hook_point('tick') statsmgr.timer('hook.tick', time.time() - _t0)
def get_new_broks(self): """Get new broks from our satellites :return: None """ for satellites in [ self.schedulers, self.pollers, self.reactionners, self.receivers ]: for satellite_link in list(satellites.values()): logger.debug("Getting broks from %s", satellite_link) _t0 = time.time() try: tmp_broks = satellite_link.get_broks(self.name) except LinkError: logger.warning( "Daemon %s connection failed, I could not get the broks!", satellite_link) else: if tmp_broks: logger.debug("Got %d Broks from %s in %s", len(tmp_broks), satellite_link.name, time.time() - _t0) statsmgr.gauge( 'get-new-broks-count.%s' % (satellite_link.name), len(tmp_broks)) statsmgr.timer( 'get-new-broks-time.%s' % (satellite_link.name), time.time() - _t0) for brok in tmp_broks: brok.instance_id = satellite_link.instance_id # Add the broks to our global list self.external_broks.extend(tmp_broks)
def get_new_broks(self): """Get new broks from our satellites :return: None """ for satellites in [self.schedulers, self.pollers, self.reactionners, self.receivers]: for satellite_link in list(satellites.values()): logger.debug("Getting broks from %s", satellite_link) _t0 = time.time() try: tmp_broks = satellite_link.get_broks(self.name) except LinkError: logger.warning("Daemon %s connection failed, I could not get the broks!", satellite_link) else: if tmp_broks: logger.debug("Got %d Broks from %s in %s", len(tmp_broks), satellite_link.name, time.time() - _t0) statsmgr.gauge('get-new-broks-count.%s' % (satellite_link.name), len(tmp_broks)) statsmgr.timer('get-new-broks-time.%s' % (satellite_link.name), time.time() - _t0) for brok in tmp_broks: brok.instance_id = satellite_link.instance_id # Add the broks to our global list self.external_broks.extend(tmp_broks)
def get_internal_broks(self):
    """Get all broks from self.internal_broks and append them
    to our broks to manage

    :return: None
    """
    statsmgr.gauge('get-new-broks-count.broker', len(self.internal_broks))
    # Add the broks to our global list
    self.external_broks.extend(self.internal_broks)
    self.internal_broks = []
def get_arbiter_broks(self):
    """Get the broks from the arbiters.

    As the arbiter_broks list can be pushed to by an arbiter without a global lock,
    we must protect this access with a lock.
    TODO: really? check this arbiter behavior!

    :return: None
    """
    with self.arbiter_broks_lock:
        statsmgr.gauge('get-new-broks-count.arbiter', len(self.arbiter_broks))
        # Add the broks to our global list
        self.external_broks.extend(self.arbiter_broks)
        self.arbiter_broks = []
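# get_arbiter_broks() above drains a list that another thread (the HTTP interface receiving
# broks pushed by an arbiter) may be appending to, hence the lock around the swap. Below is
# a standalone sketch of that producer/consumer pattern; the BrokBuffer class and its method
# names are illustrative, not Alignak code.

import threading


class BrokBuffer(object):
    """Illustrative shared buffer: a producer thread appends, the daemon loop drains."""

    def __init__(self):
        self._lock = threading.Lock()
        self._items = []

    def push(self, item):
        # Producer side: called from the thread that receives broks from an arbiter
        with self._lock:
            self._items.append(item)

    def drain(self):
        # Consumer side: take everything at once and reset the list, as in get_arbiter_broks()
        with self._lock:
            items, self._items = self._items, []
        return items


buf = BrokBuffer()
buf.push({'type': 'host_check_result'})
buf.push({'type': 'monitoring_log'})
print(buf.drain())   # both broks
print(buf.drain())   # [] - nothing left until the producer pushes again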
def do_get_new_actions(self):
    """Get new actions from schedulers
    Create a Message and put into the module queue

    REF: doc/alignak-action-queues.png (1)

    :return: None
    """
    # Here are the differences between a poller and a reactionner:
    # Poller will only do checks,
    # Reactionner will do actions (notifications and event handlers)
    do_checks = self.__class__.do_checks
    do_actions = self.__class__.do_actions

    # We check and get the new actions to execute in each of our schedulers
    for scheduler_link_uuid in self.schedulers:
        scheduler_link = self.schedulers[scheduler_link_uuid]

        if not scheduler_link.active:
            logger.warning("My scheduler '%s' is not active currently", scheduler_link.name)
            continue

        logger.debug("get new actions, scheduler: %s", scheduler_link.name)

        # OK, go for it :)
        _t0 = time.time()
        actions = scheduler_link.get_actions({'do_checks': do_checks,
                                              'do_actions': do_actions,
                                              'poller_tags': self.poller_tags,
                                              'reactionner_tags': self.reactionner_tags,
                                              'worker_name': self.name,
                                              'module_types': list(self.q_by_mod.keys())})
        if actions:
            logger.debug("Got %d actions from %s", len(actions), scheduler_link.name)
            # We 'tag' them with my_scheduler and put into queue for workers
            self.add_actions(actions, scheduler_link.instance_id)
            logger.debug("Got %d actions from %s in %s",
                         len(actions), scheduler_link.name, time.time() - _t0)
        statsmgr.gauge('actions.added.count.%s' % (scheduler_link.name), len(actions))
def do_loop_turn(self):
    # pylint: disable=too-many-branches
    """Loop used to:
    * get initial status broks
    * check if modules are alive, if not restart them
    * get broks from ourself, the arbiters and our satellites
    * add broks to the queue of each external module
    * manage broks with each internal module

    If the internal broks management is longer than 0.8 seconds, postpone to the
    next loop turn to avoid overloading the broker daemon.

    :return: None
    """
    if not self.got_initial_broks:
        # Asking initial broks from my schedulers
        my_satellites = self.get_links_of_type(s_type='scheduler')
        for satellite in list(my_satellites.values()):
            logger.info("Asking my initial broks from '%s'", satellite.name)
            _t0 = time.time()
            try:
                my_initial_broks = satellite.get_initial_broks(self.name)
                statsmgr.timer('broks.initial.%s.time' % satellite.name, time.time() - _t0)
                if not my_initial_broks:
                    logger.info("No initial broks were raised, "
                                "my scheduler is not yet ready...")
                    return

                self.got_initial_broks = True
                logger.debug("Got %d initial broks from '%s'",
                             my_initial_broks, satellite.name)
                statsmgr.gauge('broks.initial.%s.count' % satellite.name, my_initial_broks)
            except LinkError as exp:
                logger.warning("Scheduler connection failed, I could not get initial broks!")

    logger.debug("Begin Loop: still some old broks to manage (%d)", len(self.external_broks))
    if self.external_broks:
        statsmgr.gauge('unmanaged.broks', len(self.external_broks))

    # Try to see if one of my module is dead, and restart previously dead modules
    self.check_and_del_zombie_modules()

    # Call modules that manage a starting tick pass
    _t0 = time.time()
    self.hook_point('tick')
    statsmgr.timer('hook.tick', time.time() - _t0)

    # Maybe the last loop we did raised some broks internally
    self.get_internal_broks()

    # Also reap broks sent from the arbiters
    self.get_arbiter_broks()

    # Now get broks from our distant daemons
    self.get_new_broks()

    # Get the list of broks not yet sent to our external modules
    _t0 = time.time()
    broks_to_send = [brok for brok in self.external_broks if getattr(brok, 'to_be_sent', True)]
    statsmgr.gauge('get-new-broks-count.to_send', len(broks_to_send))

    # Send the broks to all external modules to_q queue so they can get the whole packet
    # beware, the sub-process/queue can die/close, so we tag the whole module for restart
    # instead of killing ourselves :)
    for module in self.modules_manager.get_external_instances():
        try:
            _t00 = time.time()
            queue_size = module.to_q.qsize()
            statsmgr.gauge('queues.external.%s.to.size' % module.get_name(), queue_size)
            module.to_q.put(broks_to_send)
            statsmgr.timer('queues.external.%s.to.put' % module.get_name(),
                           time.time() - _t00)
        except Exception as exp:  # pylint: disable=broad-except
            # first we must find the modules
            logger.warning("Module %s queue exception: %s, I'm tagging it to restart later",
                           module.get_name(), str(exp))
            logger.exception(exp)
            self.modules_manager.set_to_restart(module)

    # No more need to send them
    for brok in broks_to_send:
        brok.to_be_sent = False
    logger.debug("Time to send %s broks (%d secs)", len(broks_to_send), time.time() - _t0)

    # Make the internal modules manage the broks
    start = time.time()
    while self.external_broks:
        now = time.time()
        # Do not 'manage' more than 0.8s, we must get new broks almost every second
        if now - start > 0.8:
            logger.info("I have not yet managed all my broks, still %d broks",
                        len(self.external_broks))
            break

        # Get the first brok in the list
        brok = self.external_broks.pop(0)
        if self.modules_manager.get_internal_instances():
            self.manage_brok(brok)
            # Make a very short pause to avoid overloading
            self.make_a_pause(0.01, check_time_change=False)
        else:
            if getattr(brok, 'to_be_sent', False):
                self.external_broks.append(brok)

    # Maybe our external modules raised 'objects', so get them
    if self.get_objects_from_from_queues():
        statsmgr.gauge('external-commands.got.count', len(self.external_commands))
        statsmgr.gauge('broks.got.count', len(self.external_broks))
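# The internal brok management in do_loop_turn() above is capped at roughly 0.8 seconds per
# turn so the broker keeps polling its satellites about once per second. Below is a minimal
# sketch of that time-budget pattern, independent of Alignak; the handle callback and the
# budget value are illustrative.

import time


def process_with_budget(pending, handle, budget=0.8):
    """Process items from the head of `pending` until the time budget is spent.

    Items that were not processed stay in the list for the next loop turn.
    """
    start = time.time()
    processed = 0
    while pending:
        if time.time() - start > budget:
            break
        handle(pending.pop(0))
        processed += 1
    return processed


# Each 'brok' takes some time to manage; leftovers wait for the next turn
pending_broks = list(range(100))
done = process_with_budget(pending_broks, lambda item: time.sleep(0.05))
print("managed %d broks, %d left for the next turn" % (done, len(pending_broks)))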
def do_loop_turn(self):
    # pylint: disable=too-many-branches
    """Satellite main loop::

    * Check and delete zombies actions / modules
    * Get returns from queues
    * Adjust worker number
    * Get new actions

    :return: None
    """
    # Try to see if one of my module is dead, and restart previously dead modules
    self.check_and_del_zombie_modules()

    # Also if some zombie workers exist...
    self.check_and_del_zombie_workers()

    # Call modules that manage a starting tick pass
    self.hook_point('tick')

    # Print stats for debug
    for _, sched in self.schedulers.items():
        for mod in self.q_by_mod:
            # In workers we've got actions sent to queue - queue size
            for (worker_id, queue) in list(self.q_by_mod[mod].items()):
                try:
                    actions_count = queue.qsize()
                    results_count = self.returns_queue.qsize()
                    logger.debug("[%s][%s][%s] actions queued: %d, results queued: %d",
                                 sched.name, mod, worker_id, actions_count, results_count)
                    # Update the statistics
                    statsmgr.gauge('worker.%s.actions-queue-size' % worker_id,
                                   actions_count)
                    statsmgr.gauge('worker.%s.results-queue-size' % worker_id,
                                   results_count)
                except (IOError, EOFError):
                    pass

    # todo: temporarily deactivate all this stuff!
    # Before return or get new actions, see how we managed
    # the former ones: are they still in queue(s)? If so, we
    # must wait more or at least have more workers
    # wait_ratio = self.wait_ratio.get_load()
    # total_q = 0
    # try:
    #     for mod in self.q_by_mod:
    #         for queue in list(self.q_by_mod[mod].values()):
    #             total_q += queue.qsize()
    # except (IOError, EOFError):
    #     pass
    # if total_q != 0 and wait_ratio < 2 * self.worker_polling_interval:
    #     logger.debug("I decide to increase the wait ratio")
    #     self.wait_ratio.update_load(wait_ratio * 2)
    #     # self.wait_ratio.update_load(self.worker_polling_interval)
    # else:
    #     # Go to self.worker_polling_interval on normal run, if wait_ratio
    #     # was >2*self.worker_polling_interval,
    #     # it makes it come near 2 because if < 2, go up :)
    #     self.wait_ratio.update_load(self.worker_polling_interval)
    # wait_ratio = self.wait_ratio.get_load()
    # statsmgr.timer('core.wait-ratio', wait_ratio)
    # if self.log_loop:
    #     logger.debug("[%s] wait ratio: %f", self.name, wait_ratio)

    # Maybe we do not have enough workers, we check for it
    # and launch the new ones if needed
    self.adjust_worker_number_by_load()

    # Manage all messages we've got in the last timeout
    # for queue in self.return_messages:
    try:
        logger.debug("[%s] manage action results: %d results",
                     self.name, self.returns_queue.qsize())
        while self.returns_queue.qsize():
            msg = self.returns_queue.get_nowait()
            if msg is None:
                continue
            logger.debug("Got a message: %s", msg)
            if msg.get_type() == 'Done':
                logger.debug("Got an action result: %s", msg.get_data())
                self.manage_action_return(msg.get_data())
                logger.debug("Managed action result")
            else:
                logger.warning("Ignoring message of type: %s", msg.get_type())
    except Full:
        logger.warning("Returns queue is full")
    except Empty:
        logger.debug("Returns queue is empty")
    except (IOError, EOFError) as exp:
        logger.warning("My returns queue is no more available: %s", str(exp))
    except Exception as exp:  # pylint: disable=broad-except
        logger.error("Failed getting messages in returns queue: %s", str(exp))
        logger.error(traceback.format_exc())

    for _, sched in self.schedulers.items():
        if sched.wait_homerun:
            logger.debug("scheduler home run: %d results", len(sched.wait_homerun))

    # If we are passive, we do not initiate the check getting
    # and return
    if not self.passive:
        try:
            # We send to our schedulers the results of all finished checks
            logger.debug("pushing results...")
            self.push_results()
        except LinkError as exp:
            logger.warning("Scheduler connection failed, I could not send my results!")

        try:
            # And we get the new actions from our schedulers
            logger.debug("getting new actions...")
            self.get_new_actions()
        except LinkError as exp:
            logger.warning("Scheduler connection failed, I could not get new actions!")

    # Get objects from our modules that are not Worker based
    if self.log_loop:
        logger.debug("[%s] get objects from queues", self.name)
    self.get_objects_from_from_queues()
    statsmgr.gauge('external-commands.count', len(self.external_commands))
    statsmgr.gauge('broks.count', len(self.broks))
    statsmgr.gauge('events.count', len(self.events))
def do_loop_turn(self):
    # pylint: disable=too-many-branches
    """Satellite main loop::

    * Check and delete zombies actions / modules
    * Get returns from queues
    * Adjust worker number
    * Get new actions

    :return: None
    """
    # Try to see if one of my module is dead, and restart previously dead modules
    self.check_and_del_zombie_modules()

    # Also if some zombie workers exist...
    self.check_and_del_zombie_workers()

    # Call modules that manage a starting tick pass
    self.hook_point('tick')

    # Print stats for debug
    for _, sched in self.schedulers.items():
        for mod in self.q_by_mod:
            # In workers we've got actions sent to queue - queue size
            for (worker_id, queue) in list(self.q_by_mod[mod].items()):
                try:
                    actions_count = queue.qsize()
                    results_count = self.returns_queue.qsize()
                    logger.debug("[%s][%s][%s] actions queued: %d, results queued: %d",
                                 sched.name, mod, worker_id, actions_count, results_count)
                    # Update the statistics
                    statsmgr.gauge('worker.%s.actions-queue-size' % worker_id,
                                   actions_count)
                    statsmgr.gauge('worker.%s.results-queue-size' % worker_id,
                                   results_count)
                except (IOError, EOFError):
                    pass

    # todo: temporarily deactivate all this stuff!
    # Before return or get new actions, see how we managed
    # the former ones: are they still in queue(s)? If so, we
    # must wait more or at least have more workers
    # wait_ratio = self.wait_ratio.get_load()
    # total_q = 0
    # try:
    #     for mod in self.q_by_mod:
    #         for queue in list(self.q_by_mod[mod].values()):
    #             total_q += queue.qsize()
    # except (IOError, EOFError):
    #     pass
    # if total_q != 0 and wait_ratio < 2 * self.worker_polling_interval:
    #     logger.debug("I decide to increase the wait ratio")
    #     self.wait_ratio.update_load(wait_ratio * 2)
    #     # self.wait_ratio.update_load(self.worker_polling_interval)
    # else:
    #     # Go to self.worker_polling_interval on normal run, if wait_ratio
    #     # was >2*self.worker_polling_interval,
    #     # it makes it come near 2 because if < 2, go up :)
    #     self.wait_ratio.update_load(self.worker_polling_interval)
    # wait_ratio = self.wait_ratio.get_load()
    # statsmgr.timer('core.wait-ratio', wait_ratio)
    # if self.log_loop:
    #     logger.debug("[%s] wait ratio: %f", self.name, wait_ratio)

    # Maybe we do not have enough workers, we check for it
    # and launch the new ones if needed
    self.adjust_worker_number_by_load()

    # Manage all messages we've got in the last timeout
    # for queue in self.return_messages:
    try:
        logger.debug("[%s] manage action results: %d results",
                     self.name, self.returns_queue.qsize())
        while self.returns_queue.qsize():
            msg = self.returns_queue.get_nowait()
            if msg is None:
                continue
            if not isinstance(msg, Message):
                logger.warning("Should have received a Message, got a %s!", type(msg))
                continue
            logger.debug("Got a message: %s", msg)
            if msg.get_type() == 'Done':
                logger.debug("Got (from %s) an action result: %s",
                             msg.get_source(), msg.get_data())
                self.manage_action_return(msg.get_data())
            elif msg.get_type() == 'Stats':
                logger.debug("Got (from %s) stats: %s",
                             msg.get_source(), msg.get_data())
                if msg.get_source() in self.workers:
                    self.workers[msg.get_source()].stats = msg.get_data()
            else:
                logger.warning("Ignoring message of type: %s", msg.get_type())
    except Full:
        logger.warning("Returns queue is full")
    except Empty:
        logger.debug("Returns queue is empty")
    except (IOError, EOFError) as exp:
        logger.warning("My returns queue is no more available: %s", str(exp))
    except Exception as exp:  # pylint: disable=broad-except
        logger.error("Failed getting messages in returns queue: %s", str(exp))
        logger.error(traceback.format_exc())

    for _, sched in self.schedulers.items():
        if sched.wait_homerun:
            logger.debug("scheduler home run: %d results", len(sched.wait_homerun))

    # If we are passive, we do not initiate the check getting
    # and return
    if not self.passive:
        try:
            # We send to our schedulers the results of all finished checks
            logger.debug("pushing results...")
            self.push_results()
        except LinkError as exp:
            logger.warning("Scheduler connection failed, I could not send my results!")

        try:
            # And we get the new actions from our schedulers
            logger.debug("getting new actions...")
            self.get_new_actions()
        except LinkError as exp:
            logger.warning("Scheduler connection failed, I could not get new actions!")

    # Get objects from our modules that are not Worker based
    if self.log_loop:
        logger.debug("[%s] get objects from queues", self.name)
    self.get_objects_from_from_queues()
    statsmgr.gauge('external-commands.count', len(self.external_commands))
    statsmgr.gauge('broks.count', len(self.broks))
    statsmgr.gauge('events.count', len(self.events))
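# Both do_loop_turn() variants above drain the workers' returns queue with non-blocking gets
# and tolerate transient queue errors rather than breaking the loop. Below is a short,
# self-contained sketch of that pattern using the standard library queue module; the message
# tuples and the handler are illustrative, not Alignak's Message objects.

import queue


def drain_results(returns_queue, handler):
    """Drain a returns queue without blocking; stop quietly when it is empty or broken."""
    while True:
        try:
            msg = returns_queue.get_nowait()
        except queue.Empty:
            break            # nothing left for this turn
        except (IOError, EOFError) as exp:
            print("returns queue is no more available: %s" % exp)
            break
        if msg is None:
            continue         # ignore empty messages, as in the loops above
        handler(msg)


returns_queue = queue.Queue()
returns_queue.put(('Done', {'exit_status': 0}))
returns_queue.put(None)
returns_queue.put(('Stats', {'actions': 3}))
drain_results(returns_queue, lambda msg: print("got message:", msg))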
def push_external_commands_to_schedulers(self): """Send a HTTP request to the schedulers (POST /run_external_commands) with external command list. :return: None """ if not self.unprocessed_external_commands: return commands_to_process = self.unprocessed_external_commands self.unprocessed_external_commands = [] logger.debug("Commands: %s", commands_to_process) statsmgr.gauge('external-commands.pushed', len(self.unprocessed_external_commands)) # Now get all external commands and put them into the # good schedulers for ext_cmd in commands_to_process: self.external_commands_manager.resolve_command(ext_cmd) logger.debug("Resolved command: %s", ext_cmd) # Now for all alive schedulers, send the commands for sched_id in self.schedulers: sched = self.schedulers[sched_id] # TODO: sched should be a SatelliteLink object and, thus, have a get_name() method # but sometimes when an exception is raised because the scheduler is not available # this is not True ... sched is a simple dictionary! is_active = sched['active'] if not is_active: logger.warning("The scheduler '%s' is not active, it is not possible to push " "external commands from its connection!", sched) return # If there are some commands... extcmds = sched['external_commands'] cmds = [extcmd.cmd_line for extcmd in extcmds] if not cmds: continue # ...and the scheduler is alive con = sched['con'] if con is None: self.daemon_connection_init(sched_id, s_type='scheduler') if con is None: logger.warning("The connection for the scheduler '%s' cannot be established, it is " "not possible to push external commands.", sched) continue sent = False logger.debug("Sending %d commands to scheduler %s", len(cmds), sched) try: # con.run_external_commands(cmds) con.post('run_external_commands', {'cmds': cmds}) sent = True except HTTPClientConnectionException as exp: # pragma: no cover, simple protection logger.warning("[%s] %s", sched, str(exp)) sched['con'] = None continue except HTTPClientTimeoutException as exp: # pragma: no cover, simple protection logger.warning("Connection timeout with the scheduler '%s' when " "sending external commands: %s", sched, str(exp)) sched['con'] = None continue except HTTPClientException as exp: # pragma: no cover, simple protection logger.error("Error with the scheduler '%s' when " "sending external commands: %s", sched, str(exp)) sched['con'] = None continue except AttributeError as exp: # pragma: no cover, simple protection logger.warning("The scheduler %s should not be initialized: %s", sched, str(exp)) logger.exception(exp) except Exception as exp: # pylint: disable=broad-except logger.exception("A satellite raised an unknown exception (%s): %s", type(exp), exp) raise # Whether we sent the commands or not, clean the scheduler list self.schedulers[sched_id]['external_commands'] = [] # If we didn't sent them, add the commands to the arbiter list if not sent: for extcmd in extcmds: self.external_commands.append(extcmd)
def push_external_commands_to_schedulers(self): """Push received external commands to the schedulers :return: None """ if not self.unprocessed_external_commands: return # Those are the global external commands commands_to_process = self.unprocessed_external_commands self.unprocessed_external_commands = [] logger.debug("Commands: %s", commands_to_process) # Now get all external commands and put them into the good schedulers logger.debug("Commands to process: %d commands", len(commands_to_process)) for ext_cmd in commands_to_process: cmd = self.external_commands_manager.resolve_command(ext_cmd) logger.debug("Resolved command: %s, result: %s", ext_cmd.cmd_line, cmd) if cmd and cmd['global']: # Send global command to all our schedulers for scheduler_link_uuid in self.schedulers: self.schedulers[scheduler_link_uuid].pushed_commands.append(ext_cmd) # Now for all active schedulers, send the commands count_pushed_commands = 0 count_failed_commands = 0 for scheduler_link_uuid in self.schedulers: link = self.schedulers[scheduler_link_uuid] if not link.active: logger.debug("The scheduler '%s' is not active, it is not possible to push " "external commands to its connection!", link.name) continue # If there are some commands for this scheduler... commands = [ext_cmd.cmd_line for ext_cmd in link.pushed_commands] if not commands: logger.debug("The scheduler '%s' has no commands.", link.name) continue logger.debug("Sending %d commands to scheduler %s", len(commands), link.name) sent = [] try: sent = link.push_external_commands(commands) except LinkError: logger.warning("Scheduler connection failed, I could not push external commands!") # Whether we sent the commands or not, clean the scheduler list link.pushed_commands = [] # If we didn't sent them, add the commands to the arbiter list if sent: statsmgr.gauge('external-commands.pushed.%s' % link.name, len(commands)) count_pushed_commands = count_pushed_commands + len(commands) else: count_failed_commands = count_failed_commands + len(commands) statsmgr.gauge('external-commands.failed.%s' % link.name, len(commands)) # Kepp the not sent commands... for a next try self.external_commands.extend(commands) statsmgr.gauge('external-commands.pushed.all', count_pushed_commands) statsmgr.gauge('external-commands.failed.all', count_failed_commands)
def get_new_broks(self, s_type='scheduler'):
    """Get new broks from the daemons defined by the type parameter

    :param s_type: type of object
    :type s_type: str
    :return: None
    """
    # Get the good links tab for looping..
    links = self.get_links_from_type(s_type)
    if links is None:
        logger.debug('Type unknown for connection! %s', s_type)
        return

    # We check for new broks in each daemon and put
    # them in our broks queue
    for s_id in links:
        logger.debug("Getting broks from %s", links[s_id]['name'])
        link = links[s_id]
        logger.debug("Link: %s", link)
        if not link['active']:
            logger.debug("The %s '%s' is not active, "
                         "do not get broks from its connection!", s_type, link['name'])
            continue

        if link['con'] is None:
            if not self.daemon_connection_init(s_id, s_type=s_type):
                if link['connection_attempt'] <= link['max_failed_connections']:
                    logger.warning("The connection for the %s '%s' cannot be established, "
                                   "it is not possible to get broks from this daemon.",
                                   s_type, link['name'])
                else:
                    logger.error("The connection for the %s '%s' cannot be established, "
                                 "it is not possible to get broks from this daemon.",
                                 s_type, link['name'])
                continue

        try:
            _t0 = time.time()
            tmp_broks = link['con'].get('get_broks', {'bname': self.name}, wait='long')
            try:
                tmp_broks = unserialize(tmp_broks, True)
            except AlignakClassLookupException as exp:  # pragma: no cover, simple protection
                logger.error('Cannot un-serialize data received from "get_broks" call: %s',
                             exp)
                continue
            if tmp_broks:
                logger.debug("Got %d Broks from %s in %s",
                             len(tmp_broks), link['name'], time.time() - _t0)
                statsmgr.timer('con-broks-get.%s' % (link['name']), time.time() - _t0)
                statsmgr.gauge('con-broks-count.%s' % (link['name']),
                               len(tmp_broks.values()))
                for brok in tmp_broks.values():
                    brok.instance_id = link['instance_id']

            # Ok, we can add these broks to our queues
            _t0 = time.time()
            self.add_broks_to_queue(tmp_broks.values())
            statsmgr.timer('con-broks-add.%s' % s_type, time.time() - _t0)
        except HTTPClientConnectionException as exp:  # pragma: no cover, simple protection
            logger.warning("[%s] %s", link['name'], str(exp))
            link['con'] = None
            return
        except HTTPClientTimeoutException as exp:  # pragma: no cover, simple protection
            logger.warning("Connection timeout with the %s '%s' when getting broks: %s",
                           s_type, link['name'], str(exp))
            link['con'] = None
            return
        except HTTPClientException as exp:  # pragma: no cover, simple protection
            logger.error("Error with the %s '%s' when getting broks: %s",
                         s_type, link['name'], str(exp))
            link['con'] = None
            return
        # scheduler must not have checks
        # What the F**k? We do not know what happened,
        # so.. bye bye :)
        except Exception as exp:  # pylint: disable=broad-except
            logger.exception(exp)
            sys.exit(1)