def _thread_monitor(self, thread, name, flag):
    """Watch a worker thread and re-establish the websocket when it dies.

    Polls `thread` every `self.monitor_sleep_time` seconds. When the
    thread is found dead, retries `establish_ws_session()` with
    exponential backoff; after `len(self.ws_urls)` failed attempts the
    process commits harakiri (deliberate suicide so the supervisor can
    restart it).

    :param thread: thread object to watch (must expose is_alive()).
    :param name: human-readable thread name used in log messages.
    :param flag: dict with a 'monitor_runs' counter; decremented every
        cycle so tests can bound the number of iterations.
    """
    # TODO(ivar): I could have used thread.join instead of this
    retries = None
    max_retries = len(self.ws_urls)
    LOG.debug("Monitoring thread %s" % name)
    try:
        while flag['monitor_runs']:
            # is_alive() replaces isAlive(), which was removed in
            # Python 3.9; available since Python 2.6.
            if not thread.is_alive():
                if retries and retries.get() >= max_retries:
                    utils.perform_harakiri(
                        LOG, "Critical thread %s stopped working" % name)
                else:
                    retries = utils.exponential_backoff(
                        self.monitor_max_backoff, tentative=retries)
                    try:
                        self.establish_ws_session()
                    except Exception as e:
                        # str(e) instead of e.message: Exception.message
                        # does not exist in Python 3.
                        LOG.debug(
                            "Monitor for thread %s tried to reconnect web "
                            "socket, but something went wrong. Will retry "
                            "%s more times: %s" %
                            (name, max_retries - retries.get(), str(e)))
                        continue
            else:
                LOG.debug("Thread %s is in good shape" % name)
                retries = None
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor "
               "for %s: %s" % (name, str(e)))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def test_exponential_backoff(self):
    """Verify backoff tentatives grow and sleep times follow 2**n."""
    # Pin random() to 1 so the jitter factor is deterministic.
    with mock.patch.object(internal_utils.random, 'random',
                           return_value=1), \
            mock.patch.object(internal_utils.time, 'sleep') as mock_sleep:
        tentative = internal_utils.exponential_backoff(10, None)
        # First call: one tentative recorded, slept for 1 second.
        self.assertEqual(1, tentative.get())
        mock_sleep.assert_called_with(1)

        tentative.increment()
        tentative = internal_utils.exponential_backoff(10, tentative)
        # Counter advanced to 3 tentatives; sleep doubled to 4 seconds.
        self.assertEqual(3, tentative.get())
        mock_sleep.assert_called_with(4)

        tentative.increment()
        tentative.increment()
        internal_utils.exponential_backoff(10, tentative)
        # Sleep is capped at the max (10) once the backoff exceeds it.
        mock_sleep.assert_called_with(10)
def initialize(self, conf):
    """Connect to AID's unix datagram socket and reset recovery state.

    Reads 'unix_socket_path' from the [aim] config group, opens an
    AF_UNIX/SOCK_DGRAM socket and connects it. On success returns self
    (fluent style) and clears the backoff tracker. On any failure the
    traceback is logged, an exponential-backoff sleep is performed, and
    the method implicitly returns None — presumably the caller retries;
    TODO confirm against the call site.
    """
    try:
        self.conf_manager = conf
        self.us_path = self.conf_manager.get_option('unix_socket_path',
                                                    group='aim')
        self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
        self.sock.connect(self.us_path)
        LOG.info("Connected to %s" % self.us_path)
        # Successful (re)connection: forget any previous backoff state.
        self.recovery_retries = None
        return self
    except Exception:
        LOG.error(traceback.format_exc())
        # exponential_backoff sleeps internally and returns the updated
        # tentative counter for the next attempt.
        self.recovery_retries = utils.exponential_backoff(
            SOCKET_RECONNECT_MAX_WAIT, tentative=self.recovery_retries)
def _poll(self):
    """Run the daemon loop at a fixed cadence until loop_count runs out.

    Each cycle runs `_daemon_loop()` then sleeps the remainder of
    `self.polling_interval`. Failures are logged and backed off
    exponentially without decrementing the loop counter, so the cycle
    is retried.
    """
    # Loop count is the equivalent of a True in normal usage, but it's
    # useful for testing.
    while self.loop_count > 0:
        try:
            start_time = time.time()
            self._daemon_loop()
            utils.wait_for_next_cycle(
                start_time, self.polling_interval, LOG,
                readable_caller='Event Service Poller',
                notify_exceeding_timeout=False)
            self.loop_count -= 1
            # Clean run: reset the backoff tracker.
            self.recovery_retries = None
        except Exception:
            # Fixed grammar in the log message ("A error" -> "An error").
            LOG.error('An error occurred in polling agent.')
            LOG.error(traceback.format_exc())
            self.recovery_retries = utils.exponential_backoff(
                10, tentative=self.recovery_retries)
def _thread_monitor(self, flag):
    """Watch the login and subscription threads, reconnecting on death.

    Each dead thread gets its own exponential-backoff retry counter;
    once a counter reaches `len(self.ws_urls)` the process commits
    harakiri so the supervisor can restart it.

    :param flag: dict with a 'monitor_runs' counter; decremented every
        cycle so tests can bound the number of iterations.
    """
    login_thread_name = 'login_thread'
    subscription_thread_name = 'subscription_thread'
    name_to_retry = {login_thread_name: None,
                     subscription_thread_name: None}
    max_retries = len(self.ws_urls)
    LOG.debug("Monitoring threads login and subscription")
    try:
        while flag['monitor_runs']:
            for thd, name in [(self.login_thread, 'login_thread'),
                              (self.subs_thread, 'subscription_thread')]:
                # is_alive() replaces isAlive(), removed in Python 3.9.
                if thd and not thd.is_alive():
                    if (name_to_retry[name] and
                            name_to_retry[name].get() >= max_retries):
                        utils.perform_harakiri(
                            LOG, "Critical thread %s stopped "
                                 "working" % name)
                    else:
                        name_to_retry[name] = utils.exponential_backoff(
                            self.monitor_max_backoff,
                            tentative=name_to_retry[name])
                        try:
                            self.establish_ws_session()
                        except Exception as e:
                            # str(e) instead of e.message, which does not
                            # exist in Python 3.
                            LOG.debug(
                                "Monitor for thread %s tried to reconnect "
                                "web socket, but something went wrong. "
                                "Will retry %s more times: %s" %
                                (name,
                                 max_retries - name_to_retry[name].get(),
                                 str(e)))
                            continue
                elif thd:
                    LOG.debug("Thread %s is in good shape" % name)
                    name_to_retry[name] = None
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor: %s" % str(e))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def _thread_monitor(self, flag):
    """Watch the login and subscription threads, reconnecting on death.

    Each dead thread gets its own exponential-backoff retry counter;
    once a counter reaches `len(self.ws_urls)` the process commits
    harakiri so the supervisor can restart it.

    :param flag: dict with a 'monitor_runs' counter; decremented every
        cycle so tests can bound the number of iterations.
    """
    login_thread_name = 'login_thread'
    subscription_thread_name = 'subscription_thread'
    name_to_retry = {login_thread_name: None,
                     subscription_thread_name: None}
    max_retries = len(self.ws_urls)
    LOG.debug("Monitoring threads login and subscription")
    try:
        while flag['monitor_runs']:
            for thd, name in [(self.login_thread, 'login_thread'),
                              (self.subs_thread, 'subscription_thread')]:
                # is_alive() replaces isAlive(), removed in Python 3.9.
                if thd and not thd.is_alive():
                    if (name_to_retry[name] and
                            name_to_retry[name].get() >= max_retries):
                        utils.perform_harakiri(
                            LOG, "Critical thread %s stopped "
                                 "working" % name)
                    else:
                        name_to_retry[name] = utils.exponential_backoff(
                            self.monitor_max_backoff,
                            tentative=name_to_retry[name])
                        try:
                            self.establish_ws_session()
                        except Exception as e:
                            # str(e) instead of e.message, which does not
                            # exist in Python 3.
                            LOG.debug(
                                "Monitor for thread %s tried to reconnect "
                                "web socket, but something went wrong. "
                                "Will retry %s more times: %s" %
                                (name,
                                 max_retries - name_to_retry[name].get(),
                                 str(e)))
                            continue
                elif thd:
                    LOG.debug("Thread %s is in good shape" % name)
                    name_to_retry[name] = None
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor: %s" % str(e))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def _listener(self):
    """Receive events on the unix socket forever, reconnecting on error.

    Outer loop re-establishes the connection after any failure, with
    exponential backoff; the socket is always closed before a reconnect
    attempt. Never returns.
    """
    # Multiple event notifiers can connect to AID
    while True:
        try:
            self._connect()
            LOG.info("Listening for Events on %s", self.us_path)
            while True:
                self._recv_loop()
                # Successful receive: forget previous backoff state.
                self.recovery_retries = None
        except Exception as e:
            LOG.debug(traceback.format_exc())
            # Fixed typo in the log message ("as occurred" -> "has
            # occurred").
            LOG.error("An error has occurred in the event listener "
                      "thread: %s" % e)
            self.recovery_retries = utils.exponential_backoff(
                SOCKET_RECONNECT_MAX_WAIT, tentative=self.recovery_retries)
        finally:
            try:
                self.sock.close()
            except AttributeError:
                # self.sock doesn't exist yet if _connect failed before
                # creating the socket.
                LOG.debug("Socket wasn't initialized before failure")
def _main_loop(self):
    """Run one subscribe-and-serve session for this tenant.

    Subscribes the tenant, then loops `_event_loop()` until stopped,
    out of runs, or a scheduled reset fires. Per-cycle durations are
    logged only when they change by more than `epsilon` seconds. On
    unexpected failure the tenant is unsubscribed and retried with
    exponential backoff; after `self.max_retries` failures the manager
    kills itself.
    """
    try:
        # tenant subscription is redone upon exception
        self._subscribe_tenant()
        LOG.debug("Starting event loop for tenant %s" % self.tenant_name)
        last_time = 0
        epsilon = 0.5
        while not self._stop and self.num_loop_runs > 0:
            start = time.time()
            if start > self.scheduled_reset:
                raise ScheduledReset()
            self._event_loop()
            curr_time = time.time() - start
            if abs(curr_time - last_time) > epsilon:
                # Only log significant differences
                LOG.debug("Event loop for tenant %s completed in %s "
                          "seconds" % (self.tenant_name,
                                       time.time() - start))
                last_time = curr_time
            if not last_time:
                last_time = curr_time
            # Successfull run
            self.num_loop_runs -= 1
            self.recovery_retries = None
    except ScheduledReset:
        LOG.info("Scheduled tree reset for root %s" % self.tenant_name)
        self._unsubscribe_tenant()
    except Exception as e:
        # str(e) instead of e.message: Exception.message does not exist
        # in Python 3 (matches the fixed sibling implementation).
        LOG.error("An exception has occurred in thread serving tenant "
                  "%s, error: %s" % (self.tenant_name, str(e)))
        LOG.error(traceback.format_exc())
        self._unsubscribe_tenant()
        self.recovery_retries = utils.exponential_backoff(
            TENANT_FAILURE_MAX_WAIT, tentative=self.recovery_retries)
        if self.recovery_retries.get() >= self.max_retries:
            LOG.error("Exceeded max recovery retries for tenant %s. "
                      "Destroying the manager." % self.tenant_name)
            self.kill()
def _main_loop(self):
    """Run one subscribe-and-serve session for this tenant.

    Subscribes the tenant, then loops `_event_loop()` until stopped,
    out of runs, or a scheduled reset fires. Per-cycle durations are
    logged only when they change by more than `epsilon` seconds. On
    unexpected failure the tenant is unsubscribed and retried with
    exponential backoff; after `self.max_retries` failures the manager
    kills itself.
    """
    try:
        # tenant subscription is redone upon exception
        self._subscribe_tenant()
        LOG.debug("Starting event loop for tenant %s" % self.tenant_name)
        prev_elapsed = 0
        epsilon = 0.5
        while not self._stop and self.num_loop_runs > 0:
            cycle_start = time.time()
            if cycle_start > self.scheduled_reset:
                raise ScheduledReset()
            self._event_loop()
            elapsed = time.time() - cycle_start
            if abs(elapsed - prev_elapsed) > epsilon:
                # Only log significant differences
                LOG.debug("Event loop for tenant %s completed in %s "
                          "seconds" % (self.tenant_name,
                                       time.time() - cycle_start))
                prev_elapsed = elapsed
            if not prev_elapsed:
                prev_elapsed = elapsed
            # Successfull run
            self.num_loop_runs -= 1
            self.recovery_retries = None
    except ScheduledReset:
        LOG.info("Scheduled tree reset for root %s" % self.tenant_name)
        self._unsubscribe_tenant()
    except Exception as e:
        LOG.error("An exception has occurred in thread serving tenant "
                  "%s, error: %s" % (self.tenant_name, str(e)))
        LOG.error(traceback.format_exc())
        self._unsubscribe_tenant()
        self.recovery_retries = utils.exponential_backoff(
            TENANT_FAILURE_MAX_WAIT, tentative=self.recovery_retries)
        if self.recovery_retries.get() >= self.max_retries:
            LOG.error("Exceeded max recovery retries for tenant %s. "
                      "Destroying the manager." % self.tenant_name)
            self.kill()
def _thread_monitor(self, flag):
    """Watch login/subscription threads and drive websocket recovery.

    Dead threads trigger reconnection with per-thread exponential
    backoff; `len(self.ws_urls)` consecutive failures cause harakiri.
    When `self.need_recovery` is set and more than one ws_url exists, a
    recovery session is attempted on a timer that itself backs off.
    While healthy, the APIC assignment object is refreshed to keep its
    last_update_timestamp current.

    :param flag: dict with a 'monitor_runs' counter; decremented every
        cycle so tests can bound the number of iterations.
    """
    login_thread_name = 'login_thread'
    subscription_thread_name = 'subscription_thread'
    name_to_retry = {login_thread_name: None,
                     subscription_thread_name: None}
    max_retries = len(self.ws_urls)
    recovery_timer = utils.get_time()
    recovery_retry = 0
    aim_context = aim_ctx.AimContext(store=api.get_store())
    LOG.debug("Monitoring threads login and subscription")
    try:
        while flag['monitor_runs']:
            for thd, name in [(self.login_thread, 'login_thread'),
                              (self.subs_thread, 'subscription_thread')]:
                # is_alive() replaces isAlive(), removed in Python 3.9.
                if thd and not thd.is_alive():
                    if (name_to_retry[name] and
                            name_to_retry[name].get() >= max_retries):
                        utils.perform_harakiri(
                            LOG, "Critical thread %s stopped "
                                 "working" % name)
                    else:
                        name_to_retry[name] = utils.exponential_backoff(
                            self.monitor_max_backoff,
                            tentative=name_to_retry[name])
                        try:
                            self.establish_ws_session()
                        except Exception as e:
                            LOG.debug(
                                "Monitor for thread %s tried to reconnect "
                                "web socket, but something went wrong. "
                                "Will retry %s more times: %s" %
                                (name,
                                 max_retries - name_to_retry[name].get(),
                                 str(e)))
                            continue
                elif thd:
                    LOG.debug("Thread %s is in good shape" % name)
                    name_to_retry[name] = None
            if self.need_recovery:
                # No point to do any recovery session if we
                # only have 1 ws_url.
                if (len(self.ws_urls) > 1 and
                        utils.get_time() > recovery_timer):
                    self.establish_ws_session(recovery_mode=True)
                    # Still fail to recover
                    if self.need_recovery:
                        recovery_retry += 1
                        recovery_timer = (
                            utils.get_time() + utils.get_backoff_time(
                                self.recovery_max_backoff, recovery_retry))
                    else:
                        recovery_retry = 0
            else:
                # Update the last_update_timestamp
                if self.apic_assign_obj:
                    self.apic_assign_obj = self.manager.update(
                        aim_context, self.apic_assign_obj)
                else:
                    # This should never happen
                    LOG.error('There is no such apic_assign_obj exist '
                              'for %s!' % self.session.ipaddr)
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor: %s" % str(e))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)