def _calculate_tenants(self, aim_ctx):
    """Compute this agent's tenant assignment and persist it.

    The whole computation runs inside one store (sub)transaction. Unless
    the agent runs in single-AID mode, the assignment is shared between
    every administratively-up agent that is not down and whose major
    version matches the newest one found.

    :param aim_ctx: AIM context; its store opens the transaction and it
        is handed to every manager/agent call.
    :returns: the value produced by ``_tenant_assignation_algorithm``,
        or an empty list when no live agent is found or when this
        agent's major version is older than the newest peer's.
    """
    with aim_ctx.store.begin(subtransactions=True):
        # Reload our own record so the checks below use current data.
        self.agent = self.manager.get(aim_ctx, self.agent)
        if self.single_aid:
            candidates = [self.agent]
        else:
            elapsed = self.agent.down_time(aim_ctx)
            # A missing or negative down time counts as zero.
            if max(0, elapsed or 0) > self.max_down_time:
                utils.perform_harakiri(LOG, "Agent has been down for %s "
                                            "seconds." % elapsed)
            # Collect the peers that are enabled and currently alive.
            candidates = []
            for peer in self.manager.find(aim_ctx, resource.Agent,
                                          admin_state_up=True):
                if not peer.is_down(aim_ctx):
                    candidates.append(peer)
            if not candidates:
                return []
            # Version gate: refuse to take tenants when we are behind.
            newest = max(candidates, key=lambda peer: peer.version).version
            if self._major_vercompare(self.agent.version, newest) < 0:
                LOG.error("Agent version is outdated: Current %s Required "
                          "%s" % (self.agent.version, newest))
                return []
            # Only peers on the newest major version take part.
            candidates = [
                peer for peer in candidates
                if self._major_vercompare(peer.version, newest) == 0]
        assignment = self._tenant_assignation_algorithm(aim_ctx, candidates)
        # Persist the outcome on our own agent record.
        self.agent.hash_trees = assignment
        self.manager.create(aim_ctx, self.agent, overwrite=True)
        return assignment
def establish_ws_session(self, max_retries=None):
    """Log in to the APIC websocket endpoint, rotating URLs on failure.

    The websocket configuration is reloaded first. Each attempt closes
    any existing session, builds a new ``acitoolkit.Session`` for
    ``self.ws_urls[0]`` and logs in. On a failed login ``self.ws_urls``
    is rotated so the next attempt uses the next URL.

    :param max_retries: maximum number of login attempts; a falsy value
        means twice the number of configured URLs.
    :returns: the logged-in session on success. When every attempt
        fails ``utils.perform_harakiri`` is invoked and, if that call
        returns, the result is ``None``.
    """
    retries = 0
    self._reload_websocket_config()
    max_retries = max_retries or 2 * len(self.ws_urls)
    while retries < max_retries:
        if self.session and self.session.session:
            self.session.close()
        # The APIC password is intentionally left out of this message:
        # credentials must never be written to the log.
        LOG.info('Establishing WS connection with parameters: %s',
                 [self.ws_urls[0], self.apic_username,
                  self.verify_ssl_certificate])
        self.session = acitoolkit.Session(
            self.ws_urls[0], self.apic_username, self.apic_password,
            verify_ssl=self.verify_ssl_certificate,
            cert_name=self.cert_name, key=self.private_key_file)
        resp = self.session.login()
        if not resp.ok:
            LOG.debug('Websocket connection failed: %s' % resp.text)
            # Move on to the next configured URL for the next attempt.
            self.ws_urls.rotate(-1)
            LOG.info('Rotating websocket URL, using: %s' % self.ws_urls[0])
            retries += 1
            continue
        LOG.info('Websocket connection succeeded.')
        self._spawn_monitors()
        return self.session
    utils.perform_harakiri(LOG, "Cannot establish WS connection after %s "
                                "retries." % retries)
def _thread_monitor(self, thread, name, flag):
    """Watch one critical thread and reconnect the websocket if it dies.

    While ``flag['monitor_runs']`` is truthy the thread is polled. When
    it is found dead, a backoff object is obtained from
    ``utils.exponential_backoff`` and the websocket session is
    re-established. Once the backoff counter reaches the number of
    configured websocket URLs, ``utils.perform_harakiri`` is invoked.

    :param thread: thread object to watch; only ``is_alive()`` is used.
    :param name: label for the thread, used in log messages only.
    :param flag: dict whose ``'monitor_runs'`` entry keeps the loop
        going; it is decremented after each cycle (testing hook).
    """
    # TODO(ivar): I could have used thread.join instead of this
    retries = None
    max_retries = len(self.ws_urls)
    LOG.debug("Monitoring thread %s" % name)
    try:
        while flag['monitor_runs']:
            # Thread.isAlive() was removed in Python 3.9; is_alive() is
            # available on every supported interpreter.
            if not thread.is_alive():
                if retries and retries.get() >= max_retries:
                    utils.perform_harakiri(
                        LOG, "Critical thread %s stopped working" % name)
                else:
                    retries = utils.exponential_backoff(
                        self.monitor_max_backoff, tentative=retries)
                    try:
                        self.establish_ws_session()
                    except Exception as e:
                        # str(e) instead of e.message: the latter does
                        # not exist on Python 3 exceptions.
                        LOG.debug(
                            "Monitor for thread %s tried to reconnect web "
                            "socket, but something went wrong. Will retry "
                            "%s more times: %s" %
                            (name, max_retries - retries.get(), str(e)))
                        # Retry right away; the sleep and the counter
                        # update below are skipped on purpose.
                        continue
            else:
                LOG.debug("Thread %s is in good shape" % name)
                retries = None
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor "
               "for %s: %s" % (name, str(e)))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def establish_ws_session(self, max_retries=None):
    """Log in to the APIC websocket endpoint under the connection lock.

    The lock is requested without blocking. If it cannot be acquired
    (``utils.LockNotAcquired``) another thread is already reconnecting
    and this call returns immediately. Otherwise the websocket
    configuration is reloaded and login is attempted against
    ``self.ws_urls[0]``, rotating ``self.ws_urls`` after each failure.

    :param max_retries: maximum number of login attempts; a falsy value
        means twice the number of configured URLs.
    :returns: the logged-in session on success; ``None`` when the lock
        is held elsewhere, or when every attempt failed and
        ``utils.perform_harakiri`` returned.
    """
    try:
        with utils.get_rlock(lcon.ACI_WS_CONNECTION_LOCK, blocking=False):
            retries = 0
            self._reload_websocket_config()
            max_retries = max_retries or 2 * len(self.ws_urls)
            while retries < max_retries:
                if self.session and self.session.session:
                    self.session.close()
                LOG.info('Establishing WS connection with url: %s',
                         self.ws_urls[0])
                self.session = acitoolkit.Session(
                    self.ws_urls[0], self.apic_username,
                    self.apic_password,
                    verify_ssl=self.verify_ssl_certificate,
                    cert_name=self.cert_name, key=self.private_key_file)
                resp = self.session.login()
                if not resp.ok:
                    # Logger.warn is a deprecated alias (removed from
                    # the stdlib in Python 3.13); use warning().
                    LOG.warning('Websocket connection failed: %s',
                                resp.text)
                    # Try the next configured URL on the next attempt.
                    self.ws_urls.rotate(-1)
                    LOG.info('Rotating websocket URL, '
                             'using: %s' % self.ws_urls[0])
                    retries += 1
                    continue
                LOG.info('Websocket connection succeeded.')
                self._spawn_monitors()
                return self.session
            utils.perform_harakiri(
                LOG, "Cannot establish WS connection "
                     "after %s retries." % retries)
    except utils.LockNotAcquired:
        # Some other thread is trying to reconnect
        return
def establish_ws_session(self, max_retries=None):
    """Log in to the APIC websocket endpoint under the connection lock.

    The lock is requested without blocking. If it cannot be acquired
    (``utils.LockNotAcquired``) another thread is already reconnecting
    and this call returns immediately. Otherwise the websocket
    configuration is reloaded and login is attempted against
    ``self.ws_urls[0]``, rotating ``self.ws_urls`` after each failure.

    :param max_retries: maximum number of login attempts; a falsy value
        means twice the number of configured URLs.
    :returns: the logged-in session on success; ``None`` when the lock
        is held elsewhere, or when every attempt failed and
        ``utils.perform_harakiri`` returned.
    """
    try:
        with utils.get_rlock(lcon.ACI_WS_CONNECTION_LOCK, blocking=False):
            retries = 0
            self._reload_websocket_config()
            max_retries = max_retries or 2 * len(self.ws_urls)
            while retries < max_retries:
                if self.session and self.session.session:
                    self.session.close()
                LOG.info('Establishing WS connection with url: %s',
                         self.ws_urls[0])
                self.session = acitoolkit.Session(
                    self.ws_urls[0], self.apic_username,
                    self.apic_password,
                    verify_ssl=self.verify_ssl_certificate,
                    cert_name=self.cert_name, key=self.private_key_file)
                resp = self.session.login()
                if not resp.ok:
                    # Logger.warn is a deprecated alias (removed from
                    # the stdlib in Python 3.13); use warning().
                    LOG.warning('Websocket connection failed: %s',
                                resp.text)
                    # Try the next configured URL on the next attempt.
                    self.ws_urls.rotate(-1)
                    LOG.info('Rotating websocket URL, '
                             'using: %s' % self.ws_urls[0])
                    retries += 1
                    continue
                LOG.info('Websocket connection succeeded.')
                self._spawn_monitors()
                return self.session
            utils.perform_harakiri(LOG, "Cannot establish WS connection "
                                        "after %s retries." % retries)
    except utils.LockNotAcquired:
        # Some other thread is trying to reconnect
        return
def test_harakiri(self):
    """perform_harakiri calls os._exit(1) only with recovery_restart on.

    With the ``recovery_restart`` option (group ``aim``) disabled the
    patched ``os._exit`` must not be called; once enabled it must be
    called exactly once with exit status 1.
    """
    original = self.cfg_manager.get_option('recovery_restart', 'aim')
    try:
        self.set_override('recovery_restart', False, 'aim')
        with mock.patch.object(internal_utils.os, '_exit') as ex:
            internal_utils.perform_harakiri(mock.Mock(), '')
            self.assertEqual(0, ex.call_count)
            self.set_override('recovery_restart', True, 'aim')
            internal_utils.perform_harakiri(mock.Mock(), '')
            ex.assert_called_once_with(1)
    finally:
        # Restore the option even when an assertion above fails, so the
        # override cannot leak into other tests.
        self.set_override('recovery_restart', original, 'aim')
def _thread(self, func, name):
    """Call ``func`` in an endless loop until it asks to stop or fails.

    :param func: zero-argument callable executed on every iteration.
    :param name: label of the loop, used in log messages only.

    ``utils.ThreadExit`` raised by ``func`` ends the loop quietly. Any
    other exception is logged with its traceback and handed to
    ``utils.perform_harakiri``.
    """
    LOG.info("Starting main loop of %s", name)
    try:
        while True:
            func()
    except utils.ThreadExit:
        # Requested shutdown of this loop: nothing to report.
        return
    except Exception as exc:
        LOG.error(traceback.format_exc())
        reason = "%s thread stopped unexpectedly: %s" % (name, str(exc))
        utils.perform_harakiri(LOG, reason)
def _heartbeat_loop(self):
    """Run one heartbeat cycle and wait for the next one.

    Sends a heartbeat using a fresh AIM context, then compares the
    cycle start time with ``self.daemon_loop_time``. When the gap
    exceeds ``DEADLOCK_TIME``, ``utils.perform_harakiri`` is invoked.
    Finally waits until the next ``self.report_interval`` boundary.
    """
    cycle_start = time.time()
    self._send_heartbeat(context.AimContext(store=api.get_store()))
    # REVISIT: This code should be removed once we've
    # removed all the locking in AID.
    daemon_is_behind = cycle_start > self.daemon_loop_time
    if daemon_is_behind:
        gap = cycle_start - self.daemon_loop_time
        if gap > DEADLOCK_TIME:
            reason = "Agent has been down for %s seconds." % gap
            utils.perform_harakiri(LOG, reason)
    utils.wait_for_next_cycle(
        cycle_start, self.report_interval, LOG,
        readable_caller='AID-HB', notify_exceeding_timeout=False)
def _thread_monitor(self, flag):
    """Watch the login and subscription threads; reconnect when one dies.

    While ``flag['monitor_runs']`` is truthy, ``self.login_thread`` and
    ``self.subs_thread`` are polled. For a dead thread a backoff object
    is obtained from ``utils.exponential_backoff`` and the websocket
    session is re-established. Once that thread's backoff counter
    reaches the number of configured websocket URLs,
    ``utils.perform_harakiri`` is invoked. A thread found alive has its
    backoff state reset.

    :param flag: dict whose ``'monitor_runs'`` entry keeps the loop
        going; it is decremented after each cycle (testing hook).
    """
    login_thread_name = 'login_thread'
    subscription_thread_name = 'subscription_thread'
    # Per-thread backoff state; None means "no failure pending".
    name_to_retry = {login_thread_name: None,
                     subscription_thread_name: None}
    max_retries = len(self.ws_urls)
    LOG.debug("Monitoring threads login and subscription")
    try:
        while flag['monitor_runs']:
            # The thread attributes are re-read on every cycle, so
            # threads replaced in the meantime are picked up.
            for thd, name in [(self.login_thread, login_thread_name),
                              (self.subs_thread,
                               subscription_thread_name)]:
                # Thread.isAlive() was removed in Python 3.9; is_alive()
                # is available on every supported interpreter.
                if thd and not thd.is_alive():
                    backoff = name_to_retry[name]
                    if backoff and backoff.get() >= max_retries:
                        utils.perform_harakiri(
                            LOG, "Critical thread %s stopped "
                                 "working" % name)
                    else:
                        name_to_retry[name] = utils.exponential_backoff(
                            self.monitor_max_backoff, tentative=backoff)
                        try:
                            self.establish_ws_session()
                        except Exception as e:
                            # str(e) instead of e.message: the latter
                            # does not exist on Python 3 exceptions.
                            LOG.debug(
                                "Monitor for thread %s tried to reconnect "
                                "web socket, but something went wrong. "
                                "Will retry %s more times: %s" %
                                (name,
                                 max_retries - name_to_retry[name].get(),
                                 str(e)))
                            continue
                elif thd:
                    LOG.debug("Thread %s is in good shape" % name)
                    name_to_retry[name] = None
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor: %s" % str(e))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def _thread_monitor(self, flag):
    """Watch the login and subscription threads; reconnect when one dies.

    While ``flag['monitor_runs']`` is truthy, ``self.login_thread`` and
    ``self.subs_thread`` are polled. For a dead thread a backoff object
    is obtained from ``utils.exponential_backoff`` and the websocket
    session is re-established. Once that thread's backoff counter
    reaches the number of configured websocket URLs,
    ``utils.perform_harakiri`` is invoked. A thread found alive has its
    backoff state reset.

    :param flag: dict whose ``'monitor_runs'`` entry keeps the loop
        going; it is decremented after each cycle (testing hook).
    """
    login_thread_name = 'login_thread'
    subscription_thread_name = 'subscription_thread'
    # Per-thread backoff state; None means "no failure pending".
    name_to_retry = {login_thread_name: None,
                     subscription_thread_name: None}
    max_retries = len(self.ws_urls)
    LOG.debug("Monitoring threads login and subscription")
    try:
        while flag['monitor_runs']:
            # The thread attributes are re-read on every cycle, so
            # threads replaced in the meantime are picked up.
            for thd, name in [(self.login_thread, login_thread_name),
                              (self.subs_thread,
                               subscription_thread_name)]:
                # Thread.isAlive() was removed in Python 3.9; is_alive()
                # is available on every supported interpreter.
                if thd and not thd.is_alive():
                    backoff = name_to_retry[name]
                    if backoff and backoff.get() >= max_retries:
                        utils.perform_harakiri(
                            LOG, "Critical thread %s stopped "
                                 "working" % name)
                    else:
                        name_to_retry[name] = utils.exponential_backoff(
                            self.monitor_max_backoff, tentative=backoff)
                        try:
                            self.establish_ws_session()
                        except Exception as e:
                            # str(e) instead of e.message: the latter
                            # does not exist on Python 3 exceptions.
                            LOG.debug(
                                "Monitor for thread %s tried to reconnect "
                                "web socket, but something went wrong. "
                                "Will retry %s more times: %s" %
                                (name,
                                 max_retries - name_to_retry[name].get(),
                                 str(e)))
                            continue
                elif thd:
                    LOG.debug("Thread %s is in good shape" % name)
                    name_to_retry[name] = None
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor: %s" % str(e))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def _fail_agent(self, context, aim_object, operation, reason):
    """Handle a failure by invoking ``utils.perform_harakiri``.

    Only ``reason`` is used: it is passed on as the harakiri message.
    ``context``, ``aim_object`` and ``operation`` are accepted but
    ignored here (presumably to match a common failure-handler
    signature -- confirm against the callers).
    """
    utils.perform_harakiri(LOG, message=reason)
def _thread_monitor(self, flag):
    """Watch the websocket threads and drive session recovery.

    Each cycle, while ``flag['monitor_runs']`` is truthy:

    * ``self.login_thread`` and ``self.subs_thread`` are polled. A dead
      thread triggers a backoff plus ``establish_ws_session()``. Once
      its backoff counter reaches the number of configured websocket
      URLs, ``utils.perform_harakiri`` is invoked.
    * If ``self.need_recovery`` is set, more than one URL is configured
      and the recovery timer has expired, a recovery-mode session is
      attempted. On failure the next attempt is pushed back by
      ``utils.get_backoff_time``.
    * Otherwise ``self.apic_assign_obj`` is refreshed through
      ``self.manager.update``.

    :param flag: dict whose ``'monitor_runs'`` entry keeps the loop
        going; it is decremented after each cycle (testing hook).
    """
    login_thread_name = 'login_thread'
    subscription_thread_name = 'subscription_thread'
    # Per-thread backoff state; None means "no failure pending".
    name_to_retry = {login_thread_name: None,
                     subscription_thread_name: None}
    max_retries = len(self.ws_urls)
    recovery_timer = utils.get_time()
    recovery_retry = 0
    aim_context = aim_ctx.AimContext(store=api.get_store())
    LOG.debug("Monitoring threads login and subscription")
    try:
        while flag['monitor_runs']:
            for thd, name in [(self.login_thread, login_thread_name),
                              (self.subs_thread,
                               subscription_thread_name)]:
                # Thread.isAlive() was removed in Python 3.9; is_alive()
                # is available on every supported interpreter.
                if thd and not thd.is_alive():
                    backoff = name_to_retry[name]
                    if backoff and backoff.get() >= max_retries:
                        utils.perform_harakiri(
                            LOG, "Critical thread %s stopped "
                                 "working" % name)
                    else:
                        name_to_retry[name] = utils.exponential_backoff(
                            self.monitor_max_backoff, tentative=backoff)
                        try:
                            self.establish_ws_session()
                        except Exception as e:
                            LOG.debug(
                                "Monitor for thread %s tried to reconnect "
                                "web socket, but something went wrong. "
                                "Will retry %s more times: %s" %
                                (name,
                                 max_retries - name_to_retry[name].get(),
                                 str(e)))
                            continue
                elif thd:
                    LOG.debug("Thread %s is in good shape" % name)
                    name_to_retry[name] = None
            if self.need_recovery:
                # No point to do any recovery session if we
                # only have 1 ws_url.
                if (len(self.ws_urls) > 1 and
                        utils.get_time() > recovery_timer):
                    self.establish_ws_session(recovery_mode=True)
                    # Still fail to recover
                    if self.need_recovery:
                        recovery_retry += 1
                        recovery_timer = (
                            utils.get_time() + utils.get_backoff_time(
                                self.recovery_max_backoff,
                                recovery_retry))
                    else:
                        recovery_retry = 0
            else:
                # Update the last_update_timestamp
                if self.apic_assign_obj:
                    self.apic_assign_obj = self.manager.update(
                        aim_context, self.apic_assign_obj)
                else:
                    # This should never happen
                    LOG.error('There is no such apic_assign_obj exist '
                              'for %s!' % self.session.ipaddr)
            time.sleep(self.monitor_sleep_time)
            # for testing purposes
            flag['monitor_runs'] -= 1
    except Exception as e:
        msg = ("Unknown error in thread monitor: %s" % str(e))
        LOG.error(msg)
        utils.perform_harakiri(LOG, msg)
def establish_ws_session(self, max_retries=None, recovery_mode=False):
    """Connect to an APIC websocket URL, honouring APIC assignments.

    Runs under the non-blocking connection lock; if the lock is held
    elsewhere (``utils.LockNotAcquired``) the call returns immediately.

    Every configured URL is examined in order:

    * a URL whose assignment belongs to another agent and is not
      available is set aside as a backup;
    * in recovery mode, a URL containing the current session's address
      that already has an assignment record is taken over through
      ``_update_apic_assign_db`` without logging in again;
    * otherwise a login is attempted via ``_ws_session_login``. A
      failed URL is also kept as a backup.

    Outside recovery mode the backups are then tried in randomly
    rotated order. If nothing succeeds ``utils.perform_harakiri`` is
    invoked.

    :param max_retries: overall attempt budget; a falsy value means
        twice the number of configured URLs. It is split evenly across
        the URLs, with at least one attempt each.
    :param recovery_mode: when true, only try to get back onto a
        primary URL; never reload the configuration, use backups or
        invoke harakiri.
    :returns: always ``None``.
    """
    try:
        with utils.get_rlock(lcon.ACI_WS_CONNECTION_LOCK, blocking=False):
            if not recovery_mode:
                purpose = NORMAL_PURPOSE
                self._reload_websocket_config()
                self.need_recovery = False
            else:
                purpose = RECOVERY_PURPOSE
            backup_urls = collections.deque()
            url_count = len(self.ws_urls)
            max_retries = max_retries or 2 * url_count
            # Floor division keeps the per-URL budget an integer on
            # Python 3 as well. "or 1" avoids ZeroDivisionError when no
            # URL is configured, so that case reaches the harakiri call
            # below.
            url_max_retries = max(1, max_retries // (url_count or 1))
            aim_context = aim_ctx.AimContext(store=api.get_store())
            for url in self.ws_urls:
                apic_assign = api_infra.ApicAssignment(apic_host=url)
                apic_assign_obj = self.manager.get(aim_context, apic_assign)
                if (apic_assign_obj and
                        apic_assign_obj.aim_aid_id != self.agent_id and
                        not apic_assign_obj.is_available(aim_context)):
                    backup_urls.append(url)
                    continue
                # This means the original aim-aid owner might have
                # crashed or something. We will just take it!
                if (recovery_mode and apic_assign_obj and
                        self.session.ipaddr in url):
                    obj = self._update_apic_assign_db(
                        aim_context, apic_assign, apic_assign_obj)
                    if obj is None:
                        continue
                    self.need_recovery = False
                    self.apic_assign_obj = obj
                    return
                is_conn_successful = self._ws_session_login(
                    url, url_max_retries, purpose,
                    aim_context, apic_assign, apic_assign_obj)
                if is_conn_successful:
                    return
                else:
                    backup_urls.append(url)
            if recovery_mode:
                return
            # Try the backup urls. Randomly rotate the list first so that
            # the extra aim-aids won't all go for the same backup url.
            backup_urls_len = len(backup_urls)
            if backup_urls_len > 1:
                backup_urls.rotate(random.randint(1, backup_urls_len))
            for url in backup_urls:
                is_conn_successful = self._ws_session_login(
                    url, url_max_retries, BACKUP_PURPOSE)
                if is_conn_successful:
                    return
            utils.perform_harakiri(
                LOG, "Cannot establish WS connection "
                     "after %s retries." % max_retries)
    except utils.LockNotAcquired:
        # Some other thread is trying to reconnect
        return