def health_check(self):
    """Look for stale vThunders and submit a failover for each one.

    Every loop iteration opens a non-autocommit DB session and asks
    ``vthunder_repo.get_stale_vthunders`` for one stale vThunder, using
    cut-offs derived from ``CONF.a10_health_manager.failover_timeout``
    and ``CONF.a10_health_manager.heartbeat_timeout``. A vThunder that is
    returned is marked ``'DOWN'``; it is then

    * skipped (after commit) when it has no ``amphora_id``,
    * marked ``'BUSY'`` and skipped when ``_is_vthunder_busy`` is true,
    * otherwise handed to ``self.cw.failover_amphora`` on the executor.

    The loop ends when ``self.dead`` is set, when no stale vThunder is
    returned (the busy health state is then unset through
    ``unset_vthunder_busy_health_state``), or when ``self.threads``
    failovers have been submitted. Submitted futures are then awaited
    with ``health_manager.wait_done_or_dead``.

    :raises Exception: anything other than ``DBDeadlock``,
        ``RetryRequest`` or ``DBConnectionError`` is re-raised after the
        session (if one was obtained) has been rolled back.
    """
    LOG.debug('health_check() starting...')
    futs = []
    while not self.dead.is_set():
        vthunder = None
        lock_session = None
        try:
            lock_session = db_apis.get_session(autocommit=False)
            # Sample the clock once so both cut-offs share one reference.
            now = datetime.datetime.utcnow()
            failover_wait_time = now - datetime.timedelta(
                seconds=CONF.a10_health_manager.heartbeat_timeout)
            initial_setup_wait_time = now - datetime.timedelta(
                seconds=CONF.a10_health_manager.failover_timeout)

            vthunder = self.vthunder_repo.get_stale_vthunders(
                lock_session, initial_setup_wait_time, failover_wait_time)

            if vthunder is not None:
                self.vthunder_repo.set_vthunder_health_state(
                    lock_session, vthunder.id, 'DOWN')

                if not vthunder.amphora_id:
                    LOG.info("Hardware vthunder %s heartbeat timeout",
                             vthunder.vthunder_id)
                    lock_session.commit()
                    continue

                # Don't failover vthunders which has pending state LBs
                if self._is_vthunder_busy(lock_session, vthunder):
                    self.vthunder_repo.set_vthunder_health_state(
                        lock_session, vthunder.id, 'BUSY')
                    LOG.info(
                        "vthunder %s heartbeat timeout but it is in "
                        "pending state, skip failover",
                        vthunder.vthunder_id)
                    lock_session.commit()
                    continue

            lock_session.commit()
        except db_exc.DBDeadlock:
            vthunder = None
            LOG.debug('Database reports deadlock. Skipping.')
            # lock_session is still None if get_session() itself failed.
            if lock_session is not None:
                lock_session.rollback()
        except db_exc.RetryRequest:
            vthunder = None
            LOG.debug('Database is requesting a retry. Skipping.')
            if lock_session is not None:
                lock_session.rollback()
        except db_exc.DBConnectionError:
            vthunder = None
            db_apis.wait_for_connection(self.dead)
            if lock_session is not None:
                lock_session.rollback()
            if not self.dead.is_set():
                # NOTE(review): this sleep reads the ``health_manager``
                # group while the cut-offs above read
                # ``a10_health_manager`` -- confirm this is intended.
                time.sleep(CONF.health_manager.heartbeat_timeout)
        except Exception as e:
            vthunder = None
            with excutils.save_and_reraise_exception():
                LOG.debug("Database error while health_check: %s", e)
                if lock_session is not None:
                    lock_session.rollback()

        if vthunder is None:
            # Nothing (more) to fail over in this cycle.
            db_session = db_apis.get_session()
            self.vthunder_repo.unset_vthunder_busy_health_state(db_session)
            break

        LOG.info("Stale vThunder's id is: %s", vthunder.vthunder_id)
        fut = self.executor.submit(self.cw.failover_amphora,
                                   vthunder.vthunder_id)
        futs.append(fut)
        # Cap the number of failovers submitted per cycle.
        if len(futs) == self.threads:
            break

    if futs:
        LOG.info("Waiting for %s failovers to finish", len(futs))
        health_manager.wait_done_or_dead(futs, self.dead)
        LOG.info("Successfully completed failover for VThunders.")
def health_check(self):
    """Look for stale amphorae and submit a failover for each one.

    Every loop iteration opens a non-autocommit DB session and asks
    ``amp_health_repo.get_stale_amphora`` for one stale amphora. When the
    amphora belongs to a load balancer,
    ``_test_and_set_failover_prov_status`` must succeed for the cycle to
    continue; otherwise the session is rolled back and the loop ends.
    Each stale amphora is handed to ``self.cw.failover_amphora`` on the
    executor, with ``update_stats_on_done`` attached as a done-callback
    bound to the local ``stats`` dict.

    The loop ends when ``self.dead`` is set, when no stale amphora is
    returned, or when ``self.threads`` failovers have been submitted.
    Submitted futures are awaited with ``wait_done_or_dead`` and, if any
    failover was attempted, a summary of the counters is logged.

    :raises Exception: anything other than ``DBDeadlock``,
        ``RetryRequest`` or ``DBConnectionError`` is re-raised after the
        session (if one was obtained) has been rolled back.
    """
    # Counters are presumably updated by update_stats_on_done as each
    # future completes -- that helper is not visible from this block.
    stats = {
        'failover_attempted': 0,
        'failover_failed': 0,
        'failover_cancelled': 0,
    }
    futs = []
    while not self.dead.is_set():
        amp_health = None
        lock_session = None
        try:
            lock_session = db_api.get_session(autocommit=False)
            amp = None
            amp_health = self.amp_health_repo.get_stale_amphora(
                lock_session)
            if amp_health:
                amp = self.amp_repo.get(lock_session,
                                        id=amp_health.amphora_id)

            # If there is an associated LB, attempt to set it to
            # PENDING_UPDATE. If it is already immutable, skip the
            # amphora on this cycle
            if amp and amp.load_balancer_id:
                if not self._test_and_set_failover_prov_status(
                        lock_session, amp.load_balancer_id):
                    lock_session.rollback()
                    break

            lock_session.commit()
        except db_exc.DBDeadlock:
            LOG.debug('Database reports deadlock. Skipping.')
            # lock_session is still None if get_session() itself failed.
            if lock_session is not None:
                lock_session.rollback()
            amp_health = None
        except db_exc.RetryRequest:
            LOG.debug('Database is requesting a retry. Skipping.')
            if lock_session is not None:
                lock_session.rollback()
            amp_health = None
        except db_exc.DBConnectionError:
            db_api.wait_for_connection(self.dead)
            if lock_session is not None:
                lock_session.rollback()
            amp_health = None
            if not self.dead.is_set():
                # amphora heartbeat timestamps should also be outdated
                # while DB is unavailable and soon after DB comes back
                # online. Sleeping off the full "heartbeat_timeout"
                # interval to give the amps a chance to check in before
                # we start failovers.
                time.sleep(CONF.health_manager.heartbeat_timeout)
        except Exception:
            with excutils.save_and_reraise_exception():
                if lock_session is not None:
                    lock_session.rollback()

        if amp_health is None:
            # Nothing (more) to fail over in this cycle.
            break

        LOG.info("Stale amphora's id is: %s", amp_health.amphora_id)
        fut = self.executor.submit(self.cw.failover_amphora,
                                   amp_health.amphora_id)
        fut.add_done_callback(
            functools.partial(update_stats_on_done, stats))
        futs.append(fut)
        # Cap the number of failovers submitted per cycle.
        if len(futs) == self.threads:
            break

    if futs:
        LOG.info("Waiting for %s failovers to finish", len(futs))
        wait_done_or_dead(futs, self.dead)

    if stats['failover_attempted'] > 0:
        LOG.info("Attempted %s failovers of amphora",
                 stats['failover_attempted'])
        LOG.info("Failed at %s failovers of amphora",
                 stats['failover_failed'])
        LOG.info("Cancelled %s failovers of amphora",
                 stats['failover_cancelled'])
        # Whatever was attempted but neither cancelled nor failed.
        happy_failovers = stats['failover_attempted']
        happy_failovers -= stats['failover_cancelled']
        happy_failovers -= stats['failover_failed']
        LOG.info("Successfully completed %s failovers of amphora",
                 happy_failovers)
def health_check(self):
    """Look for stale amphorae and submit a failover for each one.

    Every loop iteration opens a non-autocommit DB session and asks
    ``amp_health_repo.get_stale_amphora`` for one stale amphora. When the
    amphora belongs to a load balancer,
    ``_test_and_set_failover_prov_status`` must succeed for the cycle to
    continue; otherwise the session is rolled back and the loop ends.
    Each stale amphora is handed to ``self.cw.failover_amphora`` on the
    executor, with ``update_stats_on_done`` attached as a done-callback
    bound to the local ``stats`` dict.

    The loop ends when ``self.dead`` is set, when no stale amphora is
    returned, or when ``self.threads`` failovers have been submitted.
    Submitted futures are awaited with ``wait_done_or_dead`` and, if any
    failover was attempted, a summary of the counters is logged.

    :raises Exception: anything other than ``DBDeadlock``,
        ``RetryRequest`` or ``DBConnectionError`` is re-raised after the
        session (if one was obtained) has been rolled back.
    """
    # Counters are presumably updated by update_stats_on_done as each
    # future completes -- that helper is not visible from this block.
    stats = {
        'failover_attempted': 0,
        'failover_failed': 0,
        'failover_cancelled': 0,
    }
    futs = []
    while not self.dead.is_set():
        amp_health = None
        # Bind the name before the try so the handlers below can tell
        # "no session was obtained" apart from "session needs rollback"
        # instead of failing on an unbound local.
        lock_session = None
        try:
            lock_session = db_api.get_session(autocommit=False)
            amp = None
            amp_health = self.amp_health_repo.get_stale_amphora(
                lock_session)
            if amp_health:
                amp = self.amp_repo.get(lock_session,
                                        id=amp_health.amphora_id)

            # If there is an associated LB, attempt to set it to
            # PENDING_UPDATE. If it is already immutable, skip the
            # amphora on this cycle
            if amp and amp.load_balancer_id:
                if not self._test_and_set_failover_prov_status(
                        lock_session, amp.load_balancer_id):
                    lock_session.rollback()
                    break

            lock_session.commit()
        except db_exc.DBDeadlock:
            LOG.debug('Database reports deadlock. Skipping.')
            if lock_session is not None:
                lock_session.rollback()
            amp_health = None
        except db_exc.RetryRequest:
            LOG.debug('Database is requesting a retry. Skipping.')
            if lock_session is not None:
                lock_session.rollback()
            amp_health = None
        except db_exc.DBConnectionError:
            db_api.wait_for_connection(self.dead)
            if lock_session is not None:
                lock_session.rollback()
            amp_health = None
            if not self.dead.is_set():
                # amphora heartbeat timestamps should also be outdated
                # while DB is unavailable and soon after DB comes back
                # online. Sleeping off the full "heartbeat_timeout"
                # interval to give the amps a chance to check in before
                # we start failovers.
                time.sleep(CONF.health_manager.heartbeat_timeout)
        except Exception:
            with excutils.save_and_reraise_exception():
                if lock_session is not None:
                    lock_session.rollback()

        if amp_health is None:
            # Nothing (more) to fail over in this cycle.
            break

        LOG.info("Stale amphora's id is: %s", amp_health.amphora_id)
        fut = self.executor.submit(
            self.cw.failover_amphora, amp_health.amphora_id)
        fut.add_done_callback(
            functools.partial(update_stats_on_done, stats)
        )
        futs.append(fut)
        # Cap the number of failovers submitted per cycle.
        if len(futs) == self.threads:
            break

    if futs:
        LOG.info("Waiting for %s failovers to finish", len(futs))
        wait_done_or_dead(futs, self.dead)

    if stats['failover_attempted'] > 0:
        LOG.info("Attempted %s failovers of amphora",
                 stats['failover_attempted'])
        LOG.info("Failed at %s failovers of amphora",
                 stats['failover_failed'])
        LOG.info("Cancelled %s failovers of amphora",
                 stats['failover_cancelled'])
        # Whatever was attempted but neither cancelled nor failed.
        happy_failovers = stats['failover_attempted']
        happy_failovers -= stats['failover_cancelled']
        happy_failovers -= stats['failover_failed']
        LOG.info("Successfully completed %s failovers of amphora",
                 happy_failovers)