示例#1
0
    def health_check(self):
        """Look for stale vThunders and submit a failover for each one.

        The loop runs until ``self.dead`` is set, until the repository
        returns no further stale vThunder, or until ``self.threads``
        failovers have been submitted. Every iteration opens its own
        non-autocommit DB session, fetches one stale vThunder and sets its
        health state to ``'DOWN'``. vThunders without an ``amphora_id`` are
        only logged; vThunders for which ``_is_vthunder_busy`` is true get
        the health state ``'BUSY'`` and are skipped as well.

        When an iteration ends without a candidate (nothing stale, or a
        deadlock / retry request / connection error was handled),
        ``unset_vthunder_busy_health_state`` is called and the loop stops.
        Any other exception is rolled back and re-raised.

        Finally the method blocks in ``wait_done_or_dead`` until the
        submitted failovers finish or ``self.dead`` is set.
        """
        LOG.debug('health_check() starting...')
        futs = []
        while not self.dead.is_set():
            vthunder = None
            lock_session = None
            try:
                lock_session = db_apis.get_session(autocommit=False)
                # Use one timestamp so both cut-offs refer to the same "now".
                now = datetime.datetime.utcnow()
                failover_wait_time = now - datetime.timedelta(
                    seconds=CONF.a10_health_manager.heartbeat_timeout)
                initial_setup_wait_time = now - datetime.timedelta(
                    seconds=CONF.a10_health_manager.failover_timeout)
                vthunder = self.vthunder_repo.get_stale_vthunders(
                    lock_session, initial_setup_wait_time, failover_wait_time)

                if vthunder is not None:
                    self.vthunder_repo.set_vthunder_health_state(
                        lock_session, vthunder.id, 'DOWN')

                    if not vthunder.amphora_id:
                        LOG.info("Hardware vthunder %s heartbeat timeout",
                                 vthunder.vthunder_id)
                        lock_session.commit()
                        continue

                    # Don't failover vthunders which has pending state LBs
                    if self._is_vthunder_busy(lock_session, vthunder):
                        self.vthunder_repo.set_vthunder_health_state(
                            lock_session, vthunder.id, 'BUSY')
                        LOG.info(
                            "vthunder %s heartbeat timeout but it is in "
                            "pending state, skip failover",
                            vthunder.vthunder_id)
                        lock_session.commit()
                        continue

                lock_session.commit()

            # NOTE: lock_session is still None when get_session() itself
            # raised, so every handler checks it before rolling back.
            except db_exc.DBDeadlock:
                vthunder = None
                LOG.debug('Database reports deadlock. Skipping.')
                if lock_session is not None:
                    lock_session.rollback()
            except db_exc.RetryRequest:
                vthunder = None
                LOG.debug('Database is requesting a retry. Skipping.')
                if lock_session is not None:
                    lock_session.rollback()
            except db_exc.DBConnectionError:
                vthunder = None
                db_apis.wait_for_connection(self.dead)
                if lock_session is not None:
                    lock_session.rollback()
                if not self.dead.is_set():
                    # NOTE(review): this reads the [health_manager] option
                    # while the cut-offs above read [a10_health_manager];
                    # confirm that the mix is intended.
                    time.sleep(CONF.health_manager.heartbeat_timeout)
            except Exception as e:
                with excutils.save_and_reraise_exception():
                    LOG.debug("Database error while health_check: %s", e)
                    if lock_session is not None:
                        lock_session.rollback()

            if vthunder is None:
                db_session = db_apis.get_session()
                self.vthunder_repo.unset_vthunder_busy_health_state(db_session)
                break

            LOG.info("Stale vThunder's id is: %s", vthunder.vthunder_id)
            fut = self.executor.submit(self.cw.failover_amphora,
                                       vthunder.vthunder_id)
            futs.append(fut)
            # Never queue more failovers than there are worker threads.
            if len(futs) == self.threads:
                break

        if futs:
            LOG.info("Waiting for %s failovers to finish", len(futs))
            health_manager.wait_done_or_dead(futs, self.dead)
            LOG.info("Successfully completed failover for VThunders.")
示例#2
0
    def health_check(self):
        """Look for stale amphorae and submit a failover for each one.

        The loop runs until ``self.dead`` is set, until
        ``get_stale_amphora`` returns nothing, or until ``self.threads``
        failovers have been submitted. Every iteration opens its own
        non-autocommit DB session. If the stale amphora belongs to a load
        balancer and ``_test_and_set_failover_prov_status`` returns a false
        value, the session is rolled back and the loop ends without
        submitting that amphora.

        A deadlock, retry request or connection error is rolled back and
        ends the loop; any other exception is rolled back and re-raised.

        After the loop the method blocks in ``wait_done_or_dead`` and then
        logs the counters collected in ``stats`` (filled through the
        ``update_stats_on_done`` future callback).
        """
        stats = {
            'failover_attempted': 0,
            'failover_failed': 0,
            'failover_cancelled': 0,
        }
        futs = []
        while not self.dead.is_set():
            amp_health = None
            lock_session = None
            try:
                lock_session = db_api.get_session(autocommit=False)
                amp = None
                amp_health = self.amp_health_repo.get_stale_amphora(
                    lock_session)
                if amp_health:
                    amp = self.amp_repo.get(lock_session,
                                            id=amp_health.amphora_id)
                    # If there is an associated LB, attempt to set it to
                    # PENDING_UPDATE. If it is already immutable, skip the
                    # amphora on this cycle
                    if amp and amp.load_balancer_id:
                        if not self._test_and_set_failover_prov_status(
                                lock_session, amp.load_balancer_id):
                            lock_session.rollback()
                            break
                lock_session.commit()
            # NOTE: lock_session is still None when get_session() itself
            # raised, so every handler checks it before rolling back.
            except db_exc.DBDeadlock:
                LOG.debug('Database reports deadlock. Skipping.')
                if lock_session is not None:
                    lock_session.rollback()
                amp_health = None
            except db_exc.RetryRequest:
                LOG.debug('Database is requesting a retry. Skipping.')
                if lock_session is not None:
                    lock_session.rollback()
                amp_health = None
            except db_exc.DBConnectionError:
                db_api.wait_for_connection(self.dead)
                if lock_session is not None:
                    lock_session.rollback()
                amp_health = None
                if not self.dead.is_set():
                    # amphora heartbeat timestamps should also be outdated
                    # while DB is unavailable and soon after DB comes back
                    # online. Sleeping off the full "heartbeat_timeout"
                    # interval to give the amps a chance to check in before
                    # we start failovers.
                    time.sleep(CONF.health_manager.heartbeat_timeout)
            except Exception:
                with excutils.save_and_reraise_exception():
                    if lock_session is not None:
                        lock_session.rollback()

            if amp_health is None:
                break

            LOG.info("Stale amphora's id is: %s", amp_health.amphora_id)
            fut = self.executor.submit(self.cw.failover_amphora,
                                       amp_health.amphora_id)
            fut.add_done_callback(
                functools.partial(update_stats_on_done, stats))
            futs.append(fut)
            # Never queue more failovers than there are worker threads.
            if len(futs) == self.threads:
                break
        if futs:
            LOG.info("Waiting for %s failovers to finish", len(futs))
            wait_done_or_dead(futs, self.dead)
        if stats['failover_attempted'] > 0:
            LOG.info("Attempted %s failovers of amphora",
                     stats['failover_attempted'])
            LOG.info("Failed at %s failovers of amphora",
                     stats['failover_failed'])
            LOG.info("Cancelled %s failovers of amphora",
                     stats['failover_cancelled'])
            # Whatever was attempted and neither cancelled nor failed.
            happy_failovers = (stats['failover_attempted']
                               - stats['failover_cancelled']
                               - stats['failover_failed'])
            LOG.info("Successfully completed %s failovers of amphora",
                     happy_failovers)
示例#3
0
    def health_check(self):
        """Look for stale amphorae and submit a failover for each one.

        The loop runs until ``self.dead`` is set, until
        ``get_stale_amphora`` returns nothing, or until ``self.threads``
        failovers have been submitted. Every iteration opens its own
        non-autocommit DB session. If the stale amphora belongs to a load
        balancer and ``_test_and_set_failover_prov_status`` returns a false
        value, the session is rolled back and the loop ends without
        submitting that amphora.

        A deadlock, retry request or connection error is rolled back and
        ends the loop; any other exception is rolled back and re-raised.

        After the loop the method blocks in ``wait_done_or_dead`` and then
        logs the counters collected in ``stats`` (filled through the
        ``update_stats_on_done`` future callback).
        """
        stats = {
            'failover_attempted': 0,
            'failover_failed': 0,
            'failover_cancelled': 0,
        }
        futs = []
        while not self.dead.is_set():
            amp_health = None
            # Reset on every iteration: if get_session() raises, the
            # handlers below must neither hit an unbound name nor roll back
            # the session left over from the previous iteration.
            lock_session = None
            try:
                lock_session = db_api.get_session(autocommit=False)
                amp = None
                amp_health = self.amp_health_repo.get_stale_amphora(
                    lock_session)
                if amp_health:
                    amp = self.amp_repo.get(lock_session,
                                            id=amp_health.amphora_id)
                    # If there is an associated LB, attempt to set it to
                    # PENDING_UPDATE. If it is already immutable, skip the
                    # amphora on this cycle
                    if amp and amp.load_balancer_id:
                        if not self._test_and_set_failover_prov_status(
                                lock_session, amp.load_balancer_id):
                            lock_session.rollback()
                            break
                lock_session.commit()
            except db_exc.DBDeadlock:
                LOG.debug('Database reports deadlock. Skipping.')
                if lock_session is not None:
                    lock_session.rollback()
                amp_health = None
            except db_exc.RetryRequest:
                LOG.debug('Database is requesting a retry. Skipping.')
                if lock_session is not None:
                    lock_session.rollback()
                amp_health = None
            except db_exc.DBConnectionError:
                db_api.wait_for_connection(self.dead)
                if lock_session is not None:
                    lock_session.rollback()
                amp_health = None
                if not self.dead.is_set():
                    # amphora heartbeat timestamps should also be outdated
                    # while DB is unavailable and soon after DB comes back
                    # online. Sleeping off the full "heartbeat_timeout"
                    # interval to give the amps a chance to check in before
                    # we start failovers.
                    time.sleep(CONF.health_manager.heartbeat_timeout)
            except Exception:
                with excutils.save_and_reraise_exception():
                    if lock_session is not None:
                        lock_session.rollback()

            if amp_health is None:
                break

            LOG.info("Stale amphora's id is: %s", amp_health.amphora_id)
            fut = self.executor.submit(
                self.cw.failover_amphora, amp_health.amphora_id)
            fut.add_done_callback(
                functools.partial(update_stats_on_done, stats)
            )
            futs.append(fut)
            # Never queue more failovers than there are worker threads.
            if len(futs) == self.threads:
                break
        if futs:
            LOG.info("Waiting for %s failovers to finish",
                     len(futs))
            wait_done_or_dead(futs, self.dead)
        if stats['failover_attempted'] > 0:
            LOG.info("Attempted %s failovers of amphora",
                     stats['failover_attempted'])
            LOG.info("Failed at %s failovers of amphora",
                     stats['failover_failed'])
            LOG.info("Cancelled %s failovers of amphora",
                     stats['failover_cancelled'])
            # Whatever was attempted and neither cancelled nor failed.
            happy_failovers = (stats['failover_attempted']
                               - stats['failover_cancelled']
                               - stats['failover_failed'])
            LOG.info("Successfully completed %s failovers of amphora",
                     happy_failovers)