예제 #1
0
    def test_controller_node_isolation(self):
        """
        Isolate controller node, expect cluster to be available.

        Steps:
          1. Wait until a controller is reported and the test partition has
             a leader (asserted to be the first replica).
          2. Record the stable ``id_allocator`` configuration.
          3. Isolate the controller node through the failure injector and,
             if that node was also the ``id_allocator`` leader, wait until a
             different node leads ``id_allocator``.
          4. Run one ping-pong with explicit timeout/retries followed by 127
             more round trips.
        """
        def controller_available():
            return self.redpanda.controller() is not None

        admin = Admin(self.redpanda)

        # wait for controller
        wait_until(controller_available,
                   timeout_sec=ELECTION_TIMEOUT * 2,
                   backoff_sec=1)

        initial_leader_id, replicas = self._wait_for_leader()
        assert initial_leader_id == replicas[0]
        self._expect_available()

        allocator_info = admin.wait_stable_configuration(
            "id_allocator",
            namespace="kafka_internal",
            replication=3,
            timeout_s=ELECTION_TIMEOUT * 2)

        # isolate controller
        with FailureInjector(self.redpanda) as fi:
            # Query the controller exactly once: the node we isolate and the
            # id we compare against must refer to the same node, even if
            # controller leadership moves while this test is running.
            controller = self.redpanda.controller()

            # idx() is given the node object itself, matching the idx(n)
            # call used for filtering hosts below. A hostname string is not
            # a node and would presumably not match any entry (ducktape's
            # Service.idx returns -1 for unknown nodes), which would
            # silently skip the id_allocator wait.
            controller_id = self.redpanda.idx(controller)

            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE, controller))

            if allocator_info.leader == controller_id:
                # Only ask the nodes that are not isolated about the new
                # id_allocator leader.
                hosts = [
                    n.account.hostname for n in self.redpanda.nodes
                    if self.redpanda.idx(n) != controller_id
                ]
                admin.await_stable_leader(
                    "id_allocator",
                    namespace="kafka_internal",
                    replication=3,
                    timeout_s=ELECTION_TIMEOUT * 2,
                    hosts=hosts,
                    check=lambda node_id: node_id != controller_id)

        connection = self.ping_pong()
        # First round trip is allowed retries and a longer timeout; the
        # remaining ones use the connection's defaults.
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(127):
            connection.ping_pong()
예제 #2
0
    def test_one_node_down(self):
        """
        Basic high-availability check: stop the node leading our partition.

        After the stop the cluster must still be available, leadership must
        move to the second replica in the replica list, and the raft metrics
        sampled on the two remaining replicas must have moved in the
        expected direction.
        """
        admin_client = Admin(self.redpanda)

        # Locate the current partition leader; it must be the first replica.
        old_leader_id, replica_ids = self._wait_for_leader()
        assert old_leader_id == replica_ids[0]

        self._expect_available()

        allocator_info = admin_client.wait_stable_configuration(
            "id_allocator",
            namespace="kafka_internal",
            replication=3,
            timeout_s=ELECTION_TIMEOUT * 2)

        old_leader_node = self.redpanda.get_node(old_leader_id)
        self.logger.info(
            f"Initial leader {old_leader_id} {old_leader_node.account.hostname}"
        )
        self.logger.info(f"id_allocator leader {allocator_info.leader}")

        # The priority mechanism should reliably pick the next replica in
        # the list as the successor.
        successor_id = replica_ids[1]
        successor_node = self.redpanda.get_node(successor_id)

        # Whatever replica is neither the old leader nor the expected
        # successor acts as a plain observer of the election.
        remaining_ids = set(replica_ids) - {successor_id, old_leader_id}
        bystander_id = remaining_ids.pop()
        bystander_node = self.redpanda.get_node(bystander_id)
        self.logger.info(
            f"Tracking stats on observer node {bystander_id} {bystander_node.account.hostname}"
        )
        self.logger.info(
            f"Tracking stats on expected new leader node {successor_id} {successor_node.account.hostname}"
        )

        # Baseline the raft metrics for this topic on both remaining
        # replicas before the leader is stopped.
        raft_metric_pattern = re.compile("vectorized_raft_.*")
        bystander_metrics = MetricCheck(self.logger, self.redpanda,
                                        bystander_node, raft_metric_pattern,
                                        {'topic': self.topic})
        successor_metrics = MetricCheck(self.logger, self.redpanda,
                                        successor_node, raft_metric_pattern,
                                        {'topic': self.topic})

        self.logger.info(
            f"Stopping node {old_leader_id} ({old_leader_node.account.hostname})"
        )
        self.redpanda.stop_node(old_leader_node)

        if allocator_info.leader == old_leader_id:
            # The stopped node also led id_allocator: wait until one of the
            # other nodes has taken that leadership over.
            def led_by_other_node(node_id):
                return node_id != old_leader_id

            surviving_hosts = []
            for candidate in self.redpanda.nodes:
                if self.redpanda.idx(candidate) != old_leader_id:
                    surviving_hosts.append(candidate.account.hostname)

            admin_client.await_stable_leader(
                "id_allocator",
                namespace="kafka_internal",
                replication=3,
                timeout_s=ELECTION_TIMEOUT * 2,
                hosts=surviving_hosts,
                check=led_by_other_node)

        def is_expected_successor(leader_id):
            return leader_id == successor_id

        current_leader, _ = self._wait_for_leader(is_expected_successor)
        self.logger.info(f"Leadership moved to {current_leader}")

        self._expect_available()

        # Only the direction of metric movement is verified. Exact values
        # (e.g. the number of elections) are not checked because a noisy
        # test environment can blow through even generous timeouts such as
        # the 1500ms election timeout.
        #
        # Stricter checks, which would catch elections needing more rounds
        # than expected, are worth adding once the test environment is less
        # contended.
        def increased(a, b):
            return b > a

        def unchanged(a, b):
            return b == a

        def now_zero(a, b):
            return int(b) == 0

        def now_one(a, b):
            return int(b) == 1

        bystander_expectations = [
            ("vectorized_raft_leadership_changes_total", increased),
            ("vectorized_raft_leader_for", now_zero),
            ("vectorized_raft_received_vote_requests_total", increased),
        ]
        bystander_metrics.expect(bystander_expectations)

        successor_expectations = [
            ("vectorized_raft_leadership_changes_total", increased),
            ("vectorized_raft_leader_for", now_one),
            ("vectorized_raft_received_vote_requests_total", unchanged),
        ]
        successor_metrics.expect(successor_expectations)