示例#1
0
    def test_id_allocator_leader_isolation(self):
        """
        Check availability while the `kafka_internal/id_allocator` leader is
        isolated.

        The leader of partition 0 of the id allocator topic is looked up,
        that node is isolated, and a series of ping-pong round trips is
        executed while the failure is in place.
        """
        admin = Admin(self.redpanda)
        self._expect_available()

        # Wait for the id allocator partition to settle, then look up
        # its current leader.
        admin.wait_stable_configuration(namespace='kafka_internal',
                                        topic='id_allocator',
                                        replication=3)
        leader_id = admin.get_partition_leader(namespace='kafka_internal',
                                               topic='id_allocator',
                                               partition=0)
        leader = self.redpanda.get_node(leader_id)
        self.logger.info(
            f"kafka_internal/id_allocator/0 leader: {leader_id}, node: {leader.account.hostname}"
        )

        self._expect_available()

        with FailureInjector(self.redpanda) as injector:
            # isolate the id_allocator leader
            isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                    self.redpanda.get_node(leader_id))
            injector.inject_failure(isolation)

            # expect messages to be produced and consumed without a timeout;
            # the first round trip is given explicit timeout/retry settings.
            connection = self.ping_pong()
            connection.ping_pong(timeout_s=10, retries=10)
            for _ in range(127):
                connection.ping_pong()
    def test_follower_isolation(self):
        """
        Isolate a follower of our partition and check that the cluster
        stays available.

        The partition leader is expected to be the first replica; the
        second replica is the node that gets isolated.
        """
        # Find which node is the leader
        leader_id, replica_ids = self._wait_for_leader()
        assert leader_id == replica_ids[0]

        self._expect_available()

        leader = self.redpanda.get_node(leader_id)
        self.logger.info(f"Initial leader {leader_id} {leader.account.hostname}")

        with FailureInjector(self.redpanda) as injector:
            # isolate one of the followers
            isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                    self.redpanda.get_node(replica_ids[1]))
            injector.inject_failure(isolation)

            # expect messages to be produced and consumed without a timeout
            for _ in range(128):
                self._ping_pong()
示例#3
0
    def test_recovery_after_catastrophic_failure(self):
        """
        Validate records after a permanent failure on one node combined
        with a transient failure on another node.

        A three node cluster is started, a six partition topic with
        replication factor three is created, and one producer and one
        consumer are started. A failure of a randomly chosen type is then
        injected (with no length given) on one of the first two nodes,
        followed by a time-limited failure of a randomly chosen type on the
        third node. Finally the records are validated.
        """
        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 1,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        spec = TopicSpec(name="test-topic",
                         partition_count=6,
                         replication_factor=3)

        self.redpanda.create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=10000)
        self.start_consumer(1)
        self.await_startup()

        # inject permanent random failure
        # Pick among the first two nodes: a `[0:1]` slice holds a single
        # element and would make the random choice always land on node 0.
        # The third node is left out because it receives the transient
        # failure below.
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             random.choice(self.redpanda.nodes[0:2]))

        self.inject_failure(f_spec)

        # inject transient failure on other node
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             self.redpanda.nodes[2],
                             length=2.0 if self.scale.local else 15.0)

        self.inject_failure(f_spec)

        self.validate_records()
示例#4
0
    def test_controller_node_isolation(self):
        """
        Isolate controller node, expect cluster to be available

        If the isolated controller also leads the `kafka_internal`
        id allocator partition, the test waits until a different node has
        become that partition's leader before running the ping-pong round
        trips.
        """
        def controller_available():
            return self.redpanda.controller() is not None

        admin = Admin(self.redpanda)

        # wait for controller
        wait_until(controller_available,
                   timeout_sec=ELECTION_TIMEOUT * 2,
                   backoff_sec=1)

        initial_leader_id, replicas = self._wait_for_leader()
        assert initial_leader_id == replicas[0]
        self._expect_available()

        allocator_info = admin.wait_stable_configuration(
            "id_allocator",
            namespace="kafka_internal",
            replication=3,
            timeout_s=ELECTION_TIMEOUT * 2)

        # isolate controller
        with FailureInjector(self.redpanda) as fi:
            # Resolve the controller once so that the id we compare against
            # and the node we isolate are guaranteed to be the same node.
            controller = self.redpanda.controller()
            # idx() is handed the node object here, exactly as it is in the
            # host filter below; the two ids are compared with each other.
            controller_id = self.redpanda.idx(controller)
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE, controller))

            if allocator_info.leader == controller_id:
                # Only query the nodes that are still reachable.
                hosts = [
                    n.account.hostname for n in self.redpanda.nodes
                    if self.redpanda.idx(n) != controller_id
                ]
                admin.await_stable_leader(
                    "id_allocator",
                    namespace="kafka_internal",
                    replication=3,
                    timeout_s=ELECTION_TIMEOUT * 2,
                    hosts=hosts,
                    check=lambda node_id: node_id != controller_id)

        # NOTE(review): these round trips run after the FailureInjector
        # context has exited - confirm whether they are meant to run while
        # the controller is still isolated.
        connection = self.ping_pong()
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(127):
            connection.ping_pong()
示例#5
0
    def test_follower_isolation(self):
        """
        Isolate a follower of our partition and validate that the cluster
        remains available afterwards.

        The isolated replica is chosen so that it is neither the leader of
        our partition nor the leader of the `kafka_internal` id allocator
        partition.
        """
        admin = Admin(self.redpanda)
        # Find which node is the leader
        initial_leader_id, replicas = self._wait_for_leader()
        assert initial_leader_id == replicas[0]

        self._expect_available()

        leader_node = self.redpanda.get_node(initial_leader_id)
        self.logger.info(
            f"Initial leader {initial_leader_id} {leader_node.account.hostname}"
        )

        allocator_info = admin.wait_stable_configuration(
            "id_allocator",
            namespace="kafka_internal",
            replication=3,
            timeout_s=ELECTION_TIMEOUT * 2)

        # First replica that leads neither our partition nor the id
        # allocator partition.
        follower = next(
            (replica for replica in replicas
             if replica != initial_leader_id
             and replica != allocator_info.leader), None)

        assert follower is not None, \
            "no replica available that is not a partition/id_allocator leader"

        with FailureInjector(self.redpanda) as fi:
            # isolate one of the followers
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE,
                            self.redpanda.get_node(follower)))

            # expect messages to be produced and consumed without a timeout
            connection = self.ping_pong()
            connection.ping_pong(timeout_s=10, retries=10)
            for _ in range(127):
                connection.ping_pong()
示例#6
0
    def do_action(self):
        """
        Inject a kill failure on the node returned by `self.target_node()`.

        The node is recorded in `self.affected_nodes` and
        `self.last_affected_node`, and removed from the service's started
        nodes.

        Returns the node the failure was injected on, or None when
        `target_node()` yields no node.
        """
        node = self.target_node()
        if not node:
            # Logger.warn is a deprecated alias of Logger.warning and no
            # longer exists on Python 3.13+.
            self.redpanda.logger.warning('no usable node')
            return None

        self.redpanda.logger.info(
            f'executing action on {node.account.hostname}')
        self.failure_injector.inject_failure(
            FailureSpec(FailureSpec.FAILURE_KILL, node))
        self.affected_nodes.add(node)
        self.last_affected_node = node

        # Update started_nodes so storage validations are run
        # on the correct set of nodes later.
        self.redpanda.remove_from_started_nodes(node)
        return node
    def test_controller_node_isolation(self):
        """
        Isolate the controller node, then run a series of ping-pong round
        trips to check that the cluster is available.
        """
        # wait for controller
        wait_until(lambda: self.redpanda.controller() is not None,
                   timeout_sec=10,
                   backoff_sec=1)

        # isolate controller
        with FailureInjector(self.redpanda) as injector:
            isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                    self.redpanda.controller())
            injector.inject_failure(isolation)

        for _ in range(128):
            self._ping_pong()
示例#8
0
    def test_metadata_request_does_not_contain_failed_node(
            self, failure, node):
        """
        Check if broker list returned from metadata request does not contain node
        which is not alive

        failure: "isolate" isolates the chosen node; any other value stops
            it.
        node: 'controller' targets the current controller; any other value
            targets a node that is not the controller.
        """
        # validate initial conditions
        # The predicate has to be evaluated on every poll. Returning an
        # attribute from the lambda without calling it would be truthy
        # straight away if that attribute is a method.
        wait_until(lambda: self.redpanda.controller() is not None, 10, 1)
        rpk = RpkTool(self.redpanda)
        nodes = rpk.cluster_info()
        assert len(nodes) == 3
        redpanda_ids = [self.redpanda.idx(n) for n in self.redpanda.nodes]
        node_ids = [n.id for n in nodes]
        assert sorted(redpanda_ids) == sorted(node_ids)

        def get_node():
            if node == 'controller':
                return self.redpanda.controller()
            # Start from the first node and re-draw at random for as long
            # as the candidate is the controller.
            n = self.redpanda.nodes[0]
            while n == self.redpanda.controller():
                n = random.choice(self.redpanda.nodes)
            return n

        # Use a separate name so the `node` parameter, which get_node()
        # reads, is not rebound.
        target = get_node()
        target_id = self.redpanda.idx(target)
        self.redpanda.logger.info(
            f"Injecting failure on node {target.account.hostname} with id: {target_id}"
        )
        with FailureInjector(self.redpanda) as fi:
            if failure == "isolate":
                fi.inject_failure(
                    FailureSpec(FailureSpec.FAILURE_ISOLATE, target))
            else:
                self.redpanda.stop_node(target)

            def contains_only_alive_nodes():
                alive = rpk.cluster_info()
                returned_ids = [n.id for n in alive]
                return len(alive) == 2 and target_id not in returned_ids

            wait_until(contains_only_alive_nodes, 30, 1)
        def failure_injector_loop():
            """
            Keep injecting randomly chosen failures, pausing 20 to 45
            seconds after each one, for as long as `enable_failures` is
            truthy.
            """
            injector = FailureInjector(self.redpanda)
            while enable_failures:
                kind = random.choice(FailureSpec.FAILURE_TYPES)
                is_suspend = kind == FailureSpec.FAILURE_SUSPEND
                if not is_suspend:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    duration = 0
                    active_id = random.choice(list(self.active_nodes))
                    # the drawn id is used as a 1-based position in the node list
                    victim = self.redpanda.nodes[active_id - 1]
                else:
                    # allow suspending any node
                    duration = random.randint(1, 10)
                    victim = random.choice(self.redpanda.nodes)

                spec = FailureSpec(node=victim, type=kind, length=duration)
                injector.inject_failure(spec)

                pause = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {pause} seconds before next failure")
                time.sleep(pause)
示例#10
0
        def failure_injector_loop():
            """
            Keep injecting randomly chosen failures for as long as
            `enable_failures` is truthy.

            Suspend durations and the pause between failures are drawn
            from the limits configured on NodeOperationFuzzyTest.
            """
            injector = FailureInjector(self.redpanda)
            while enable_failures:
                kind = random.choice(FailureSpec.FAILURE_TYPES)
                is_suspend = kind == FailureSpec.FAILURE_SUSPEND
                if not is_suspend:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    duration = 0
                    active_id = random.choice(list(self.active_nodes))
                    victim = self.redpanda.get_node(active_id)
                else:
                    # allow suspending any node
                    duration = random.randint(
                        1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                    victim = random.choice(self.redpanda.nodes)

                spec = FailureSpec(node=victim, type=kind, length=duration)
                injector.inject_failure(spec)

                pause = random.randint(
                    NodeOperationFuzzyTest.min_inter_failure_time,
                    NodeOperationFuzzyTest.max_inter_failure_time)
                self.redpanda.logger.info(
                    f"waiting {pause} seconds before next failure")
                time.sleep(pause)
示例#11
0
        def failure_injector_loop():
            """
            Keep injecting randomly chosen failures on nodes whose index is
            not in `suppressed`, for as long as `failures` is truthy.

            Suspend durations and the pause between failures are drawn
            from the limits configured on ConsumerOffsetsMigrationTest.
            """
            injector = FailureInjector(self.redpanda)
            while failures:
                kind = random.choice(FailureSpec.FAILURE_TYPES)

                # Draw nodes until one comes up that is not suppressed.
                while True:
                    victim = random.choice(self.redpanda.nodes)
                    if self.redpanda.idx(victim) not in suppressed:
                        break

                # allow suspending any node; only suspends carry a length
                if kind == FailureSpec.FAILURE_SUSPEND:
                    duration = random.randint(
                        1,
                        ConsumerOffsetsMigrationTest.max_suspend_duration_sec)
                else:
                    duration = 0

                spec = FailureSpec(node=victim, type=kind, length=duration)
                injector.inject_failure(spec)

                pause = random.randint(
                    ConsumerOffsetsMigrationTest.min_inter_failure_time_sec,
                    ConsumerOffsetsMigrationTest.max_inter_failure_time_sec)
                self.redpanda.logger.info(
                    f"waiting {pause} seconds before next failure")
                time.sleep(pause)
示例#12
0
    def random_failure_spec(self):
        """
        Build a FailureSpec from a random failure type and a random node.

        The type is drawn from `self.allowed_failures`; the length comes
        from `self.failure_length_provider` and the candidate nodes from
        `self.allowed_nodes_provider`, both called with the drawn type.
        """
        failure_type = random.choice(self.allowed_failures)
        duration = self.failure_length_provider(failure_type)
        candidates = self.allowed_nodes_provider(failure_type)
        target = random.choice(candidates)

        return FailureSpec(node=target, type=failure_type, length=duration)