Example #1
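The snippet below is a method excerpt from a ducktape-based Kafka system test, so it is not runnable on its own. Roughly the following imports would be needed; the module paths follow the kafkatest system-test layout and are an assumption here, not part of the original excerpt:

# Assumed imports for this excerpt (paths per the kafkatest layout; adjust to your tree).
from ducktape.utils.util import wait_until
from kafkatest.services.trogdor.network_partition_fault_spec import NetworkPartitionFaultSpec
from kafkatest.services.trogdor.task_spec import TaskSpec
from kafkatest.utils.remote_account import node_is_reachable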
 def test_network_partition_fault(self):
     """
     Test that the network partition fault results in a true network partition between nodes.
     """
     self.set_up_trogdor(3)
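     # Partition agent 0 away from agents 1 and 2 for the task's maximum duration.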
     spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                         [[self.agent_nodes[0]], self.agent_nodes[1:]])
     partitions = spec.message["partitions"]
     assert 2 == len(partitions)
     assert [self.agent_nodes[0].name] == partitions[0]
     assert [self.agent_nodes[1].name, self.agent_nodes[2].name] == partitions[1]
     self.trogdor.create_task("partition0", spec)
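     # Poll until traffic across the partition is actually blocked.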
     def verify_nodes_partitioned():
         if node_is_reachable(self.agent_nodes[0], self.agent_nodes[1]):
             return False
         if node_is_reachable(self.agent_nodes[1], self.agent_nodes[0]):
             return False
         if node_is_reachable(self.agent_nodes[2], self.agent_nodes[0]):
             return False
         return True
     wait_until(verify_nodes_partitioned,
                timeout_sec=10, backoff_sec=.2, err_msg="Failed to verify that the nodes were partitioned.")
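     # Sanity check: node 0 can still reach itself, and nodes 1 and 2 can still reach each other.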
     if not node_is_reachable(self.agent_nodes[0], self.agent_nodes[0]):
         raise RuntimeError("Node 0 must be reachable from itself.")
     if not node_is_reachable(self.agent_nodes[1], self.agent_nodes[2]):
         raise RuntimeError("Node 2 must be reachable from node 1.")
     if not node_is_reachable(self.agent_nodes[2], self.agent_nodes[1]):
         raise RuntimeError("Node 1 must be reachable from node 2.")
Example #2
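As with the first example, these are method excerpts from ducktape-based Kafka system tests. Roughly the following imports are assumed; again the module paths follow the kafkatest layout and are an assumption, not part of the original excerpts:

# Assumed imports for these excerpts (paths per the kafkatest layout; adjust to your tree).
import time
from ducktape.cluster.remoteaccount import RemoteCommandError
from ducktape.utils.util import wait_until
from kafkatest.services.kafka import quorum
from kafkatest.services.trogdor.network_partition_fault_spec import NetworkPartitionFaultSpec
from kafkatest.services.trogdor.task_spec import TaskSpec
from kafkatest.services.trogdor.trogdor import TrogdorService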
    def test_produce_consume_with_client_partition(self):
        """
        Run the round-trip produce/consume workload while the client (workload) node is
        partitioned away from the brokers and ZooKeeper nodes for 60 seconds.
        """
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        # Partition the workload node from all broker and ZooKeeper nodes.
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()

    def test_round_trip_workload_with_broker_partition(self, metadata_quorum=quorum.zk):
        """
        Run the round-trip produce/consume workload while one broker is partitioned away
        from the other brokers, the workload node, and the remote quorum nodes until the
        fault task is stopped.
        """
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        # Partition the first broker from the rest of the cluster and the client.
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(
            0, TaskSpec.MAX_DURATION_MS, [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_replication_with_replica_failure(self, metadata_quorum=quorum.zk):
        """
        This test verifies that replication shrinks the ISR when a replica is not fetching anymore.
        It also verifies that replication provides simple durability guarantees by checking that data acked by
        brokers is still available for consumption.

        Setup: 1 zk/KRaft controller, 3 kafka nodes, 1 topic with partitions=1, replication-factor=3, and min.insync.replicas=2
          - Produce messages in the background
          - Consume messages in the background
          - Partition a follower
          - Validate that the ISR was shrunk
          - Stop producing and finish consuming
          - Validate that every acked message was consumed
        """
        self.create_zookeeper_if_necessary()
        if self.zk:
            self.zk.start()

        self.create_kafka(
            num_nodes=3,
            server_prop_overrides=[["replica.lag.time.max.ms", "10000"]],
            controller_num_nodes_override=1)
        self.kafka.start()

        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka])
        self.trogdor.start()

        # If ZK is used, the partition leader is put on the controller node
        # to avoid partitioning the controller later on in the test.
        if self.zk:
            controller = self.kafka.controller()
            assignment = [self.kafka.idx(controller)] + [
                self.kafka.idx(node)
                for node in self.kafka.nodes if node != controller
            ]
        else:
            assignment = [self.kafka.idx(node) for node in self.kafka.nodes]

        self.topic = "test_topic"
        self.kafka.create_topic({
            "topic": self.topic,
            "replica-assignment": ":".join(map(str, assignment)),
            "configs": {"min.insync.replicas": 2}
        })

        self.logger.info("Created topic %s with assignment %s", self.topic,
                         ", ".join(map(str, assignment)))

        self.create_producer()
        self.producer.start()

        self.create_consumer()
        self.consumer.start()

        self.await_startup()

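        # Look up the current partition leader and replica set.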
        leader = self.kafka.leader(self.topic, partition=0)
        replicas = self.kafka.replicas(self.topic, partition=0)

        # One of the followers is picked to be partitioned.
        follower_to_partition = [
            replica for replica in replicas if replica != leader
        ][0]
        self.logger.info(
            "Partitioning follower %s (%s) from the other brokers",
            self.kafka.idx(follower_to_partition), follower_to_partition.name)
        partition_spec = NetworkPartitionFaultSpec(
            0, 5 * 60 * 1000,
            [[follower_to_partition],
             [
                 node
                 for node in self.kafka.nodes if node != follower_to_partition
             ]])
        partition = self.trogdor.create_task("partition", partition_spec)

        def current_isr():
            try:
                # Due to the network partition, the kafka-topics command could fail if it tries
                # to connect to the partitioned broker. Therefore we catch the error here and retry.
                return set(
                    self.kafka.isr_idx_list(
                        self.topic,
                        partition=0,
                        node=leader,
                        offline_nodes=[follower_to_partition]))
            except RemoteCommandError:
                return set()

        # Verify that ISR is shrunk.
        expected_isr = {
            self.kafka.idx(replica)
            for replica in replicas if replica != follower_to_partition
        }
        wait_until(lambda: current_isr() == expected_isr,
                   timeout_sec=120,
                   backoff_sec=1,
                   err_msg="ISR should have been shrunk.")

        # Wait until the network partition is removed.
        partition.stop()
        partition.wait_for_done(timeout_sec=300)

        # Verify that ISR is expanded.
        expected_isr = {self.kafka.idx(replica) for replica in replicas}
        wait_until(lambda: current_isr() == expected_isr,
                   timeout_sec=120,
                   backoff_sec=1,
                   err_msg="ISR should have been expanded.")

        self.run_validation(producer_timeout_sec=120, min_records=25000)
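The closing run_validation call ties the fault back to the durability claim in the docstring: it verifies that every message the producer saw acked was ultimately consumed.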