Example #1
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(ConsumeBenchTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context,
                                num_nodes=3) if quorum.for_test(
                                    test_context) == quorum.zk else None
     self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
     self.producer_workload_service = ProduceBenchWorkloadService(
         test_context, self.kafka)
     self.consumer_workload_service = ConsumeBenchWorkloadService(
         test_context, self.kafka)
     self.consumer_workload_service_2 = ConsumeBenchWorkloadService(
         test_context, self.kafka)
     self.active_topics = {
         "consume_bench_topic[0-5]": {
             "numPartitions": 5,
             "replicationFactor": 3
         }
     }
     self.trogdor = TrogdorService(context=self.test_context,
                                   client_services=[
                                       self.kafka,
                                       self.producer_workload_service,
                                       self.consumer_workload_service,
                                       self.consumer_workload_service_2
                                   ])
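All snippets on this page are shown without their import blocks. A minimal preamble that would make an example like the one above resolve is sketched below; the module paths are an assumption based on the standard Apache Kafka kafkatest/ducktape layout, not something the snippets themselves state.

# Assumed imports for the TrogdorService examples on this page; module paths
# follow the Apache Kafka "kafkatest" system-test layout and may differ in
# other projects (for instance the Redpanda-based examples further down).
import json
import time

from ducktape.tests.test import Test
from ducktape.utils.util import wait_until

from kafkatest.services.kafka import KafkaService, quorum
from kafkatest.services.zookeeper import ZookeeperService
from kafkatest.services.trogdor.trogdor import TrogdorService
from kafkatest.services.trogdor.task_spec import TaskSpec
from kafkatest.services.trogdor.produce_bench_workload import \
    ProduceBenchWorkloadService, ProduceBenchWorkloadSpec
from kafkatest.services.trogdor.consume_bench_workload import \
    ConsumeBenchWorkloadService, ConsumeBenchWorkloadSpec
# The fault specs (NoOpTaskSpec, NetworkPartitionFaultSpec, ProcessStopFaultSpec,
# DegradedNetworkFaultSpec) and RoundTripWorkloadService/Spec used by later
# examples live in sibling kafkatest.services.trogdor.* modules.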
Example #2
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(RoundTripFaultTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
     self.workload_service = RoundTripWorkloadService(
         test_context, self.kafka)
     self.trogdor = TrogdorService(
         context=self.test_context,
         client_services=[self.zk, self.kafka, self.workload_service])
     topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
     RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
     active_topics = {
         topic_name: {
             "partitionAssignments": {
                 "0": [0, 1, 2]
             }
         }
     }
     self.round_trip_spec = RoundTripWorkloadSpec(
         0,
         TaskSpec.MAX_DURATION_MS,
         self.workload_service.client_node,
         self.workload_service.bootstrap_servers,
         target_messages_per_sec=10000,
         max_messages=100000,
         active_topics=active_topics)
Example #3
 def set_up_trogdor(self, num_agent_nodes):
     self.agent_nodes = self.test_context.cluster.alloc(
         ClusterSpec.simple_linux(num_agent_nodes))
     self.trogdor = TrogdorService(context=self.test_context,
                                   agent_nodes=self.agent_nodes)
     for agent_node in self.agent_nodes:
         agent_node.account.logger = self.trogdor.logger
     self.trogdor.start()
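The set_up_trogdor helper above only allocates agent nodes and starts the daemon. The sketch below strings together the task lifecycle that the benchmark and fault examples on this page all repeat; it uses only calls that appear in the other examples (create_task, tasks, stop, wait_for_done), and run_noop_task is a hypothetical method name added for illustration.

 def run_noop_task(self):
     # Hypothetical follow-up to set_up_trogdor(): submit a task, inspect the
     # coordinator's task list, then stop the task and the service.
     self.set_up_trogdor(3)
     spec = NoOpTaskSpec(0, TaskSpec.MAX_DURATION_MS)       # starts now, open-ended duration
     task = self.trogdor.create_task("noop-task", spec)     # register with the coordinator
     self.logger.info("tasks = %s" % self.trogdor.tasks())  # poll all known tasks
     task.stop()                                            # request that the task be ended
     task.wait_for_done(timeout_sec=60)                     # block until it is actually done
     self.trogdor.stop()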
Example #4
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(ProduceBenchTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
     self.workload_service = ProduceBenchWorkloadService(
         test_context, self.kafka)
     self.trogdor = TrogdorService(
         context=self.test_context,
         client_services=[self.kafka, self.workload_service])
Example #5
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])
        self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000,
                                        max_messages=100000,
                                        producer_conf={},
                                        admin_client_conf={},
                                        common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_produce_bench_transactions(self):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000,
                                        max_messages=100000,
                                        producer_conf={},
                                        admin_client_conf={},
                                        common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics,
                                        transaction_generator={
                                            # 10 transactions with 10k messages
                                            "type": "uniform",
                                            "messagesPerTransaction": "10000"
                                        })
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
Example #6
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(ProduceBenchTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
     self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
     self.trogdor = TrogdorService(context=self.test_context,
                                   client_services=[self.kafka, self.workload_service])
Example #7
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(RoundTripFaultTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
     self.workload_service = RoundTripWorkloadService(
         test_context, self.kafka)
     self.trogdor = TrogdorService(
         context=self.test_context,
         client_services=[self.zk, self.kafka, self.workload_service])
     self.round_trip_spec = RoundTripWorkloadSpec(
         0,
         TaskSpec.MAX_DURATION_MS,
         self.workload_service.client_node,
         self.workload_service.bootstrap_servers,
         target_messages_per_sec=10000,
         partition_assignments={0: [0, 1, 2]},
         max_messages=100000)
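Note that this example passes the partition layout directly as partition_assignments, whereas Example #2 expresses the same layout through an active_topics entry with a "partitionAssignments" map; the two snippets presumably target different versions of the RoundTripWorkloadSpec constructor, so match whichever signature your checkout provides. Side by side, drawn from the two surrounding examples:

# Style used in Example #2: layout embedded in active_topics.
spec_topics = RoundTripWorkloadSpec(
    0, TaskSpec.MAX_DURATION_MS,
    self.workload_service.client_node,
    self.workload_service.bootstrap_servers,
    target_messages_per_sec=10000,
    max_messages=100000,
    active_topics={"round_trip_topic0": {"partitionAssignments": {"0": [0, 1, 2]}}})

# Style used in this example: layout passed as a keyword argument.
spec_assignments = RoundTripWorkloadSpec(
    0, TaskSpec.MAX_DURATION_MS,
    self.workload_service.client_node,
    self.workload_service.bootstrap_servers,
    target_messages_per_sec=10000,
    partition_assignments={0: [0, 1, 2]},
    max_messages=100000)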
Example #8
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(ProduceBenchTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
     self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
     self.trogdor = TrogdorService(context=self.test_context,
                                   client_services=[self.kafka, self.workload_service])
     self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
     self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
Example #9
    def test_produce_consume(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "replicas_produce_consume_%d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)

        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka)
        consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka)
        trogdor = TrogdorService(context=self.test_context,
                                 client_services=[self.kafka, producer_workload_service, consumer_workload_service])
        trogdor.start()

        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                producer_workload_service.producer_node,
                                                producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics={"replicas_produce_consume_[0-2]": {
                                                    "numPartitions": partition_count, "replicationFactor": replication_factor
                                                }})
        produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed produce bench")

        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                consumer_workload_service.consumer_node,
                                                consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["replicas_produce_consume_[0-2]"])
        consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed consume bench")

        trogdor.stop()
Example #10
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(ProduceBenchTest, self).__init__(test_context)
     self.redpanda = RedpandaService(test_context, num_nodes=3)
     self.workload_service = ProduceBenchWorkloadService(
         test_context, self.redpanda)
     self.trogdor = TrogdorService(
         context=self.test_context,
         client_services=[self.redpanda, self.workload_service])
     self.active_topics = {
         "produce_bench_topic[0-1]": {
             "numPartitions": 1,
             "replicationFactor": 3
         }
     }
     self.inactive_topics = {
         "produce_bench_topic[2-9]": {
             "numPartitions": 1,
             "replicationFactor": 3
         }
     }
Example #11
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.redpanda = RedpandaService(test_context, num_nodes=3)
        self.workload_service = ProduceBenchWorkloadService(
            test_context, self.redpanda)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.redpanda, self.workload_service])
        self.active_topics = {
            "produce_bench_topic[0-1]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        self.inactive_topics = {
            "produce_bench_topic[2-9]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }

    def setUp(self):
        self.trogdor.start()
        self.redpanda.start()

    def teardown(self):
        self.trogdor.stop()
        self.redpanda.stop()

    @cluster(num_nodes=3)
    def test_produce_bench(self):
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=self.inactive_topics,
            active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)

        # the trogdor service logs all requests() operations to INFO level,
        # which is too verbose. We explicitly change the level to WARNING and
        # set it back after the wait_for_done function returns
        self.trogdor.logger.setLevel('WARNING')

        workload1.wait_for_done(timeout_sec=360)

        # set it back to info
        self.trogdor.logger.setLevel('INFO')

        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))
Example #12
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(RoundTripFaultTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context,
                                num_nodes=3) if quorum.for_test(
                                    test_context) == quorum.zk else None
     self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
     self.workload_service = RoundTripWorkloadService(
         test_context, self.kafka)
     if quorum.for_test(test_context) == quorum.zk:
         trogdor_client_services = [
             self.zk, self.kafka, self.workload_service
         ]
     elif quorum.for_test(test_context) == quorum.remote_kraft:
         trogdor_client_services = [
             self.kafka.controller_quorum, self.kafka, self.workload_service
         ]
     else:  #co-located case, which we currently don't test but handle here for completeness in case we do test it
         trogdor_client_services = [self.kafka, self.workload_service]
     self.trogdor = TrogdorService(context=self.test_context,
                                   client_services=trogdor_client_services)
     topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
     RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
     # note that the broker.id values will be 1..num_nodes
     active_topics = {
         topic_name: {
             "partitionAssignments": {
                 "0": [1, 2, 3]
             }
         }
     }
     self.round_trip_spec = RoundTripWorkloadSpec(
         0,
         TaskSpec.MAX_DURATION_MS,
         self.workload_service.client_node,
         self.workload_service.bootstrap_servers,
         target_messages_per_sec=10000,
         max_messages=100000,
         active_topics=active_topics)
Example #13
 def __init__(self, test_context):
     """:type test_context: ducktape.tests.test.TestContext"""
     super(RoundTripFaultTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
     self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
     self.trogdor = TrogdorService(context=self.test_context,
                                   client_services=[self.zk, self.kafka, self.workload_service])
     topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
     RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
     active_topics={topic_name : {"partitionAssignments":{"0": [0,1,2]}}}
     self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                  self.workload_service.client_node,
                                  self.workload_service.bootstrap_servers,
                                  target_messages_per_sec=10000,
                                  max_messages=100000,
                                  active_topics=active_topics)
Example #14
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(
            test_context, self.kafka)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.kafka, self.workload_service])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        active_topics = {
            "produce_bench_topic[0-1]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        inactive_topics = {
            "produce_bench_topic[2-9]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=inactive_topics,
            active_topics=active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))
Example #15
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context,
                                   num_nodes=3) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(
            test_context, self.kafka)
        if quorum.for_test(test_context) == quorum.zk:
            trogdor_client_services = [
                self.zk, self.kafka, self.workload_service
            ]
        elif quorum.for_test(test_context) == quorum.remote_kraft:
            trogdor_client_services = [
                self.kafka.controller_quorum, self.kafka, self.workload_service
            ]
        else:  #co-located case, which we currently don't test but handle here for completeness in case we do test it
            trogdor_client_services = [self.kafka, self.workload_service]
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=trogdor_client_services)
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {
            topic_name: {
                "partitionAssignments": {
                    "0": [0, 1, 2]
                }
            }
        }
        self.round_trip_spec = RoundTripWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.client_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=10000,
            max_messages=100000,
            active_topics=active_topics)

    def setUp(self):
        if self.zk:
            self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    def remote_quorum_nodes(self):
        if quorum.for_test(self.test_context) == quorum.zk:
            return self.zk.nodes
        elif quorum.for_test(self.test_context) == quorum.remote_kraft:
            return self.kafka.controller_quorum.nodes
        else:  # co-located case, which we currently don't test but handle here for completeness in case we do test it
            return []

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload_with_broker_partition(
            self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]
                                        ] + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0,
                                                    TaskSpec.MAX_DURATION_MS,
                                                    [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_broker_pause(self,
                                               metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                          [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_client_partition(self,
                                                   metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_latency(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        spec = DegradedNetworkFaultSpec(0, 60000)
        for node in self.kafka.nodes + self.remote_quorum_nodes():
            spec.add_node_spec(node.name,
                               "eth0",
                               latencyMs=100,
                               rateLimitKbit=3000)
        slow1 = self.trogdor.create_task("slow1", spec)
        workload1.wait_for_done(timeout_sec=600)
        slow1.stop()
        slow1.wait_for_done()
Example #16
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics={topic_name : {"partitionAssignments":{"0": [0,1,2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                     self.workload_service.client_node,
                                     self.workload_service.bootstrap_servers,
                                     target_messages_per_sec=10000,
                                     max_messages=100000,
                                     active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                                    [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                           self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
Example #17
 def set_up_trogdor(self, num_agent_nodes):
     self.agent_nodes = self.test_context.cluster.alloc(ClusterSpec.simple_linux(num_agent_nodes))
     self.trogdor = TrogdorService(context=self.test_context, agent_nodes=self.agent_nodes)
     for agent_node in self.agent_nodes:
         agent_node.account.logger = self.trogdor.logger
     self.trogdor.start()
Example #18
class TrogdorTest(Test):
    """
    Tests the Trogdor fault injection daemon in isolation.
    """

    def __init__(self, test_context):
        super(TrogdorTest, self).__init__(test_context)

    def set_up_trogdor(self, num_agent_nodes):
        self.agent_nodes = self.test_context.cluster.alloc(ClusterSpec.simple_linux(num_agent_nodes))
        self.trogdor = TrogdorService(context=self.test_context, agent_nodes=self.agent_nodes)
        for agent_node in self.agent_nodes:
            agent_node.account.logger = self.trogdor.logger
        self.trogdor.start()

    def setUp(self):
        self.trogdor = None
        self.agent_nodes = None

    def tearDown(self):
        if self.trogdor is not None:
            self.trogdor.stop()
            self.trogdor = None
        if self.agent_nodes is not None:
            self.test_context.cluster.free(self.agent_nodes)
            self.agent_nodes = None

    @cluster(num_nodes=4)
    def test_trogdor_service(self):
        """
        Test that we can bring up Trogdor and create a no-op fault.
        """
        self.set_up_trogdor(3)
        spec = NoOpTaskSpec(0, TaskSpec.MAX_DURATION_MS)
        self.trogdor.create_task("myfault", spec)
        def check_for_myfault():
            faults = self.trogdor.tasks()["tasks"]
            self.logger.info("tasks = %s" % faults)
            return "myfault" in faults
        wait_until(check_for_myfault,
                   timeout_sec=10, backoff_sec=.2, err_msg="Failed to read back myfault.")
        self.trogdor.stop_task("myfault")

    @cluster(num_nodes=4)
    def test_network_partition_fault(self):
        """
        Test that the network partition fault results in a true network partition between nodes.
        """
        self.set_up_trogdor(3)
        spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                            [[self.agent_nodes[0]], self.agent_nodes[1:]])
        partitions = spec.message["partitions"]
        assert 2 == len(partitions)
        assert [self.agent_nodes[0].name] == partitions[0]
        assert [self.agent_nodes[1].name, self.agent_nodes[2].name] == partitions[1]
        self.trogdor.create_task("partition0", spec)
        def verify_nodes_partitioned():
            if node_is_reachable(self.agent_nodes[0], self.agent_nodes[1]):
                return False
            if node_is_reachable(self.agent_nodes[1], self.agent_nodes[0]):
                return False
            if node_is_reachable(self.agent_nodes[2], self.agent_nodes[0]):
                return False
            return True
        wait_until(verify_nodes_partitioned,
                   timeout_sec=10, backoff_sec=.2, err_msg="Failed to verify that the nodes were partitioned.")
        if not node_is_reachable(self.agent_nodes[0], self.agent_nodes[0]):
            raise RuntimeError("Node 0 must be reachable from itself.")
        if not node_is_reachable(self.agent_nodes[1], self.agent_nodes[2]):
            raise RuntimeError("Node 2 must be reachable from node 1.")
        if not node_is_reachable(self.agent_nodes[2], self.agent_nodes[1]):
            raise RuntimeError("Node 1 must be reachable from node 2.")
Example #19
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context,
                                   num_nodes=3) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(
            test_context, self.kafka)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.kafka, self.workload_service])
        self.active_topics = {
            "produce_bench_topic[0-1]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        self.inactive_topics = {
            "produce_bench_topic[2-9]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }

    def setUp(self):
        self.trogdor.start()
        if self.zk:
            self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    @cluster(num_nodes=8)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_bench(self, metadata_quorum=quorum.zk):
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=self.inactive_topics,
            active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=8)
    def test_produce_bench_transactions(self, metadata_quorum=quorum.zk):
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=self.inactive_topics,
            active_topics=self.active_topics,
            transaction_generator={
                # 10 transactions with 10k messages
                "type": "uniform",
                "messagesPerTransaction": "10000"
            })
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))
Example #20
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(
            test_context, self.kafka)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {
            topic_name: {
                "partitionAssignments": {
                    "0": [0, 1, 2]
                }
            }
        }
        self.round_trip_spec = RoundTripWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.client_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=10000,
            max_messages=100000,
            active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]
                                        ] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0,
                                                    TaskSpec.MAX_DURATION_MS,
                                                    [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                          [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
Example #21
    def test_replication_with_replica_failure(self, metadata_quorum=quorum.zk):
        """
        This test verifies that replication shrinks the ISR when a replica is not fetching anymore.
        It also verifies that replication provides simple durability guarantees by checking that data acked by
        brokers is still available for consumption.

        Setup: 1 zk/KRaft controller, 3 kafka nodes, 1 topic with partitions=1, replication-factor=3, and min.insync.replicas=2
          - Produce messages in the background
          - Consume messages in the background
          - Partition a follower
          - Validate that the ISR was shrunk
          - Stop producing and finish consuming
          - Validate that every acked message was consumed
        """
        self.create_zookeeper_if_necessary()
        if self.zk:
            self.zk.start()

        self.create_kafka(
            num_nodes=3,
            server_prop_overrides=[["replica.lag.time.max.ms", "10000"]],
            controller_num_nodes_override=1)
        self.kafka.start()

        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka])
        self.trogdor.start()

        # If ZK is used, the partition leader is put on the controller node
        # to avoid partitioning the controller later on in the test.
        if self.zk:
            controller = self.kafka.controller()
            assignment = [self.kafka.idx(controller)] + [
                self.kafka.idx(node)
                for node in self.kafka.nodes if node != controller
            ]
        else:
            assignment = [self.kafka.idx(node) for node in self.kafka.nodes]

        self.topic = "test_topic"
        self.kafka.create_topic({
            "topic":
            self.topic,
            "replica-assignment":
            ":".join(map(str, assignment)),
            "configs": {
                "min.insync.replicas": 2
            }
        })

        self.logger.info("Created topic %s with assignment %s", self.topic,
                         ", ".join(map(str, assignment)))

        self.create_producer()
        self.producer.start()

        self.create_consumer()
        self.consumer.start()

        self.await_startup()

        leader = self.kafka.leader(self.topic, partition=0)
        replicas = self.kafka.replicas(self.topic, partition=0)

        # One of the followers is picked to be partitioned.
        follower_to_partition = [
            replica for replica in replicas if replica != leader
        ][0]
        self.logger.info(
            "Partitioning follower %s (%s) from the other brokers",
            self.kafka.idx(follower_to_partition), follower_to_partition.name)
        partition_spec = NetworkPartitionFaultSpec(
            0, 5 * 60 * 1000,
            [[follower_to_partition],
             [
                 node
                 for node in self.kafka.nodes if node != follower_to_partition
             ]])
        partition = self.trogdor.create_task("partition", partition_spec)

        def current_isr():
            try:
                # Due to the network partition, the kafka-topics command could fail if it tries
                # to connect to the partitioned broker. Therefore we catch the error here and retry.
                return set(
                    self.kafka.isr_idx_list(
                        self.topic,
                        partition=0,
                        node=leader,
                        offline_nodes=[follower_to_partition]))
            except RemoteCommandError as e:
                return set()

        # Verify that ISR is shrunk.
        expected_isr = {
            self.kafka.idx(replica)
            for replica in replicas if replica != follower_to_partition
        }
        wait_until(lambda: current_isr() == expected_isr,
                   timeout_sec=120,
                   backoff_sec=1,
                   err_msg="ISR should have been shrunk.")

        # Wait until the network partition is removed.
        partition.stop()
        partition.wait_for_done(timeout_sec=300)

        # Verify that ISR is expanded.
        expected_isr = {self.kafka.idx(replica) for replica in replicas}
        wait_until(lambda: current_isr() == expected_isr,
                   timeout_sec=120,
                   backoff_sec=1,
                   err_msg="ISR should have been expanded.")

        self.run_validation(producer_timeout_sec=120, min_records=25000)
Example #22
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=max_messages,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"]) # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=10000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_bench_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2500,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_group_bench(self):
        """
        Runs two ConsumeBench workloads in the same consumer group to read messages from topics
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000, # both should read at least 2k messages
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload_1 = self.trogdor.create_task("consume_workload_1", consume_spec)
        consume_workload_2 = self.trogdor.create_task("consume_workload_2", consume_spec)
        consume_workload_1.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 1 finished")
        consume_workload_2.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 2 finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
Example #23
 def __init__(self, test_context):
     super(NetworkDegradeTest, self).__init__(test_context)
     self.zk = ZookeeperService(test_context, num_nodes=3)
     self.trogdor = TrogdorService(context=self.test_context, client_services=[self.zk])
Example #24
class NetworkDegradeTest(Test):
    """
    These tests ensure that the network degrade Trogdor specs (which use "tc") are working as expected in whatever
    environment the system tests may be running in. The linux tools "ping" and "iperf" are used for validation
    and need to be available along with "tc" in the test environment.
    """

    def __init__(self, test_context):
        super(NetworkDegradeTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.trogdor = TrogdorService(context=self.test_context, client_services=[self.zk])

    def setUp(self):
        self.zk.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.zk.stop()

    @cluster(num_nodes=5)
    @parametrize(task_name="latency-100", device_name="eth0", latency_ms=50, rate_limit_kbit=0)
    @parametrize(task_name="latency-100-rate-1000", device_name="eth0", latency_ms=50, rate_limit_kbit=1000)
    def test_latency(self, task_name, device_name, latency_ms, rate_limit_kbit):
        spec = DegradedNetworkFaultSpec(0, 10000)
        for node in self.zk.nodes:
            spec.add_node_spec(node.name, device_name, latency_ms, rate_limit_kbit)

        latency = self.trogdor.create_task(task_name, spec)

        zk0 = self.zk.nodes[0]
        zk1 = self.zk.nodes[1]

        # Capture the ping times from the ping stdout
        # 64 bytes from ducker01 (172.24.0.2): icmp_seq=1 ttl=64 time=0.325 ms
        r = re.compile(r".*time=(?P<time>[\d.]+)\sms.*")

        times = []
        for line in zk0.account.ssh_capture("ping -i 1 -c 20 %s" % zk1.account.hostname):
            self.logger.debug("Ping output: %s" % line)
            m = r.match(line)
            if m is not None and m.group("time"):
                times.append(float(m.group("time")))
                self.logger.info("Parsed ping time of %d" % float(m.group("time")))
        self.logger.debug("Captured ping times: %s" % times)

        # We expect to see some low ping times (before and after the task runs) as well as high ping times
        # (during the task). For the high time, it's twice the configured latency since both links apply the
        # rule, 80% for a little variance buffer
        high_time_ms = 0.8 * 2 * latency_ms
        low_time_ms = 10
        slow_times = [t for t in times if t > high_time_ms]
        fast_times = [t for t in times if t < low_time_ms]

        latency.stop()
        latency.wait_for_done()

        # We captured 20 ping times. Assert that at least 5 were "fast" and 5 were "slow"
        assert len(slow_times) > 5, "Expected to see more slow ping times (higher than %d)" % high_time_ms
        assert len(fast_times) > 5, "Expected to see more fast ping times (lower than %d)" % low_time_ms

    @cluster(num_nodes=5)
    @parametrize(task_name="rate-1000", device_name="eth0", latency_ms=0, rate_limit_kbit=1000000)
    @parametrize(task_name="rate-1000-latency-50", device_name="eth0", latency_ms=50, rate_limit_kbit=1000000)
    def test_rate(self, task_name, device_name, latency_ms, rate_limit_kbit):
        zk0 = self.zk.nodes[0]
        zk1 = self.zk.nodes[1]

        spec = DegradedNetworkFaultSpec(0, 60000)
        spec.add_node_spec(zk0.name, device_name, latency_ms, rate_limit_kbit)

        # start the task and wait
        rate_limit = self.trogdor.create_task(task_name, spec)
        wait_until(lambda: rate_limit.running(),
                   timeout_sec=10,
                   err_msg="%s failed to start within 10 seconds." % rate_limit)

        # Run iperf server on zk1, iperf client on zk0
        iperf_server = zk1.account.ssh_capture("iperf -s")

        # Capture the measured kbps between the two nodes.
        # [  3]  0.0- 1.0 sec  2952576 KBytes  24187503 Kbits/sec
        r = re.compile(r"^.*\s(?P<rate>[\d.]+)\sKbits/sec$")

        measured_rates = []
        for line in zk0.account.ssh_capture("iperf -i 1 -t 20 -f k -c %s" % zk1.account.hostname):
            self.logger.info("iperf output %s" % line)
            m = r.match(line)
            if m is not None:
                measured_rate = float(m.group("rate"))
                measured_rates.append(measured_rate)
                self.logger.info("Parsed rate of %d kbit/s from iperf" % measured_rate)

        # kill iperf server and consume the stdout to ensure clean exit
        zk1.account.kill_process("iperf")
        for _ in iperf_server:
            continue

        rate_limit.stop()
        rate_limit.wait_for_done()

        self.logger.info("Measured rates: %s" % measured_rates)

        # We expect to see measured rates within an order of magnitude of our target rate
        low_kbps = rate_limit_kbit // 10
        high_kbps = rate_limit_kbit * 10
        acceptable_rates = [r for r in measured_rates if low_kbps < r < high_kbps]

        msg = "Expected most of the measured rates to be within an order of magnitude of target %d." % rate_limit_kbit
        msg += " This means `tc` did not limit the bandwidth as expected."
        assert len(acceptable_rates) > 5, msg
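Both tests above scrape shell output with regular expressions. The standalone check below applies those two patterns to the sample lines quoted in the test's own comments, and can be run without any cluster:

# Sanity check for the two regexes used above, applied to the sample lines
# from the comments (ping time and iperf rate extraction).
import re

ping_re = re.compile(r".*time=(?P<time>[\d.]+)\sms.*")
iperf_re = re.compile(r"^.*\s(?P<rate>[\d.]+)\sKbits/sec$")

ping_line = "64 bytes from ducker01 (172.24.0.2): icmp_seq=1 ttl=64 time=0.325 ms"
iperf_line = "[  3]  0.0- 1.0 sec  2952576 KBytes  24187503 Kbits/sec"

assert float(ping_re.match(ping_line).group("time")) == 0.325
assert float(iperf_re.match(iperf_line).group("rate")) == 24187503.0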
Example #25
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context,
                                   num_nodes=3) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(
            test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(
            test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(
            test_context, self.kafka)
        self.active_topics = {
            "consume_bench_topic[0-5]": {
                "numPartitions": 5,
                "replicationFactor": 3
            }
        }
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[
                                          self.kafka,
                                          self.producer_workload_service,
                                          self.consumer_workload_service,
                                          self.consumer_workload_service_2
                                      ])

    def setUp(self):
        self.trogdor.start()
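        # ZooKeeper is only started when the test was constructed with a ZK metadata quorum (self.zk is None otherwise).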
        if self.zk:
            self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

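    # Seeds the topics under test with messages so the consume workloads below have data to read.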
    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.producer_workload_service.producer_node,
            self.producer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=max_messages,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics={},
            active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload",
                                                    produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @cluster(num_nodes=10)
    @matrix(topics=[["consume_bench_topic[0-5]"]],
            metadata_quorum=quorum.all_non_upgrade)  # topic subscription
    @matrix(topics=[["consume_bench_topic[0-5]:[0-4]"]],
            metadata_quorum=quorum.all_non_upgrade)  # manual topic assignment
    def test_consume_bench(self, topics, metadata_quorum=quorum.zk):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
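        # "consume_bench_topic[0-5]" expands to a range of topic names; the ":[0-4]" variant additionally pins explicit partitions (manual assignment).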
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=10000,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_single_partition(self, metadata_quorum=quorum.zk):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {
            "consume_bench_topic": {
                "numPartitions": 2,
                "replicationFactor": 3
            }
        }
        self.produce_messages(active_topics, 5000)
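        # "consume_bench_topic:1" restricts the consumer to a single partition (partition 1) of the topic.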
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2500,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_random_group_topics(self,
                                                    metadata_quorum=quorum.zk):
        """
        Runs multiple consumers to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=5000,  # all should read exactly 5k messages
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=5,
            active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_two_consumers_specified_group_topics(self,
                                                  metadata_quorum=quorum.zk):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should dynamically be assigned partitions from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2000,  # both should read at least 2k messages
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=2,
            consumer_group="testGroup",
            active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_random_group_partitions(
            self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2000,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=4,
            active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_specified_group_partitions_should_raise(
            self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2000,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=4,
            consumer_group="fail_group",
            active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception(
                "Should have raised an exception due to an invalid configuration"
            )
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
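
The consume tests in Example #25 all follow the same build-spec / create-task / wait / dump-tasks sequence. Below is a minimal sketch of that shared pattern, assuming it lives inside a test class that already defines self.trogdor and self.consumer_workload_service as above (the helper name run_consume_workload is hypothetical, not part of the original tests):

    def run_consume_workload(self, topics, max_messages=10000, timeout_sec=360):
        # Build a ConsumeBench spec against the shared consumer workload node.
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=max_messages,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            active_topics=topics)
        # Submit the task to Trogdor, block until it finishes, then log all task state.
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=timeout_sec)
        self.logger.info("TASKS: %s\n" %
                         json.dumps(self.trogdor.tasks(), sort_keys=True, indent=2))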
Example #26
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=max_messages,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"]) # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=10000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2500,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_topics(self):
        """
        Runs multiple consumers to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=5000, # all should read exactly 5k messages
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=5,
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_two_consumers_specified_group_topics(self):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should dynamically be assigned partitions from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000, # both should read at least 2k messages
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=2,
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_partitions(self):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=4,
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_specified_group_partitions_should_raise(self):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=4,
                                                consumer_group="fail_group",
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception("Should have raised an exception due to an invalid configuration")
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
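
The expected-failure handling in test_multiple_consumers_specified_group_partitions_should_raise can likewise be factored into a reusable assertion. A minimal sketch under the same assumptions as the examples above (self.trogdor already set up; the helper name expect_workload_failure is hypothetical):

    def expect_workload_failure(self, task_name, spec, expected_error_msg, timeout_sec=360):
        # Submit the workload and require that it fails with the expected message.
        workload = self.trogdor.create_task(task_name, spec)
        try:
            workload.wait_for_done(timeout_sec=timeout_sec)
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
            return
        raise Exception("Should have raised an exception due to an invalid configuration")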