Example #1
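These examples come from the Apache Kafka ducktape system-test suite and are shown without their imports. As a minimal sketch, these are the imports this first example relies on, assuming the module layout of the upstream kafkatest package (paths may differ between Kafka versions; the later examples draw further services from the same packages):

import json

from ducktape.tests.test import Test

from kafkatest.services.kafka import KafkaService
from kafkatest.services.trogdor.produce_bench_workload import ProduceBenchWorkloadService, ProduceBenchWorkloadSpec
from kafkatest.services.trogdor.task_spec import TaskSpec
from kafkatest.services.trogdor.trogdor import TrogdorService
from kafkatest.services.zookeeper import ZookeeperService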
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000,
                                        max_messages=100000,
                                        producer_conf={},
                                        inactive_topics=inactive_topics,
                                        active_topics=active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
Example #2
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(
            test_context, self.kafka)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.kafka, self.workload_service])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        active_topics = {
            "produce_bench_topic[0-1]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        inactive_topics = {
            "produce_bench_topic[2-9]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=inactive_topics,
            active_topics=active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))
Example #3
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'
    stopped_message = 'OPTIMIZE_TEST Streams Stopped'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {
                'partitions': 6
            },
            self.aggregation_topic: {
                'partitions': 6
            },
            self.reduce_topic: {
                'partitions': 6
            },
            self.join_topic: {
                'partitions': 6
            }
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zookeeper,
                                  topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(
            self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(
            self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(
            self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors unoptimized
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.verify_processing(processors, verify_individual_operations=False)

        stop_processors(processors, self.stopped_message)

        # start again with topology optimized
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.verify_processing(processors, verify_individual_operations=True)

        stop_processors(processors, self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor,
                                               repartition_topic_count):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' %
                repartition_topic_count,
                timeout_sec=120,
                err_msg=
                "Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                % repartition_topic_count + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        for processor in processors:
            if not self.all_source_subtopology_tasks(processor):
                if verify_individual_operations:
                    for operation in self.operation_pattern.split(r'\|'):
                        self.do_verify(processor, operation)
                else:
                    self.do_verify(processor, self.operation_pattern)
            else:
                self.logger.info(
                    "Skipping processor %s with all source tasks" %
                    processor.node.account)

    def do_verify(self, processor, pattern):
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" %
                         pattern)
        with processor.node.account.monitor_log(
                processor.STDOUT_FILE) as monitor:
            monitor.wait_until(
                pattern,
                timeout_sec=60,
                err_msg="Never saw processing of %s " % pattern +
                str(processor.node.account))

    def all_source_subtopology_tasks(self, processor):
        retries = 0
        while retries < 5:
            # extract the assigned active-task list from the log; the pattern
            # matches only when all four active tasks are 0_x, i.e. tasks of
            # the source subtopology
            found = list(
                processor.node.account.ssh_capture(
                    r"sed -n 's/.*current active tasks: \[\(\(0_[0-9], \)\{3\}0_[0-9]\)\].*/\1/p' %s"
                    % processor.LOG_FILE,
                    allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found)
            if len(found) > 0:
                return True
            retries += 1
            time.sleep(1)

        return False

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
Example #4
class StreamsCooperativeRebalanceUpgradeTest(Test):
    """
    Test of a rolling upgrade from eager rebalance to
    cooperative rebalance
    """

    source_topic = "source"
    sink_topic = "sink"
    task_delimiter = "#"
    report_interval = "1000"
    processing_message = "Processed [0-9]* records so far"
    stopped_message = "COOPERATIVE-REBALANCE-TEST-CLIENT-CLOSED"
    running_state_msg = "STREAMS in a RUNNING State"
    cooperative_turned_off_msg = "Eager rebalancing enabled now for upgrade from %s"
    cooperative_enabled_msg = "Cooperative rebalancing enabled now"
    first_bounce_phase = "first_bounce_phase-"
    second_bounce_phase = "second_bounce_phase-"

    # !!CAUTION!!: THIS LIST OF VERSIONS IS FIXED, DO NOT ADD NEW VERSIONS
    streams_eager_rebalance_upgrade_versions = [
        str(LATEST_0_10_0),
        str(LATEST_0_10_1),
        str(LATEST_0_10_2),
        str(LATEST_0_11_0),
        str(LATEST_1_0),
        str(LATEST_1_1),
        str(LATEST_2_0),
        str(LATEST_2_1),
        str(LATEST_2_2),
        str(LATEST_2_3)
    ]

    def __init__(self, test_context):
        super(StreamsCooperativeRebalanceUpgradeTest,
              self).__init__(test_context)
        self.topics = {
            self.source_topic: {
                'partitions': 9
            },
            self.sink_topic: {
                'partitions': 9
            }
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zookeeper,
                                  topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.source_topic,
                                           throughput=1000,
                                           acks=1)

    @matrix(upgrade_from_version=streams_eager_rebalance_upgrade_versions)
    def test_upgrade_to_cooperative_rebalance(self, upgrade_from_version):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = CooperativeRebalanceUpgradeService(
            self.test_context, self.kafka)
        processor2 = CooperativeRebalanceUpgradeService(
            self.test_context, self.kafka)
        processor3 = CooperativeRebalanceUpgradeService(
            self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors without upgrade_from config; normal operations mode
        self.logger.info("Starting all streams clients in normal running mode")
        for processor in processors:
            processor.set_version(upgrade_from_version)
            self.set_props(processor)
            processor.CLEAN_NODE_ENABLED = False
            # can't use state, as older versions don't have a state listener,
            # so just verify up and running
            verify_running(processor, self.processing_message)

        # all processors running; rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.processing_message)

        # first rolling bounce with "upgrade.from" config set
        previous_phase = ""
        self.maybe_upgrade_rolling_bounce_and_verify(processors,
                                                     previous_phase,
                                                     self.first_bounce_phase,
                                                     upgrade_from_version)

        # All nodes processing, rebalancing has ceased
        for processor in processors:
            self.verify_processing(
                processor, self.first_bounce_phase + self.processing_message)

        # second rolling bounce without "upgrade.from" config
        self.maybe_upgrade_rolling_bounce_and_verify(processors,
                                                     self.first_bounce_phase,
                                                     self.second_bounce_phase)

        # All nodes processing, rebalancing has ceased
        for processor in processors:
            self.verify_processing(
                processor, self.second_bounce_phase + self.processing_message)

        # now verify tasks are unique
        for processor in processors:
            self.get_tasks_for_processor(processor)
            self.logger.info("Active tasks %s" % processor.active_tasks)

        overlapping_tasks = processor1.active_tasks.intersection(
            processor2.active_tasks)
        assert len(overlapping_tasks) == 0, \
            "Final task assignments are not unique %s %s" % (processor1.active_tasks, processor2.active_tasks)

        overlapping_tasks = processor1.active_tasks.intersection(
            processor3.active_tasks)
        assert len(overlapping_tasks) == 0, \
            "Final task assignments are not unique %s %s" % (processor1.active_tasks, processor3.active_tasks)

        overlapping_tasks = processor2.active_tasks.intersection(
            processor3.active_tasks)
        assert len(overlapping_tasks) == 0, \
            "Final task assignments are not unique %s %s" % (processor2.active_tasks, processor3.active_tasks)

        # test done, close everything down
        stop_processors(processors,
                        self.second_bounce_phase + self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def maybe_upgrade_rolling_bounce_and_verify(self,
                                                processors,
                                                previous_phase,
                                                current_phase,
                                                upgrade_from_version=None):
        for processor in processors:
            # stop the processor in prep for setting or removing "upgrade.from"
            verify_stopped(processor, previous_phase + self.stopped_message)
            # upgrade to version with cooperative rebalance
            processor.set_version("")
            processor.set_upgrade_phase(current_phase)

            if upgrade_from_version is not None:
                # strip the patch number; upgrade.from expects a major.minor version
                upgrade_version = upgrade_from_version[:upgrade_from_version.rfind('.')]
                rebalance_mode_msg = self.cooperative_turned_off_msg % upgrade_version
            else:
                upgrade_version = None
                rebalance_mode_msg = self.cooperative_enabled_msg

            self.set_props(processor, upgrade_version)
            node = processor.node
            with node.account.monitor_log(
                    processor.STDOUT_FILE) as stdout_monitor:
                with node.account.monitor_log(
                        processor.LOG_FILE) as log_monitor:
                    processor.start()
                    # verify correct rebalance mode either turned off for upgrade or enabled after upgrade
                    log_monitor.wait_until(
                        rebalance_mode_msg,
                        timeout_sec=60,
                        err_msg="Never saw '%s' message " % rebalance_mode_msg
                        + str(processor.node.account))

                # verify rebalanced into a running state
                rebalance_msg = current_phase + self.running_state_msg
                stdout_monitor.wait_until(
                    rebalance_msg,
                    timeout_sec=60,
                    err_msg="Never saw '%s' message " % rebalance_msg +
                    str(processor.node.account))

                # verify processing
                verify_processing_msg = current_phase + self.processing_message
                stdout_monitor.wait_until(
                    verify_processing_msg,
                    timeout_sec=60,
                    err_msg="Never saw '%s' message " % verify_processing_msg +
                    str(processor.node.account))

    def verify_processing(self, processor, pattern):
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" %
                         pattern)
        with processor.node.account.monitor_log(
                processor.STDOUT_FILE) as monitor:
            monitor.wait_until(
                pattern,
                timeout_sec=60,
                err_msg="Never saw processing of %s " % pattern +
                str(processor.node.account))

    def get_tasks_for_processor(self, processor):
        retries = 0
        while retries < 5:
            found_tasks = list(
                processor.node.account.ssh_capture(
                    "grep TASK-ASSIGNMENTS %s | tail -n 1" %
                    processor.STDOUT_FILE,
                    allow_fail=True))
            self.logger.info("Returned %s from assigned task check" %
                             found_tasks)
            if len(found_tasks) > 0:
                task_string = str(found_tasks[0]).strip()
                self.logger.info("Converted %s from assigned task check" %
                                 task_string)
                processor.set_tasks(task_string)
                return
            retries += 1
            time.sleep(1)
        return

    def set_props(self, processor, upgrade_from=None):
        processor.SOURCE_TOPIC = self.source_topic
        processor.SINK_TOPIC = self.sink_topic
        processor.REPORT_INTERVAL = self.report_interval
        processor.UPGRADE_FROM = upgrade_from
Example #5
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(
            test_context, self.kafka)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index += 1
        active_topics = {
            topic_name: {
                "partitionAssignments": {
                    "0": [0, 1, 2]
                }
            }
        }
        self.round_trip_spec = RoundTripWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.client_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=10000,
            max_messages=100000,
            active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0,
                                                    TaskSpec.MAX_DURATION_MS,
                                                    [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                          [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], clean_shutdown=False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
Example #6
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=max_messages,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"]) # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=10000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2500,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_topics(self):
        """
        Runs multiple consumers group to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=5000, # all should read exactly 5k messages
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=5,
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_two_consumers_specified_group_topics(self):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should be dynamically assigned partitions from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000, # both should read at least 2k messages
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=2,
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_partitions(self):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=4,
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_specified_group_partitions_should_raise(self):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                threads_per_worker=4,
                                                consumer_group="fail_group",
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception("Should have raised an exception due to an invalid configuration")
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
Example #7
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3) \
            if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(
            test_context, self.kafka)
        if quorum.for_test(test_context) == quorum.zk:
            trogdor_client_services = [
                self.zk, self.kafka, self.workload_service
            ]
        elif quorum.for_test(test_context) == quorum.remote_kraft:
            trogdor_client_services = [
                self.kafka.controller_quorum, self.kafka, self.workload_service
            ]
        else:  # co-located case, which we currently don't test but handle here for completeness in case we do test it
            trogdor_client_services = [self.kafka, self.workload_service]
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=trogdor_client_services)
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index += 1
        active_topics = {
            topic_name: {
                "partitionAssignments": {
                    "0": [0, 1, 2]
                }
            }
        }
        self.round_trip_spec = RoundTripWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.client_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=10000,
            max_messages=100000,
            active_topics=active_topics)

    def setUp(self):
        if self.zk:
            self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    def remote_quorum_nodes(self):
        if quorum.for_test(self.test_context) == quorum.zk:
            return self.zk.nodes
        elif quorum.for_test(self.test_context) == quorum.remote_kraft:
            return self.kafka.controller_quorum.nodes
        else:  # co-located case, which we currently don't test but handle here for completeness in case we do test it
            return []

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload_with_broker_partition(
            self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0,
                                                    TaskSpec.MAX_DURATION_MS,
                                                    [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_broker_pause(self,
                                               metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                          [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], clean_shutdown=False)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_client_partition(self,
                                                   metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_latency(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        spec = DegradedNetworkFaultSpec(0, 60000)
        for node in self.kafka.nodes + self.remote_quorum_nodes():
            spec.add_node_spec(node.name,
                               "eth0",
                               latencyMs=100,
                               rateLimitKbit=3000)
        slow1 = self.trogdor.create_task("slow1", spec)
        workload1.wait_for_done(timeout_sec=600)
        slow1.stop()
        slow1.wait_for_done()
Example #8
class StreamsBrokerCompatibility(Test):
    """
    These tests validate that
    - Streams works for older brokers 0.11 (or newer)
    - Streams w/ EOS-alpha works for older brokers 0.11 (or newer)
    - Streams w/ EOS-beta works for older brokers 2.5 (or newer)
    - Streams fails fast for older brokers 0.10.0, 0.10.1, and 0.10.2
    - Streams w/ EOS-beta fails fast for older brokers 2.4 or older
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility,
              self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.zk,
            topics={
                self.input: {
                    'partitions': 1,
                    'replication-factor': 1
                },
                self.output: {
                    'partitions': 1,
                    'replication-factor': 1
                }
            },
            server_prop_overides=[[
                "transaction.state.log.replication.factor", "1"
            ], ["transaction.state.log.min.isr", "1"]])
        self.consumer = VerifiableConsumer(
            test_context, 1, self.kafka, self.output,
            "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_compatible_brokers_eos_disabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka,
                                                      "at_least_once")
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(
            lambda: self.consumer.total_consumed() > 0,
            timeout_sec=30,
            err_msg="Expected to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_2_5))
    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_compatible_brokers_eos_alpha_enabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka,
                                                      "exactly_once")
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(
            lambda: self.consumer.total_consumed() > 0,
            timeout_sec=30,
            err_msg="Expected to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    # TODO enable after 2.5 is released
    # @parametrize(broker_version=str(LATEST_2_5))
    # def test_compatible_brokers_eos_beta_enabled(self, broker_version):
    #     self.kafka.set_version(KafkaVersion(broker_version))
    #     self.kafka.start()
    #
    #     processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, "exactly_once_beta")
    #     processor.start()
    #
    #     self.consumer.start()
    #
    #     processor.wait()
    #
    #     wait_until(lambda: self.consumer.total_consumed() > 0, timeout_sec=30, err_msg="Did expect to read a message but got none within 30 seconds.")
    #
    #     self.consumer.stop()
    #     self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka,
                                                      "at_least_once")

        with processor.node.account.monitor_log(
                processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException',
                timeout_sec=60,
                err_msg=
                "Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException' error message "
                + str(processor.node.account))

        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_fail_fast_on_incompatible_brokers_if_eos_beta_enabled(
            self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka,
                                                      "exactly_once_beta")

        with processor.node.account.monitor_log(
                processor.STDERR_FILE) as monitor:
            with processor.node.account.monitor_log(processor.LOG_FILE) as log:
                processor.start()
                log.wait_until(
                    r'Shutting down because the Kafka cluster seems to be on a too old version. Setting processing\.guarantee="exactly_once_beta" requires broker version 2\.5 or higher\.',
                    timeout_sec=60,
                    err_msg=
                    "Never saw 'Shutting down because the Kafka cluster seems to be on a too old version. Setting processing.guarantee=\"exactly_once_beta\" requires broker version 2.5 or higher.' log message "
                    + str(processor.node.account))
                monitor.wait_until(
                    'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException',
                    timeout_sec=60,
                    err_msg=
                    "Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException' error message "
                    + str(processor.node.account))

        self.kafka.stop()
Example #9
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than specified timeouts in configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"
    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience,
              self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {
                                          'partitions': 3,
                                          'replication-factor': 1
                                      },
                                      self.outputTopic: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      }
                                  })

    def get_consumer(self, num_messages):
        return VerifiableConsumer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=num_messages)

    def get_producer(self, num_messages):
        return VerifiableProducer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.inputTopic,
                                  max_messages=num_messages,
                                  acks=1)

    def assert_produce_consume(self, test_state, num_messages=5):
        producer = self.get_producer(num_messages)
        producer.start()

        wait_until(lambda: producer.num_acked >= num_messages,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer(num_messages)
        consumer.start()

        wait_until(
            lambda: consumer.total_consumed() >= num_messages,
            timeout_sec=60,
            err_msg="At %s streams did not process messages in 60 seconds " %
            test_state)

    @staticmethod
    def get_configs(extra_configs=""):
        # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # java code expects configs in key=value,key=value format
        updated_configs = ",".join([consumer_poll_ms, retries_config, request_timeout, max_block_ms]) + extra_configs

        return updated_configs
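
    # For reference, a sketch of what get_configs() yields with no extra
    # configs (values taken from the constants above):
    #   "consumer.max.poll.interval.ms=50000,producer.retries=2,producer.request.timeout.ms=15000,producer.max.block.ms=30000"
    # Sanity check of the constraint in the comment above:
    #   50000 > min(30000, (2 + 1) * 15000) = 30000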

    def wait_for_verification(self, processor, message, file, num_lines=1):
        wait_until(lambda: self.verify_from_file(processor, message, file) >= num_lines,
                   timeout_sec=60,
                   err_msg="Expected to read '%s' from %s" %
                   (message, processor.node.account))

    @staticmethod
    def verify_from_file(processor, message, file):
        result = processor.node.account.ssh_output("grep '%s' %s | wc -l" %
                                                   (message, file),
                                                   allow_fail=False)
        return int(result)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Broker should be down for over 2x of retries * request.timeout ms.
        # With 2 * 15000 ms = 30 seconds, set downtime to 70 seconds.
        broker_down_time_in_seconds = 70

        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka,
                                                       self.get_configs())
        processor.start()

        # until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down
        # After KIP-91 is merged we'll continue to send messages the duration of the test
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)

        self.kafka.stop_node(node)

        time.sleep(broker_down_time_in_seconds)

        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()

    def test_streams_runs_with_broker_down_initially(self):
        self.kafka.start()
        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        configs = self.get_configs(
            extra_configs=",application.id=starting_wo_broker_id")

        # start streams with broker down initially
        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_3.start()

        broker_unavailable_message = "Broker may not be available"

        # verify streams instances unable to connect to broker, kept trying
        self.wait_for_verification(processor, broker_unavailable_message,
                                   processor.LOG_FILE, 100)
        self.wait_for_verification(processor_2, broker_unavailable_message,
                                   processor_2.LOG_FILE, 100)
        self.wait_for_verification(processor_3, broker_unavailable_message,
                                   processor_3.LOG_FILE, 100)

        # now start broker
        self.kafka.start_node(node)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("running_with_broker_down_initially",
                                    num_messages=9)

        message = "processed3messages"
        # need to show all 3 instances processed messages
        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message,
                                   processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message,
                                   processor_3.STDOUT_FILE)

        self.kafka.stop()

    def test_streams_should_scale_in_while_brokers_down(self):
        self.kafka.start()

        configs = self.get_configs(
            extra_configs=",application.id=shutdown_with_broker_down")

        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_3.start()

        # need to wait for rebalance once
        self.wait_for_verification(
            processor_3, "State transition from REBALANCING to RUNNING",
            processor_3.LOG_FILE)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("waiting for rebalance to complete",
                                    num_messages=9)

        message = "processed3messages"

        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message,
                                   processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message,
                                   processor_3.STDOUT_FILE)

        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        processor.stop()
        processor_2.stop()

        shutdown_message = "Complete shutdown of streams resilience test app now"
        self.wait_for_verification(processor, shutdown_message,
                                   processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, shutdown_message,
                                   processor_2.STDOUT_FILE)

        self.kafka.start_node(node)

        self.assert_produce_consume(
            "sending_message_after_stopping_streams_instance_bouncing_broker",
            num_messages=9)

        self.wait_for_verification(processor_3, "processed9messages",
                                   processor_3.STDOUT_FILE)

        self.kafka.stop()
Example #10
class ReplicaScaleTest(Test):
    def __init__(self, test_context):
        super(ReplicaScaleTest, self).__init__(test_context=test_context)
        self.test_context = test_context
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=8, zk=self.zk)

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        # Need to increase the timeout due to partition count
        for node in self.kafka.nodes:
            self.kafka.stop_node(node, clean_shutdown=False, timeout_sec=60)
        self.kafka.stop()
        self.zk.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_produce_consume(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "replicas_produce_consume_%d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)

        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka)
        consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka)
        trogdor = TrogdorService(context=self.test_context,
                                 client_services=[self.kafka, producer_workload_service, consumer_workload_service])
        trogdor.start()

        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                producer_workload_service.producer_node,
                                                producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics={"replicas_produce_consume_[0-2]": {
                                                    "numPartitions": partition_count, "replicationFactor": replication_factor
                                                }})
        produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed produce bench")

        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                consumer_workload_service.consumer_node,
                                                consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["replicas_produce_consume_[0-2]"])
        consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed consume bench")

        trogdor.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_clean_bounce(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)
        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        restart_times = []
        for node in self.kafka.nodes:
            broker_bounce_start_time = time.time()
            self.kafka.stop_node(node, clean_shutdown=True, timeout_sec=600)
            self.kafka.start_node(node, timeout_sec=600)
            broker_bounce_end_time = time.time()
            restart_times.append(broker_bounce_end_time - broker_bounce_start_time)
            self.logger.info("Time to restart %s: %d" % (node.name, broker_bounce_end_time - broker_bounce_start_time))

        self.logger.info("Restart times: %s" % restart_times)

        delete_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            self.logger.info("Deleting topic %s" % topic)
            self.kafka.delete_topic(topic)
        delete_end_time = time.time()
        self.logger.info("Time to delete topics: %d" % (delete_end_time - delete_start_time))
Example No. 11
class StreamsOptimizedTest(Test):
    """
    Tests upgrading a Kafka Streams application
    that starts un-optimized and is then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6},
            self.reduce_topic: {'partitions': 6},
            self.join_topic: {'partitions': 6}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors unoptimized
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.verify_processing(processors, verify_individual_operations=False)

        self.stop_processors(processors)

        # start again with topology optimized
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.verify_processing(processors, verify_individual_operations=True)

        self.stop_processors(processors)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor, repartition_topic_count):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until('REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count,
                               timeout_sec=120,
                               err_msg="Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                                       % repartition_topic_count + str(processor.node.account))

    @staticmethod
    def verify_stopped(processor):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            monitor.wait_until('OPTIMIZE_TEST Streams Stopped',
                               timeout_sec=60,
                               err_msg="Never saw 'OPTIMIZE_TEST Streams Stopped' message " + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        for processor in processors:
            if not self.all_source_subtopology_tasks(processor):
                if verify_individual_operations:
                    for operation in self.operation_pattern.split(r'\|'):
                        self.do_verify(processor, operation)
                else:
                    self.do_verify(processor, self.operation_pattern)
            else:
                self.logger.info("Skipping processor %s with all source tasks" % processor.node.account)

    def do_verify(self, processor, pattern):
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def all_source_subtopology_tasks(self, processor):
        retries = 0
        while retries < 5:
            # grep the log for an assignment whose active tasks all belong to subtopology 0, e.g. [0_0, 0_1, 0_2, 0_3]
            found = list(processor.node.account.ssh_capture(
                r"sed -n 's/.*current active tasks: \[\(\(0_[0-9], \)\{3\}0_[0-9]\)\].*/\1/p' %s" % processor.LOG_FILE,
                allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found)
            if len(found) > 0:
                return True
            retries += 1
            time.sleep(1)

        return False

    def stop_processors(self, processors):
        for processor in processors:
            self.verify_stopped(processor)

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
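
The sed command in all_source_subtopology_tasks is dense: it matches a log line whose 'current active tasks' list contains exactly four tasks from subtopology 0 (0_0 through 0_3 style) and prints the captured list. The equivalent check in pure Python, as a standalone sketch against a made-up log line:

import re

line = "INFO current active tasks: [0_0, 0_1, 0_2, 0_3] standby tasks: []"
match = re.search(r"current active tasks: \[((?:0_[0-9], ){3}0_[0-9])\]", line)
print(match.group(1) if match else "not an all-source assignment")  # 0_0, 0_1, 0_2, 0_3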
Example No. 12
class StreamsNamedRepartitionTopicTest(Test):
    """
    Tests that an application using a named repartition topic
    keeps running through a rolling upgrade that adds
    operations ahead of the repartition topic
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    pattern = 'AGGREGATED'

    def __init__(self, test_context):
        super(StreamsNamedRepartitionTopicTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {
                'partitions': 6
            },
            self.aggregation_topic: {
                'partitions': 6
            }
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zookeeper,
                                  topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_topology_with_named_repartition_topic(self):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsNamedRepartitionTopicService(
            self.test_context, self.kafka)
        processor2 = StreamsNamedRepartitionTopicService(
            self.test_context, self.kafka)
        processor3 = StreamsNamedRepartitionTopicService(
            self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        self.producer.start()

        for processor in processors:
            processor.CLEAN_NODE_ENABLED = False
            self.set_topics(processor)
            self.verify_running(processor, 'REBALANCING -> RUNNING')

        self.verify_processing(processors)

        # do rolling upgrade
        for processor in processors:
            self.verify_stopped(processor)
            # tell the app to add operations before the repartition topic
            processor.ADD_ADDITIONAL_OPS = 'true'
            self.verify_running(processor, 'UPDATED Topology')

        self.verify_processing(processors)

        self.stop_processors(processors)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running(processor, message):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(message,
                               timeout_sec=60,
                               err_msg="Never saw '%s' message " % message +
                               str(processor.node.account))

    @staticmethod
    def verify_stopped(processor):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            monitor.wait_until(
                'NAMED_REPARTITION_TEST Streams Stopped',
                timeout_sec=60,
                err_msg="Never saw 'NAMED_REPARTITION_TEST Streams Stopped' message " +
                str(processor.node.account))

    def verify_processing(self, processors):
        for processor in processors:
            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                monitor.wait_until(
                    self.pattern,
                    timeout_sec=60,
                    err_msg="Never saw processing of %s " % self.pattern +
                    str(processor.node.account))

    def stop_processors(self, processors):
        for processor in processors:
            self.verify_stopped(processor)

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
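
The verify_running/verify_stopped helpers in these Streams tests all share one ducktape idiom: open the log monitor before triggering the action, so the expected line cannot be written and missed in the gap. A generic sketch of the pattern (the helper name is ours; monitor_log and wait_until are the ducktape calls used throughout this page):

def await_log_message(service, log_file, action, message, timeout_sec=60):
    # Open the monitor first so any line produced by `action` is observed.
    with service.node.account.monitor_log(log_file) as monitor:
        action()
        monitor.wait_until(message,
                           timeout_sec=timeout_sec,
                           err_msg="Never saw '%s' message " % message + str(service.node.account))

# usage sketch:
# await_log_message(processor, processor.STDOUT_FILE, processor.start, 'REBALANCING -> RUNNING')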
Example No. 13
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=max_messages,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"]) # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=10000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_bench_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2500,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_group_bench(self):
        """
        Runs two ConsumeBench workloads in the same consumer group to read messages from topics
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000, # both should read at least 2k messages
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload_1 = self.trogdor.create_task("consume_workload_1", consume_spec)
        consume_workload_2 = self.trogdor.create_task("consume_workload_2", consume_spec)
        consume_workload_1.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 1 finished")
        consume_workload_2.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 2 finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
Example No. 14
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                     self.workload_service.client_node,
                                     self.workload_service.bootstrap_servers,
                                     target_messages_per_sec=10000,
                                     max_messages=100000,
                                     active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                                    [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                           self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
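
Every fault test above follows the same timeline: start the round-trip workload, give it a couple of seconds to get going, inject the fault, let the workload run to completion, then stop the fault and wait for it to finish. That skeleton, factored out as a sketch (spec construction omitted; task names are arbitrary):

import time

def run_workload_with_fault(trogdor, workload_spec, fault_spec, timeout_sec=600):
    workload = trogdor.create_task("workload1", workload_spec)
    time.sleep(2)  # let the workload start before injecting the fault
    fault = trogdor.create_task("fault1", fault_spec)
    workload.wait_for_done(timeout_sec=timeout_sec)
    fault.stop()
    fault.wait_for_done()
    return workload, fault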
Example No. 15
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than the timeouts specified in its configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"
    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience,
              self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      },
                                      self.outputTopic: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      }
                                  })

    def get_consumer(self):
        return VerifiableConsumer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=self.num_messages)

    def get_producer(self):
        return VerifiableProducer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.inputTopic,
                                  max_messages=self.num_messages,
                                  acks=1)

    def assert_produce_consume(self, test_state):
        producer = self.get_producer()
        producer.start()

        wait_until(lambda: producer.num_acked > 0,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer()
        consumer.start()

        wait_until(
            lambda: consumer.total_consumed() > 0,
            timeout_sec=120,
            err_msg="At %s streams did not process messages in 120 seconds " %
            test_state)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Consumer max.poll.interval.ms > min(max.block.ms, (retries + 1) * request.timeout.ms)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # The broker should be down for more than 2x retries * request.timeout.ms.
        # With retries * timeout = (2 * 15000) ms = 30 seconds, we set the downtime to 70 seconds.
        broker_down_time_in_seconds = 70

        # the Java test app expects configs in key=value,key=value format
        updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms

        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka,
                                                       updated_configs)
        processor.start()

        # Until KIP-91 is merged we only send 5 messages to assert Kafka Streams is running before taking the broker down.
        # After KIP-91 is merged we'll keep sending messages for the duration of the test.
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)

        self.kafka.stop_node(node)

        time.sleep(broker_down_time_in_seconds)

        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()
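
The downtime chosen above can be sanity-checked against the producer configs. Assuming the usual retry model, the producer gives up after roughly (retries + 1) * request.timeout.ms, and the test's own rule of thumb is to stay down for more than 2x retries * request.timeout.ms; 70 seconds clears both bounds:

retries = 2
request_timeout_ms = 15000
broker_down_time_ms = 70 * 1000

worst_case_producer_give_up_ms = (retries + 1) * request_timeout_ms  # 45000 ms
rule_of_thumb_ms = 2 * retries * request_timeout_ms                  # 60000 ms

assert broker_down_time_ms > worst_case_producer_give_up_ms
assert broker_down_time_ms > rule_of_thumb_ms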
Example No. 16
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context,
                                   num_nodes=3) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(
            test_context, self.kafka)
        self.trogdor = TrogdorService(
            context=self.test_context,
            client_services=[self.kafka, self.workload_service])
        self.active_topics = {
            "produce_bench_topic[0-1]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }
        self.inactive_topics = {
            "produce_bench_topic[2-9]": {
                "numPartitions": 1,
                "replicationFactor": 3
            }
        }

    def setUp(self):
        self.trogdor.start()
        if self.zk:
            self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    @cluster(num_nodes=8)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_bench(self, metadata_quorum=quorum.zk):
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=self.inactive_topics,
            active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=8)
    def test_produce_bench_transactions(self, metadata_quorum=quorum.zk):
        spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.workload_service.producer_node,
            self.workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=100000,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics=self.inactive_topics,
            active_topics=self.active_topics,
            transaction_generator={
                # 10 transactions with 10k messages
                "type": "uniform",
                "messagesPerTransaction": "10000"
            })
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))
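
The 'uniform' transaction generator commits after a fixed number of messages, so the expected transaction count follows directly from the spec values; the inline comment's '10 transactions with 10k messages' is just this division:

max_messages = 100000
messages_per_transaction = 10000
print(max_messages // messages_per_transaction)  # 10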
Example No. 17
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context,
                                   num_nodes=3) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(
            test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(
            test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(
            test_context, self.kafka)
        self.active_topics = {
            "consume_bench_topic[0-5]": {
                "numPartitions": 5,
                "replicationFactor": 3
            }
        }
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[
                                          self.kafka,
                                          self.producer_workload_service,
                                          self.consumer_workload_service,
                                          self.consumer_workload_service_2
                                      ])

    def setUp(self):
        self.trogdor.start()
        if self.zk:
            self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.producer_workload_service.producer_node,
            self.producer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=max_messages,
            producer_conf={},
            admin_client_conf={},
            common_client_conf={},
            inactive_topics={},
            active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload",
                                                    produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @cluster(num_nodes=10)
    @matrix(topics=[["consume_bench_topic[0-5]"]],
            metadata_quorum=quorum.all_non_upgrade)  # topic subscription
    @matrix(topics=[["consume_bench_topic[0-5]:[0-4]"]],
            metadata_quorum=quorum.all_non_upgrade)  # manual topic assignment
    def test_consume_bench(self, topics, metadata_quorum=quorum.zk):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=10000,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_single_partition(self, metadata_quorum=quorum.zk):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {
            "consume_bench_topic": {
                "numPartitions": 2,
                "replicationFactor": 3
            }
        }
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2500,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_random_group_topics(self,
                                                    metadata_quorum=quorum.zk):
        """
        Runs multiple consumers, each in a random group, to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=5000,  # all should read exactly 5k messages
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=5,
            active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_two_consumers_specified_group_topics(self,
                                                  metadata_quorum=quorum.zk):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should be dynamically assigned partitions from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2000,  # both should read at least 2k messages
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=2,
            consumer_group="testGroup",
            active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_random_group_partitions(
            self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2000,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=4,
            active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" %
                         json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_specified_group_partitions_should_raise(
            self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(
            0,
            TaskSpec.MAX_DURATION_MS,
            self.consumer_workload_service.consumer_node,
            self.consumer_workload_service.bootstrap_servers,
            target_messages_per_sec=1000,
            max_messages=2000,
            consumer_conf={},
            admin_client_conf={},
            common_client_conf={},
            threads_per_worker=4,
            consumer_group="fail_group",
            active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload",
                                                    consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception(
                "Should have raised an exception due to an invalid configuration"
            )
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
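
The try/except block above is the usual ducktape way to assert an expected failure: treat success as the error, and re-raise if the failure text does not match. Factored into a reusable helper, as a sketch over any callable that raises RuntimeError (the helper and the example message are ours):

def expect_runtime_error(action, expected_substring):
    # Run `action`; fail unless it raises a RuntimeError containing the expected text.
    try:
        action()
    except RuntimeError as e:
        if expected_substring not in str(e):
            raise RuntimeError("Unexpected Exception - " + str(e))
        return e
    raise Exception("Should have raised an exception due to an invalid configuration")

# usage sketch:
def bad_config():
    raise RuntimeError("task failed: explicit partition assignment cannot be combined with a consumer group")

print(expect_runtime_error(bad_config, "explicit partition assignment"))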
Example No. 18
class StreamsBrokerCompatibility(Test):
    """
    These tests validate that Streams v0.10.2+ can connect to older brokers v0.10.1+
    and that Streams fails fast for brokers older than 0.10.1
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility, self).__init__(test_context=test_context)

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.input: {'partitions': 1, 'replication-factor': 1},
                                      self.output: {'partitions': 1, 'replication-factor': 1}
                                  })

        self.processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka)

        self.consumer = VerifiableConsumer(test_context,
                                           1,
                                           self.kafka,
                                           self.output,
                                           "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    @parametrize(broker_version=str(DEV_BRANCH))
    @parametrize(broker_version=str(LATEST_0_10_1))
    def test_compatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        self.processor.start()
        self.consumer.start()

        self.processor.wait()

        num_consumed_msgs = self.consumer.total_consumed()

        self.consumer.stop()
        self.kafka.stop()

        assert num_consumed_msgs == 1, \
            "Expected to read exactly one message but got %d" % num_consumed_msgs

    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        self.processor.start()

        self.processor.node.account.ssh(self.processor.start_cmd(self.processor.node))
        with self.processor.node.account.monitor_log(self.processor.STDERR_FILE) as monitor:
            monitor.wait_until('Exception in thread "main" org.apache.kafka.streams.errors.StreamsException: Kafka Streams requires broker version 0.10.1.x or higher.',
                        timeout_sec=60,
                        err_msg="Never saw 'incompatible broker' error message " + str(self.processor.node.account))

        self.kafka.stop()
Example No. 19
class StreamsStaticMembershipTest(Test):
    """
    Tests static membership when the broker is at the minimum
    supported version (2.3) or higher.
    """

    input_topic = 'inputTopic'
    pattern = 'PROCESSED'
    running_message = 'REBALANCING -> RUNNING'
    stopped_message = 'Static membership test closed'

    def __init__(self, test_context):
        super(StreamsStaticMembershipTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {
                'partitions': 18
            },
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zookeeper,
                                  topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_rolling_bounces_will_not_trigger_rebalance_under_static_membership(
            self):
        self.zookeeper.start()
        self.kafka.start()

        numThreads = 3
        processor1 = StaticMemberTestService(self.test_context, self.kafka,
                                             "consumer-A", numThreads)
        processor2 = StaticMemberTestService(self.test_context, self.kafka,
                                             "consumer-B", numThreads)
        processor3 = StaticMemberTestService(self.test_context, self.kafka,
                                             "consumer-C", numThreads)

        processors = [processor1, processor2, processor3]

        self.producer.start()

        for processor in processors:
            processor.CLEAN_NODE_ENABLED = False
            self.set_topics(processor)
            verify_running(processor, self.running_message)

        self.verify_processing(processors)

        # do several rolling bounces
        num_bounces = 3
        for i in range(0, num_bounces):
            for processor in processors:
                verify_stopped(processor, self.stopped_message)
                verify_running(processor, self.running_message)

        stable_generation = -1
        for processor in processors:
            generations = extract_generation_from_logs(processor)
            num_bounce_generations = num_bounces * numThreads
            assert num_bounce_generations <= len(generations), \
                "Expected at least %d generation messages, actual %d" % (num_bounce_generations, len(generations))

            for generation in generations[-num_bounce_generations:]:
                generation = int(generation)
                if stable_generation == -1:
                    stable_generation = generation
                assert stable_generation == generation, \
                    "Streams rolling bounce caused an unexpected generation bump to %d" % generation

        self.verify_processing(processors)

        stop_processors(processors, self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def verify_processing(self, processors):
        for processor in processors:
            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                monitor.wait_until(
                    self.pattern,
                    timeout_sec=60,
                    err_msg="Never saw processing of %s " % self.pattern +
                    str(processor.node.account))

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
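
The generation check in this test reduces to: after num_bounces rolling bounces of numThreads threads each, the last num_bounces * numThreads generations pulled from the logs must all be the same value, because static membership should avoid generation bumps. The same check as a standalone sketch over a plain list:

def assert_stable_generations(generations, num_bounces, num_threads):
    window = num_bounces * num_threads
    assert window <= len(generations), \
        "Expected at least %d generation messages, got %d" % (window, len(generations))
    tail = set(int(g) for g in generations[-window:])
    assert len(tail) == 1, \
        "Rolling bounce caused unexpected generation bumps: %s" % sorted(tail)

assert_stable_generations(["7"] * 12, num_bounces=3, num_threads=3)  # a tail of 9 equal values passes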
Example No. 20
class StreamsBrokerCompatibility(Test):
    """
    These tests validate that Streams v0.10.2+ can connect to older brokers v0.10.1+
    and that Streams fails fast for brokers older than 0.10.1
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility,
              self).__init__(test_context=test_context)

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.input: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      },
                                      self.output: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      }
                                  })

        self.processor = StreamsBrokerCompatibilityService(
            self.test_context, self.kafka)

        self.consumer = VerifiableConsumer(
            test_context, 1, self.kafka, self.output,
            "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    @parametrize(broker_version=str(DEV_BRANCH))
    @parametrize(broker_version=str(LATEST_0_10_1))
    def test_compatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        self.processor.start()
        self.consumer.start()

        self.processor.wait()

        wait_until(
            lambda: self.consumer.total_consumed() > 0,
            timeout_sec=30,
            err_msg=
            "Expected to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        self.processor.start()

        self.processor.node.account.ssh(
            self.processor.start_cmd(self.processor.node))
        with self.processor.node.account.monitor_log(
                self.processor.STDERR_FILE) as monitor:
            monitor.wait_until(
                'Exception in thread "main" org.apache.kafka.streams.errors.StreamsException: Kafka Streams requires broker version 0.10.1.x or higher.',
                timeout_sec=60,
                err_msg="Never saw 'incompatible broker' error message " +
                str(self.processor.node.account))

        self.kafka.stop()
Example No. 21
class StreamsBrokerCompatibility(Test):
    """
    These tests validate that
    - Streams 0.11+ w/ EOS fails fast for older brokers 0.10.2 and 0.10.1
    - Streams 0.11+ w/o EOS works for older brokers 0.10.2 and 0.10.1
    - Streams fails fast for 0.10.0 brokers
    - Streams times out for pre-0.10.0 brokers
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.input: {'partitions': 1, 'replication-factor': 1},
                                      self.output: {'partitions': 1, 'replication-factor': 1}
                                  })
        self.consumer = VerifiableConsumer(test_context,
                                           1,
                                           self.kafka,
                                           self.output,
                                           "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    def test_fail_fast_on_incompatible_brokers_if_eos_enabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, True)

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until('FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: Cannot create a v0 FindCoordinator request because we require features supported only in 1 or later.',
                               timeout_sec=60,
                               err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: Cannot create a v0 FindCoordinator request because we require features supported only in 1 or later.' error message " + str(processor.node.account))

        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_11_0))
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    def test_compatible_brokers_eos_disabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False)
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(lambda: self.consumer.total_consumed() > 0, timeout_sec=30, err_msg="Expected to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False)

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until('FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support CREATE_TOPICS',
                        timeout_sec=60,
                        err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support CREATE_TOPICS' error message " + str(processor.node.account))

        self.kafka.stop()

    @ignore
    @parametrize(broker_version=str(LATEST_0_9))
    @parametrize(broker_version=str(LATEST_0_8_2))
    def test_timeout_on_pre_010_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False)

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until('Exception in thread "main" org.apache.kafka.streams.errors.BrokerNotFoundException: Could not find any available broker.',
                               timeout_sec=60,
                               err_msg="Never saw 'no available brokers' error message " + str(processor.node.account))

        self.kafka.stop()
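
The four rules in the docstring can be read as a broker-version matrix. Spelled out as data (illustrative only; the versions mirror the parametrize lists in this example):

expected_outcome = {
    # (broker_version, eos_enabled): expected behavior
    ("0.11.0", False): "works",
    ("0.10.2", True): "fail fast",
    ("0.10.2", False): "works",
    ("0.10.1", True): "fail fast",
    ("0.10.1", False): "works",
    ("0.10.0", False): "fail fast",
    ("0.9", False): "times out",
}
print(expected_outcome[("0.10.2", True)])  # fail fast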
Example No. 22
class StreamsBrokerCompatibility(Test):
    """
    These tests validate that
    - Streams 0.11+ w/ EOS fails fast for older brokers 0.10.2 and 0.10.1
    - Streams 0.11+ w/o EOS works for older brokers 0.10.2 and 0.10.1
    - Streams fails fast for 0.10.0 brokers
    - Streams times out for pre-0.10.0 brokers
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility,
              self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.input: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      },
                                      self.output: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      }
                                  })
        self.consumer = VerifiableConsumer(
            test_context, 1, self.kafka, self.output,
            "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    def test_fail_fast_on_incompatible_brokers_if_eos_enabled(
            self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        # the third constructor argument toggles exactly-once processing
        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka, True)

        # attach the log monitor before starting the service, so the fatal
        # message cannot be written before we begin watching
        with processor.node.account.monitor_log(
                processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support LIST_OFFSETS ',
                timeout_sec=60,
                err_msg=
                "Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support LIST_OFFSETS ' error message "
                + str(processor.node.account))

        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_11_0))
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    def test_compatible_brokers_eos_disabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka, False)
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(
            lambda: self.consumer.total_consumed() > 0,
            timeout_sec=30,
            err_msg=
            "Expected to read at least one message, but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka, False)
        # start inside the log monitor so the fatal message is not missed
        with processor.node.account.monitor_log(
                processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'FATAL: An unexpected exception org.apache.kafka.streams.errors.StreamsException: Could not create internal topics.',
                timeout_sec=60,
                err_msg=
                "Never saw 'FATAL: An unexpected exception org.apache.kafka.streams.errors.StreamsException: Could not create internal topics.' error message "
                + str(processor.node.account))

        self.kafka.stop()

    @ignore
    @parametrize(broker_version=str(LATEST_0_9))
    @parametrize(broker_version=str(LATEST_0_8_2))
    def test_timeout_on_pre_010_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context,
                                                      self.kafka, False)
        # start inside the log monitor so the timeout error is not missed
        with processor.node.account.monitor_log(
                processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'Exception in thread "main" org.apache.kafka.streams.errors.BrokerNotFoundException: Could not find any available broker.',
                timeout_sec=60,
                err_msg="Never saw 'no available brokers' error message " +
                str(processor.node.account))

        self.kafka.stop()
Example #23
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
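    # grep is run without -E in do_verify below, so alternation must be
    # written with escaped pipes (basic-regex syntax, a GNU grep extension)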
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'
    stopped_message = 'OPTIMIZE_TEST Streams Stopped'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {
                'partitions': 6
            },
            self.aggregation_topic: {
                'partitions': 6
            },
            self.reduce_topic: {
                'partitions': 6
            },
            self.join_topic: {
                'partitions': 6
            }
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zookeeper,
                                  topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(
            self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(
            self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(
            self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        self.logger.info("produce records continually during the test")
        self.producer.start()

        self.logger.info("start all processors unoptimized")
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.logger.info("verify unoptimized")
        self.verify_processing(processors, verify_individual_operations=False)

        self.logger.info("stop unoptimized")
        stop_processors(processors, self.stopped_message)

        self.logger.info("reset")
        self.reset_application()
        for processor in processors:
            for log_file in [processor.LOG_FILE, processor.STDOUT_FILE,
                             processor.STDERR_FILE, processor.CONFIG_FILE]:
                processor.node.account.ssh("mv %s %s.1" % (log_file, log_file),
                                           allow_fail=False)

        self.logger.info("start again with topology optimized")
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.logger.info("verify optimized")
        self.verify_processing(processors, verify_individual_operations=True)

        self.logger.info("stop optimized")
        stop_processors(processors, self.stopped_message)

        self.logger.info("teardown")
        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def reset_application(self):
        resetter = StreamsResetter(self.test_context,
                                   self.kafka,
                                   topic=self.input_topic,
                                   applicationId='StreamsOptimizedTest')
        resetter.start()
        # the resetter runs to completion on its own; stop() is still called
        # so the service verifies that the process has actually exited
        resetter.stop()
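
    # For reference: StreamsResetter wraps Kafka's application reset tool,
    # roughly equivalent to the following CLI (exact flags vary across
    # Kafka versions):
    #   bin/kafka-streams-application-reset.sh \
    #       --application-id StreamsOptimizedTest \
    #       --bootstrap-servers <broker:port> \
    #       --input-topics inputTopic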

    @staticmethod
    def verify_running_repartition_topic_count(processor,
                                               repartition_topic_count):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' %
                repartition_topic_count,
                timeout_sec=120,
                err_msg=
                "Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                % repartition_topic_count + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        # This test previously had logic to account for skewed assignments, in which not all processors may
        # receive active assignments. I don't think this will happen anymore, but keep an eye out if we see
        # test failures here. If that does resurface, note that the prior implementation was not correct.
        # A better approach would be to make sure we see processing of each partition across the whole cluster
        # instead of just expecting to see each node perform some processing.
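        # If that stricter check is ever needed, a sketch: capture the
        # partition ids from every node's STDOUT and assert that their union
        # covers all input partitions, e.g. equals set(range(6)).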
        for processor in processors:
            if verify_individual_operations:
                for operation in self.operation_pattern.split(r'\|'):
                    self.do_verify(processor, operation)
            else:
                self.do_verify(processor, self.operation_pattern)

    def do_verify(self, processor, pattern):
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" %
                         pattern)
        self.logger.info(
            list(
                processor.node.account.ssh_capture("ls -lh %s" %
                                                   (processor.STDOUT_FILE),
                                                   allow_fail=True)))
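        # node.account.ssh(...) returns the remote exit status when
        # allow_fail=True, so == 0 below means grep found at least one match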
        wait_until(lambda: processor.node.account.ssh(
            "grep --max-count 1 '%s' %s" % (pattern, processor.STDOUT_FILE),
            allow_fail=True) == 0,
                   timeout_sec=60,
                   err_msg="Never saw processing of pattern %s on %s" %
                           (pattern, str(processor.node.account)))

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
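
The stop_processors helper called above is defined elsewhere in the module. A minimal sketch of what it has to do, assuming each service exposes a synchronous stop() and writes stopped_message to its STDOUT_FILE on clean shutdown:

def stop_processors(processors, stopped_message):
    for processor in processors:
        # attach the monitor before stopping so the shutdown line is not missed
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            monitor.wait_until(stopped_message,
                               timeout_sec=60,
                               err_msg="never saw '%s' on %s" %
                                       (stopped_message, str(processor.node.account)))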