class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=inactive_topics,
                                        active_topics=active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
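# The topic maps above use Trogdor's bracketed range shorthand: "produce_bench_topic[0-1]" stands
# for a run of numbered topics (produce_bench_topic0, produce_bench_topic1) that all share the
# partition/replication settings of that entry. The helper below is only an illustration of that
# naming convention; expand_topic_range() is not part of the kafkatest API, and it assumes a
# single trailing "[lo-hi]" range.
import re

def expand_topic_range(pattern):
    """Expand e.g. 'produce_bench_topic[2-9]' -> ['produce_bench_topic2', ..., 'produce_bench_topic9']."""
    match = re.fullmatch(r"(.+)\[(\d+)-(\d+)\]", pattern)
    if match is None:
        return [pattern]  # no range suffix: already a concrete topic name
    prefix, lo, hi = match.group(1), int(match.group(2)), int(match.group(3))
    return ["%s%d" % (prefix, i) for i in range(lo, hi + 1)]

assert expand_topic_range("produce_bench_topic[0-1]") == ["produce_bench_topic0", "produce_bench_topic1"]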
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """
    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    operation_pattern = 'AGGREGATED\|REDUCED\|JOINED'
    stopped_message = 'OPTIMIZE_TEST Streams Stopped'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6},
            self.reduce_topic: {'partitions': 6},
            self.join_topic: {'partitions': 6}
        }
        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)
        self.producer = VerifiableProducer(self.test_context, 1, self.kafka,
                                           self.input_topic, throughput=1000, acks=1)

    def test_upgrade_optimized_topology(self):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors unoptimized
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.verify_processing(processors, verify_individual_operations=False)

        stop_processors(processors, self.stopped_message)

        # start again with topology optimized
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.verify_processing(processors, verify_individual_operations=True)

        stop_processors(processors, self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor, repartition_topic_count):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until('REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count,
                               timeout_sec=120,
                               err_msg="Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                                       % repartition_topic_count + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        for processor in processors:
            if not self.all_source_subtopology_tasks(processor):
                if verify_individual_operations:
                    for operation in self.operation_pattern.split('\|'):
                        self.do_verify(processor, operation)
                else:
                    self.do_verify(processor, self.operation_pattern)
            else:
                self.logger.info("Skipping processor %s with all source tasks" % processor.node.account)

    def do_verify(self, processor, pattern):
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def all_source_subtopology_tasks(self, processor):
        retries = 0
        while retries < 5:
            # the back-reference must reach sed as a literal "\1", hence the escaped backslash
            found = list(processor.node.account.ssh_capture(
                "sed -n 's/.*current active tasks: \[\(\(0_[0-9], \)\{3\}0_[0-9]\)\].*/\\1/p' %s" % processor.LOG_FILE,
                allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found)
            if len(found) > 0:
                return True
            retries += 1
            time.sleep(1)

        return False

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
class StreamsCooperativeRebalanceUpgradeTest(Test):
    """
    Test of a rolling upgrade from eager rebalance to cooperative rebalance
    """
    source_topic = "source"
    sink_topic = "sink"
    task_delimiter = "#"
    report_interval = "1000"
    processing_message = "Processed [0-9]* records so far"
    stopped_message = "COOPERATIVE-REBALANCE-TEST-CLIENT-CLOSED"
    running_state_msg = "STREAMS in a RUNNING State"
    cooperative_turned_off_msg = "Eager rebalancing enabled now for upgrade from %s"
    cooperative_enabled_msg = "Cooperative rebalancing enabled now"
    first_bounce_phase = "first_bounce_phase-"
    second_bounce_phase = "second_bounce_phase-"

    # !!CAUTION!!: THIS LIST OF VERSIONS IS FIXED, NO VERSIONS MUST BE ADDED
    streams_eager_rebalance_upgrade_versions = [str(LATEST_0_10_0), str(LATEST_0_10_1), str(LATEST_0_10_2),
                                                str(LATEST_0_11_0), str(LATEST_1_0), str(LATEST_1_1),
                                                str(LATEST_2_0), str(LATEST_2_1), str(LATEST_2_2),
                                                str(LATEST_2_3)]

    def __init__(self, test_context):
        super(StreamsCooperativeRebalanceUpgradeTest, self).__init__(test_context)
        self.topics = {
            self.source_topic: {'partitions': 9},
            self.sink_topic: {'partitions': 9}
        }
        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)
        self.producer = VerifiableProducer(self.test_context, 1, self.kafka,
                                           self.source_topic, throughput=1000, acks=1)

    @matrix(upgrade_from_version=streams_eager_rebalance_upgrade_versions)
    def test_upgrade_to_cooperative_rebalance(self, upgrade_from_version):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = CooperativeRebalanceUpgradeService(self.test_context, self.kafka)
        processor2 = CooperativeRebalanceUpgradeService(self.test_context, self.kafka)
        processor3 = CooperativeRebalanceUpgradeService(self.test_context, self.kafka)
        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors without upgrade_from config; normal operations mode
        self.logger.info("Starting all streams clients in normal running mode")
        for processor in processors:
            processor.set_version(upgrade_from_version)
            self.set_props(processor)
            processor.CLEAN_NODE_ENABLED = False
            # can't use state, as older versions don't have a state listener,
            # so just verify up and running
            verify_running(processor, self.processing_message)

        # all running, rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.processing_message)

        # first rolling bounce with "upgrade.from" config set
        previous_phase = ""
        self.maybe_upgrade_rolling_bounce_and_verify(processors, previous_phase,
                                                     self.first_bounce_phase, upgrade_from_version)

        # all nodes processing, rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.first_bounce_phase + self.processing_message)

        # second rolling bounce without "upgrade.from" config
        self.maybe_upgrade_rolling_bounce_and_verify(processors, self.first_bounce_phase,
                                                     self.second_bounce_phase)

        # all nodes processing, rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.second_bounce_phase + self.processing_message)

        # now verify tasks are unique
        for processor in processors:
            self.get_tasks_for_processor(processor)
            self.logger.info("Active tasks %s" % processor.active_tasks)

        overlapping_tasks = processor1.active_tasks.intersection(processor2.active_tasks)
        assert len(overlapping_tasks) == 0, \
            "Final task assignments are not unique %s %s" % (processor1.active_tasks, processor2.active_tasks)

        overlapping_tasks = processor1.active_tasks.intersection(processor3.active_tasks)
        assert len(overlapping_tasks) == 0, \
            "Final task assignments are not unique %s %s" % (processor1.active_tasks, processor3.active_tasks)

        overlapping_tasks = processor2.active_tasks.intersection(processor3.active_tasks)
        assert len(overlapping_tasks) == 0, \
            "Final task assignments are not unique %s %s" % (processor2.active_tasks, processor3.active_tasks)

        # test done, close all down
        stop_processors(processors, self.second_bounce_phase + self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def maybe_upgrade_rolling_bounce_and_verify(self, processors, previous_phase,
                                                current_phase, upgrade_from_version=None):
        for processor in processors:
            # stop the processor in prep for setting or removing "upgrade.from"
            verify_stopped(processor, previous_phase + self.stopped_message)
            # upgrade to version with cooperative rebalance
            processor.set_version("")
            processor.set_upgrade_phase(current_phase)

            if upgrade_from_version is not None:
                # need to remove minor version numbers for check of valid upgrade-from numbers
                upgrade_version = upgrade_from_version[:upgrade_from_version.rfind('.')]
                rebalance_mode_msg = self.cooperative_turned_off_msg % upgrade_version
            else:
                upgrade_version = None
                rebalance_mode_msg = self.cooperative_enabled_msg

            self.set_props(processor, upgrade_version)
            node = processor.node
            with node.account.monitor_log(processor.STDOUT_FILE) as stdout_monitor:
                with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                    processor.start()
                    # verify correct rebalance mode, either turned off for upgrade or enabled after upgrade
                    log_monitor.wait_until(rebalance_mode_msg,
                                           timeout_sec=60,
                                           err_msg="Never saw '%s' message " % rebalance_mode_msg + str(processor.node.account))

                # verify rebalanced into a running state
                rebalance_msg = current_phase + self.running_state_msg
                stdout_monitor.wait_until(rebalance_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw '%s' message " % rebalance_msg + str(processor.node.account))

                # verify processing
                verify_processing_msg = current_phase + self.processing_message
                stdout_monitor.wait_until(verify_processing_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw '%s' message " % verify_processing_msg + str(processor.node.account))

    def verify_processing(self, processor, pattern):
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def get_tasks_for_processor(self, processor):
        retries = 0
        while retries < 5:
            found_tasks = list(processor.node.account.ssh_capture(
                "grep TASK-ASSIGNMENTS %s | tail -n 1" % processor.STDOUT_FILE, allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found_tasks)
            if len(found_tasks) > 0:
                task_string = str(found_tasks[0]).strip()
                self.logger.info("Converted %s from assigned task check" % task_string)
                processor.set_tasks(task_string)
                return
            retries += 1
            time.sleep(1)

        return

    def set_props(self, processor, upgrade_from=None):
        processor.SOURCE_TOPIC = self.source_topic
        processor.SINK_TOPIC = self.sink_topic
        processor.REPORT_INTERVAL = self.report_interval
        processor.UPGRADE_FROM = upgrade_from
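# maybe_upgrade_rolling_bounce_and_verify() above drops the last version component before building
# the expected "upgrade from" log line, via upgrade_from_version[:upgrade_from_version.rfind('.')].
# A minimal stand-alone illustration of what that slice does (pure stdlib, no kafkatest imports):
def drop_last_version_component(version_string):
    """'2.3.1' -> '2.3', mirroring the slicing used in the test above."""
    return version_string[:version_string.rfind('.')]

assert drop_last_version_component("2.3.1") == "2.3"
assert drop_last_version_component("0.11.0.3") == "0.11.0"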
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                     self.workload_service.client_node,
                                                     self.workload_service.bootstrap_servers,
                                                     target_messages_per_sec=10000, max_messages=100000,
                                                     active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=max_messages,
                                                producer_conf={}, admin_client_conf={}, common_client_conf={},
                                                inactive_topics={}, active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"])  # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=10000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2500,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_topics(self):
        """
        Runs multiple consumers to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=5000,  # all should read exactly 5k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=5,
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_two_consumers_specified_group_topics(self):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should dynamically get assigned partitions from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,  # both should read at least 2k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=2,
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_partitions(self):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=4,
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_specified_group_partitions_should_raise(self):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=4,
                                                consumer_group="fail_group",
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception("Should have raised an exception due to an invalid configuration")
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        if quorum.for_test(test_context) == quorum.zk:
            trogdor_client_services = [self.zk, self.kafka, self.workload_service]
        elif quorum.for_test(test_context) == quorum.remote_kraft:
            trogdor_client_services = [self.kafka.controller_quorum, self.kafka, self.workload_service]
        else:  # co-located case, which we currently don't test but handle here for completeness in case we do test it
            trogdor_client_services = [self.kafka, self.workload_service]
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=trogdor_client_services)
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                     self.workload_service.client_node,
                                                     self.workload_service.bootstrap_servers,
                                                     target_messages_per_sec=10000, max_messages=100000,
                                                     active_topics=active_topics)

    def setUp(self):
        if self.zk:
            self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    def remote_quorum_nodes(self):
        if quorum.for_test(self.test_context) == quorum.zk:
            return self.zk.nodes
        elif quorum.for_test(self.test_context) == quorum.remote_kraft:
            return self.kafka.controller_quorum.nodes
        else:  # co-located case, which we currently don't test but handle here for completeness in case we do test it
            return []

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload_with_broker_partition(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_broker_pause(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_client_partition(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_latency(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        spec = DegradedNetworkFaultSpec(0, 60000)
        for node in self.kafka.nodes + self.remote_quorum_nodes():
            spec.add_node_spec(node.name, "eth0", latencyMs=100, rateLimitKbit=3000)
        slow1 = self.trogdor.create_task("slow1", spec)
        workload1.wait_for_done(timeout_sec=600)
        slow1.stop()
        slow1.wait_for_done()
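# Illustrative only: test_produce_consume_with_latency() shapes every broker/quorum NIC with
# latencyMs=100 and rateLimitKbit=3000 via DegradedNetworkFaultSpec. The arithmetic below is a
# rough sanity check of that rate limit against the 10,000 msg/s round-trip target; the message
# size is an assumption made up for this sketch, not something the spec defines.
rate_limit_kbit = 3000
bytes_per_sec = rate_limit_kbit * 1000 // 8        # 375,000 bytes/s through the shaped interface
assumed_message_bytes = 512                        # assumption for illustration only
messages_per_sec_under_limit = bytes_per_sec // assumed_message_bytes  # ~732 msg/s with that guess
assert messages_per_sec_under_limit < 10000        # so the degraded link, not the workload, is the bottleneck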
class StreamsBrokerCompatibility(Test):
    """
    These tests validate that
    - Streams works for older brokers 0.11 (or newer)
    - Streams w/ EOS-alpha works for older brokers 0.11 (or newer)
    - Streams w/ EOS-beta works for older brokers 2.5 (or newer)
    - Streams fails fast for older brokers 0.10.0, 0.10.1, and 0.10.2
    - Streams w/ EOS-beta fails fast for older brokers 2.4 or older
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.input: {'partitions': 1, 'replication-factor': 1},
                                      self.output: {'partitions': 1, 'replication-factor': 1}
                                  },
                                  server_prop_overides=[
                                      ["transaction.state.log.replication.factor", "1"],
                                      ["transaction.state.log.min.isr", "1"]
                                  ])
        self.consumer = VerifiableConsumer(test_context, 1, self.kafka, self.output,
                                           "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_compatible_brokers_eos_disabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, "at_least_once")
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(lambda: self.consumer.total_consumed() > 0,
                   timeout_sec=30,
                   err_msg="Did expect to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_2_5))
    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_compatible_brokers_eos_alpha_enabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, "exactly_once")
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(lambda: self.consumer.total_consumed() > 0,
                   timeout_sec=30,
                   err_msg="Did expect to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    # TODO enable after 2.5 is released
    # @parametrize(broker_version=str(LATEST_2_5))
    # def test_compatible_brokers_eos_beta_enabled(self, broker_version):
    #     self.kafka.set_version(KafkaVersion(broker_version))
    #     self.kafka.start()
    #
    #     processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, "exactly_once_beta")
    #     processor.start()
    #
    #     self.consumer.start()
    #
    #     processor.wait()
    #
    #     wait_until(lambda: self.consumer.total_consumed() > 0, timeout_sec=30,
    #                err_msg="Did expect to read a message but got none within 30 seconds.")
    #
    #     self.consumer.stop()
    #     self.kafka.stop()

    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, "at_least_once")

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            processor.start()
            monitor.wait_until('FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException',
                               timeout_sec=60,
                               err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException' error message " + str(processor.node.account))

        self.kafka.stop()

    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_fail_fast_on_incompatible_brokers_if_eos_beta_enabled(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, "exactly_once_beta")

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            with processor.node.account.monitor_log(processor.LOG_FILE) as log:
                processor.start()
                log.wait_until('Shutting down because the Kafka cluster seems to be on a too old version. Setting processing\.guarantee="exactly_once_beta" requires broker version 2\.5 or higher\.',
                               timeout_sec=60,
                               err_msg="Never saw 'Shutting down because the Kafka cluster seems to be on a too old version. Setting processing.guarantee=\"exactly_once_beta\" requires broker version 2.5 or higher.' log message " + str(processor.node.account))
                monitor.wait_until('FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException',
                                   timeout_sec=60,
                                   err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException' error message " + str(processor.node.account))

        self.kafka.stop()
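# The docstring above lists the broker-version gates these tests exercise: Streams (with or
# without EOS-alpha) needs brokers >= 0.11.0, EOS-beta needs brokers >= 2.5, and anything older
# must fail fast. A minimal sketch of that gate as a tuple comparison; supports_eos_beta() is a
# hypothetical helper written for illustration, not part of StreamsBrokerCompatibilityService.
def parse_version(version_string):
    return tuple(int(part) for part in version_string.split("."))

def supports_eos_beta(broker_version):
    """True when the broker is new enough for processing.guarantee=exactly_once_beta."""
    return parse_version(broker_version) >= (2, 5)

assert supports_eos_beta("2.5.0")
assert not supports_eos_beta("2.4.1")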
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than specified timeouts in configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"

    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {'partitions': 3, 'replication-factor': 1},
                                      self.outputTopic: {'partitions': 1, 'replication-factor': 1}
                                  })

    def get_consumer(self, num_messages):
        return VerifiableConsumer(self.test_context, 1, self.kafka, self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=num_messages)

    def get_producer(self, num_messages):
        return VerifiableProducer(self.test_context, 1, self.kafka, self.inputTopic,
                                  max_messages=num_messages, acks=1)

    def assert_produce_consume(self, test_state, num_messages=5):
        producer = self.get_producer(num_messages)
        producer.start()
        wait_until(lambda: producer.num_acked >= num_messages,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer(num_messages)
        consumer.start()
        wait_until(lambda: consumer.total_consumed() >= num_messages,
                   timeout_sec=60,
                   err_msg="At %s streams did not process messages in 60 seconds " % test_state)

    @staticmethod
    def get_configs(extra_configs=""):
        # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # java code expects configs in key=value,key=value format
        updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms + extra_configs

        return updated_configs

    def wait_for_verification(self, processor, message, file, num_lines=1):
        wait_until(lambda: self.verify_from_file(processor, message, file) >= num_lines,
                   timeout_sec=60,
                   err_msg="Did expect to read '%s' from %s" % (message, processor.node.account))

    @staticmethod
    def verify_from_file(processor, message, file):
        result = processor.node.account.ssh_output("grep '%s' %s | wc -l" % (message, file), allow_fail=False)
        return int(result)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Broker should be down over 2x of retries * timeout ms
        # So with (2 * 15000) = 30 seconds, we'll set downtime to 70 seconds
        broker_down_time_in_seconds = 70

        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, self.get_configs())
        processor.start()

        # until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down
        # After KIP-91 is merged we'll continue to send messages the duration of the test
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)
        time.sleep(broker_down_time_in_seconds)
        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()

    def test_streams_runs_with_broker_down_initially(self):
        self.kafka.start()
        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        configs = self.get_configs(extra_configs=",application.id=starting_wo_broker_id")

        # start streams with broker down initially
        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_3.start()

        broker_unavailable_message = "Broker may not be available"

        # verify streams instances unable to connect to broker, kept trying
        self.wait_for_verification(processor, broker_unavailable_message, processor.LOG_FILE, 100)
        self.wait_for_verification(processor_2, broker_unavailable_message, processor_2.LOG_FILE, 100)
        self.wait_for_verification(processor_3, broker_unavailable_message, processor_3.LOG_FILE, 100)

        # now start broker
        self.kafka.start_node(node)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("running_with_broker_down_initially", num_messages=9)

        message = "processed3messages"
        # need to show all 3 instances processed messages
        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message, processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message, processor_3.STDOUT_FILE)

        self.kafka.stop()

    def test_streams_should_scale_in_while_brokers_down(self):
        self.kafka.start()

        configs = self.get_configs(extra_configs=",application.id=shutdown_with_broker_down")

        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_3.start()

        # need to wait for rebalance once
        self.wait_for_verification(processor_3, "State transition from REBALANCING to RUNNING", processor_3.LOG_FILE)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("waiting for rebalance to complete", num_messages=9)

        message = "processed3messages"
        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message, processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message, processor_3.STDOUT_FILE)

        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        processor.stop()
        processor_2.stop()

        shutdown_message = "Complete shutdown of streams resilience test app now"
        self.wait_for_verification(processor, shutdown_message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, shutdown_message, processor_2.STDOUT_FILE)

        self.kafka.start_node(node)

        self.assert_produce_consume("sending_message_after_stopping_streams_instance_bouncing_broker", num_messages=9)

        self.wait_for_verification(processor_3, "processed9messages", processor_3.STDOUT_FILE)

        self.kafka.stop()
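# get_configs() above encodes a timing relationship: the consumer's max.poll.interval.ms (50s)
# must exceed min(max.block.ms, (retries + 1) * request.timeout.ms) so the producer gives up
# before the consumer is evicted, and the broker outage in test_streams_resilient_to_broker_down
# (70s) comfortably exceeds twice that producer bound. The arithmetic with the exact values used:
max_poll_interval_ms = 50000
producer_retries = 2
request_timeout_ms = 15000
max_block_ms = 30000
producer_bound_ms = min(max_block_ms, (producer_retries + 1) * request_timeout_ms)  # min(30000, 45000) = 30000
assert max_poll_interval_ms > producer_bound_ms
broker_down_time_in_seconds = 70
assert broker_down_time_in_seconds * 1000 > 2 * producer_bound_ms  # 70s > 60s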
class ReplicaScaleTest(Test):
    def __init__(self, test_context):
        super(ReplicaScaleTest, self).__init__(test_context=test_context)
        self.test_context = test_context
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=8, zk=self.zk)

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        # Need to increase the timeout due to partition count
        for node in self.kafka.nodes:
            self.kafka.stop_node(node, clean_shutdown=False, timeout_sec=60)
        self.kafka.stop()
        self.zk.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_produce_consume(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "replicas_produce_consume_%d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)
        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka)
        consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka)
        trogdor = TrogdorService(context=self.test_context,
                                 client_services=[self.kafka, producer_workload_service, consumer_workload_service])
        trogdor.start()

        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                producer_workload_service.producer_node,
                                                producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000, max_messages=3400000,
                                                producer_conf={}, admin_client_conf={}, common_client_conf={},
                                                inactive_topics={},
                                                active_topics={"replicas_produce_consume_[0-2]": {
                                                    "numPartitions": partition_count,
                                                    "replicationFactor": replication_factor
                                                }})
        produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed produce bench")

        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                consumer_workload_service.consumer_node,
                                                consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000, max_messages=3400000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=["replicas_produce_consume_[0-2]"])
        consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed consume bench")

        trogdor.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_clean_bounce(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)
        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        restart_times = []
        for node in self.kafka.nodes:
            broker_bounce_start_time = time.time()
            self.kafka.stop_node(node, clean_shutdown=True, timeout_sec=600)
            self.kafka.start_node(node, timeout_sec=600)
            broker_bounce_end_time = time.time()
            restart_times.append(broker_bounce_end_time - broker_bounce_start_time)
            self.logger.info("Time to restart %s: %d" % (node.name, broker_bounce_end_time - broker_bounce_start_time))
        self.logger.info("Restart times: %s" % restart_times)

        delete_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            self.logger.info("Deleting topic %s" % topic)
            self.kafka.delete_topic(topic)
        delete_end_time = time.time()
        self.logger.info("Time to delete topics: %d" % (delete_end_time - delete_start_time))
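# For scale context: the parametrization above (500 topics x 34 partitions, replication factor 3,
# on an 8-broker cluster) works out as follows. This is just the arithmetic implied by the
# parameters, spelled out:
topic_count, partition_count, replication_factor, broker_count = 500, 34, 3, 8
total_partitions = topic_count * partition_count              # 17,000 partitions
total_replicas = total_partitions * replication_factor        # 51,000 partition replicas
average_replicas_per_broker = total_replicas // broker_count  # 6,375 replicas per broker
assert (total_partitions, total_replicas, average_replicas_per_broker) == (17000, 51000, 6375)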
class StreamsNamedRepartitionTopicTest(Test):
    """
    Tests using a named repartition topic: start the application, then do a rolling upgrade
    with added operations, and verify the application still runs
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    pattern = 'AGGREGATED'

    def __init__(self, test_context):
        super(StreamsNamedRepartitionTopicTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6}
        }
        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)
        self.producer = VerifiableProducer(self.test_context, 1, self.kafka,
                                           self.input_topic, throughput=1000, acks=1)

    def test_upgrade_topology_with_named_repartition_topic(self):
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsNamedRepartitionTopicService(self.test_context, self.kafka)
        processor2 = StreamsNamedRepartitionTopicService(self.test_context, self.kafka)
        processor3 = StreamsNamedRepartitionTopicService(self.test_context, self.kafka)
        processors = [processor1, processor2, processor3]

        self.producer.start()

        for processor in processors:
            processor.CLEAN_NODE_ENABLED = False
            self.set_topics(processor)
            self.verify_running(processor, 'REBALANCING -> RUNNING')

        self.verify_processing(processors)

        # do rolling upgrade
        for processor in processors:
            self.verify_stopped(processor)
            # will tell app to add operations before repartition topic
            processor.ADD_ADDITIONAL_OPS = 'true'
            self.verify_running(processor, 'UPDATED Topology')

        self.verify_processing(processors)

        self.stop_processors(processors)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running(processor, message):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(message,
                               timeout_sec=60,
                               err_msg="Never saw '%s' message " % message + str(processor.node.account))

    @staticmethod
    def verify_stopped(processor):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            monitor.wait_until('NAMED_REPARTITION_TEST Streams Stopped',
                               timeout_sec=60,
                               err_msg="Never saw 'NAMED_REPARTITION_TEST Streams Stopped' message " + str(processor.node.account))

    def verify_processing(self, processors):
        for processor in processors:
            with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
                monitor.wait_until(self.pattern,
                                   timeout_sec=60,
                                   err_msg="Never saw processing of %s " % self.pattern + str(processor.node.account))

    def stop_processors(self, processors):
        for processor in processors:
            self.verify_stopped(processor)

    def set_topics(self, processor):
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=max_messages,
                                                producer_conf={}, admin_client_conf={}, common_client_conf={},
                                                inactive_topics={}, active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"])  # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=10000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_bench_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2500,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_group_bench(self):
        """
        Runs two ConsumeBench workloads in the same consumer group to read messages from topics
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,  # both should read at least 2k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload_1 = self.trogdor.create_task("consume_workload_1", consume_spec)
        consume_workload_2 = self.trogdor.create_task("consume_workload_2", consume_spec)
        consume_workload_1.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 1 finished")
        consume_workload_2.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 2 finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
class StreamsBrokerDownResilience(Test): """ This test validates that Streams is resilient to a broker being down longer than specified timeouts in configs """ inputTopic = "streamsResilienceSource" outputTopic = "streamsResilienceSink" num_messages = 5 def __init__(self, test_context): super(StreamsBrokerDownResilience, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.inputTopic: { 'partitions': 1, 'replication-factor': 1 }, self.outputTopic: { 'partitions': 1, 'replication-factor': 1 } }) def get_consumer(self): return VerifiableConsumer(self.test_context, 1, self.kafka, self.outputTopic, "stream-broker-resilience-verify-consumer", max_messages=self.num_messages) def get_producer(self): return VerifiableProducer(self.test_context, 1, self.kafka, self.inputTopic, max_messages=self.num_messages, acks=1) def assert_produce_consume(self, test_state): producer = self.get_producer() producer.start() wait_until(lambda: producer.num_acked > 0, timeout_sec=30, err_msg="At %s failed to send messages " % test_state) consumer = self.get_consumer() consumer.start() wait_until( lambda: consumer.total_consumed() > 0, timeout_sec=120, err_msg="At %s streams did not process messages in 120 seconds " % test_state) def setUp(self): self.zk.start() def test_streams_resilient_to_broker_down(self): self.kafka.start() # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout) consumer_poll_ms = "consumer.max.poll.interval.ms=50000" retries_config = "producer.retries=2" request_timeout = "producer.request.timeout.ms=15000" max_block_ms = "producer.max.block.ms=30000" # Broker should be down for more than 2x (retries * request.timeout.ms) # With retries * request.timeout.ms = 2 * 15000 ms = 30 seconds, we'll set the downtime to 70 seconds broker_down_time_in_seconds = 70 # the Java code expects configs in key=value,key=value format updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, updated_configs) processor.start() # Until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down # After KIP-91 is merged we'll continue to send messages for the duration of the test self.assert_produce_consume("before_broker_stop") node = self.kafka.leader(self.inputTopic) self.kafka.stop_node(node) time.sleep(broker_down_time_in_seconds) self.kafka.start_node(node) self.assert_produce_consume("after_broker_stop") self.kafka.stop()
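# The "key=value,key=value" format mentioned above resolves to a single comma-separated
# string; a small sketch (values copied from the test) of what updated_configs ends up as:
configs = ",".join([
    "consumer.max.poll.interval.ms=50000",
    "producer.retries=2",
    "producer.request.timeout.ms=15000",
    "producer.max.block.ms=30000",
])
# configs == "consumer.max.poll.interval.ms=50000,producer.retries=2,producer.request.timeout.ms=15000,producer.max.block.ms=30000"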
class ProduceBenchTest(Test): def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(ProduceBenchTest, self).__init__(test_context) self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk) self.workload_service = ProduceBenchWorkloadService( test_context, self.kafka) self.trogdor = TrogdorService( context=self.test_context, client_services=[self.kafka, self.workload_service]) self.active_topics = { "produce_bench_topic[0-1]": { "numPartitions": 1, "replicationFactor": 3 } } self.inactive_topics = { "produce_bench_topic[2-9]": { "numPartitions": 1, "replicationFactor": 3 } } def setUp(self): self.trogdor.start() if self.zk: self.zk.start() self.kafka.start() def teardown(self): self.trogdor.stop() self.kafka.stop() if self.zk: self.zk.stop() @cluster(num_nodes=8) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_produce_bench(self, metadata_quorum=quorum.zk): spec = ProduceBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.workload_service.producer_node, self.workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=100000, producer_conf={}, admin_client_conf={}, common_client_conf={}, inactive_topics=self.inactive_topics, active_topics=self.active_topics) workload1 = self.trogdor.create_task("workload1", spec) workload1.wait_for_done(timeout_sec=360) tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2)) @cluster(num_nodes=8) def test_produce_bench_transactions(self, metadata_quorum=quorum.zk): spec = ProduceBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.workload_service.producer_node, self.workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=100000, producer_conf={}, admin_client_conf={}, common_client_conf={}, inactive_topics=self.inactive_topics, active_topics=self.active_topics, transaction_generator={ # 10 transactions with 10k messages "type": "uniform", "messagesPerTransaction": "10000" }) workload1 = self.trogdor.create_task("workload1", spec) workload1.wait_for_done(timeout_sec=360) tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
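# Quick arithmetic behind the "10 transactions with 10k messages" comment in the transactional
# produce test above (values copied from the spec; this is an illustrative check only):
max_messages = 100000
messages_per_transaction = 10000
assert max_messages // messages_per_transaction == 10  # uniform generator => 10 transactions of 10k messages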
class ConsumeBenchTest(Test): def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(ConsumeBenchTest, self).__init__(test_context) self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk) self.producer_workload_service = ProduceBenchWorkloadService( test_context, self.kafka) self.consumer_workload_service = ConsumeBenchWorkloadService( test_context, self.kafka) self.consumer_workload_service_2 = ConsumeBenchWorkloadService( test_context, self.kafka) self.active_topics = { "consume_bench_topic[0-5]": { "numPartitions": 5, "replicationFactor": 3 } } self.trogdor = TrogdorService(context=self.test_context, client_services=[ self.kafka, self.producer_workload_service, self.consumer_workload_service, self.consumer_workload_service_2 ]) def setUp(self): self.trogdor.start() if self.zk: self.zk.start() self.kafka.start() def teardown(self): self.trogdor.stop() self.kafka.stop() if self.zk: self.zk.stop() def produce_messages(self, topics, max_messages=10000): produce_spec = ProduceBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.producer_workload_service.producer_node, self.producer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=max_messages, producer_conf={}, admin_client_conf={}, common_client_conf={}, inactive_topics={}, active_topics=topics) produce_workload = self.trogdor.create_task("produce_workload", produce_spec) produce_workload.wait_for_done(timeout_sec=180) self.logger.debug("Produce workload finished") @cluster(num_nodes=10) @matrix(topics=[["consume_bench_topic[0-5]"]], metadata_quorum=quorum.all_non_upgrade) # topic subscription @matrix(topics=[["consume_bench_topic[0-5]:[0-4]"]], metadata_quorum=quorum.all_non_upgrade) # manual topic assignment def test_consume_bench(self, topics, metadata_quorum=quorum.zk): """ Runs a ConsumeBench workload to consume messages """ self.produce_messages(self.active_topics) consume_spec = ConsumeBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.consumer_workload_service.consumer_node, self.consumer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=10000, consumer_conf={}, admin_client_conf={}, common_client_conf={}, active_topics=topics) consume_workload = self.trogdor.create_task("consume_workload", consume_spec) consume_workload.wait_for_done(timeout_sec=360) self.logger.debug("Consume workload finished") tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2)) @cluster(num_nodes=10) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_single_partition(self, metadata_quorum=quorum.zk): """ Run a ConsumeBench against a single partition """ active_topics = { "consume_bench_topic": { "numPartitions": 2, "replicationFactor": 3 } } self.produce_messages(active_topics, 5000) consume_spec = ConsumeBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.consumer_workload_service.consumer_node, self.consumer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=2500, consumer_conf={}, admin_client_conf={}, common_client_conf={}, active_topics=["consume_bench_topic:1"]) consume_workload = self.trogdor.create_task("consume_workload", consume_spec) consume_workload.wait_for_done(timeout_sec=180) self.logger.debug("Consume workload finished") tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2)) 
@cluster(num_nodes=10) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_multiple_consumers_random_group_topics(self, metadata_quorum=quorum.zk): """ Runs multiple consumers to read messages from topics. Since a consumerGroup isn't specified, each consumer should read from all topics independently """ self.produce_messages(self.active_topics, max_messages=5000) consume_spec = ConsumeBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.consumer_workload_service.consumer_node, self.consumer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=5000, # all should read exactly 5k messages consumer_conf={}, admin_client_conf={}, common_client_conf={}, threads_per_worker=5, active_topics=["consume_bench_topic[0-5]"]) consume_workload = self.trogdor.create_task("consume_workload", consume_spec) consume_workload.wait_for_done(timeout_sec=360) self.logger.debug("Consume workload finished") tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2)) @cluster(num_nodes=10) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_two_consumers_specified_group_topics(self, metadata_quorum=quorum.zk): """ Runs two consumers in the same consumer group to read messages from topics. Since a consumerGroup is specified, each consumer should dynamically get assigned partitions from the group """ self.produce_messages(self.active_topics) consume_spec = ConsumeBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.consumer_workload_service.consumer_node, self.consumer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=2000, # both should read at least 2k messages consumer_conf={}, admin_client_conf={}, common_client_conf={}, threads_per_worker=2, consumer_group="testGroup", active_topics=["consume_bench_topic[0-5]"]) consume_workload = self.trogdor.create_task("consume_workload", consume_spec) consume_workload.wait_for_done(timeout_sec=360) self.logger.debug("Consume workload finished") tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2)) @cluster(num_nodes=10) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_multiple_consumers_random_group_partitions( self, metadata_quorum=quorum.zk): """ Runs multiple consumers to read messages from specific partitions. Since a consumerGroup isn't specified, each consumer will get assigned a random group and consume from all partitions """ self.produce_messages(self.active_topics, max_messages=20000) consume_spec = ConsumeBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.consumer_workload_service.consumer_node, self.consumer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=2000, consumer_conf={}, admin_client_conf={}, common_client_conf={}, threads_per_worker=4, active_topics=["consume_bench_topic1:[0-4]"]) consume_workload = self.trogdor.create_task("consume_workload", consume_spec) consume_workload.wait_for_done(timeout_sec=360) self.logger.debug("Consume workload finished") tasks = self.trogdor.tasks() self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2)) @cluster(num_nodes=10) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_multiple_consumers_specified_group_partitions_should_raise( self, metadata_quorum=quorum.zk): """ Runs multiple consumers in the same group to read messages from specific partitions. It is an invalid configuration to provide a consumer group and specific partitions.
""" expected_error_msg = 'explicit partition assignment' self.produce_messages(self.active_topics, max_messages=20000) consume_spec = ConsumeBenchWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.consumer_workload_service.consumer_node, self.consumer_workload_service.bootstrap_servers, target_messages_per_sec=1000, max_messages=2000, consumer_conf={}, admin_client_conf={}, common_client_conf={}, threads_per_worker=4, consumer_group="fail_group", active_topics=["consume_bench_topic1:[0-4]"]) consume_workload = self.trogdor.create_task("consume_workload", consume_spec) try: consume_workload.wait_for_done(timeout_sec=360) raise Exception( "Should have raised an exception due to an invalid configuration" ) except RuntimeError as e: if expected_error_msg not in str(e): raise RuntimeError("Unexpected Exception - " + str(e)) self.logger.info(e)
class StreamsBrokerCompatibility(Test): """ These tests validate that Streams v0.10.2+ can connect to older brokers v0.10.1+ and that Streams fails fast for pre-0.10.1 brokers """ input = "brokerCompatibilitySourceTopic" output = "brokerCompatibilitySinkTopic" def __init__(self, test_context): super(StreamsBrokerCompatibility, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.input: {'partitions': 1, 'replication-factor': 1}, self.output: {'partitions': 1, 'replication-factor': 1} }) self.processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka) self.consumer = VerifiableConsumer(test_context, 1, self.kafka, self.output, "stream-broker-compatibility-verify-consumer") def setUp(self): self.zk.start() @parametrize(broker_version=str(DEV_BRANCH)) @parametrize(broker_version=str(LATEST_0_10_1)) def test_compatible_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() self.processor.start() self.consumer.start() self.processor.wait() num_consumed_msgs = self.consumer.total_consumed() self.consumer.stop() self.kafka.stop() assert num_consumed_msgs == 1, \ "Expected to read exactly one message but got %d" % num_consumed_msgs @parametrize(broker_version=str(LATEST_0_10_0)) def test_fail_fast_on_incompatible_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() self.processor.start() self.processor.node.account.ssh(self.processor.start_cmd(self.processor.node)) with self.processor.node.account.monitor_log(self.processor.STDERR_FILE) as monitor: monitor.wait_until('Exception in thread "main" org.apache.kafka.streams.errors.StreamsException: Kafka Streams requires broker version 0.10.1.x or higher.', timeout_sec=60, err_msg="Never saw 'incompatible broker' error message " + str(self.processor.node.account)) self.kafka.stop()
class StreamsStaticMembershipTest(Test): """ Tests static membership when the broker is at the minimum supported version (2.3) or higher. """ input_topic = 'inputTopic' pattern = 'PROCESSED' running_message = 'REBALANCING -> RUNNING' stopped_message = 'Static membership test closed' def __init__(self, test_context): super(StreamsStaticMembershipTest, self).__init__(test_context) self.topics = { self.input_topic: { 'partitions': 18 }, } self.zookeeper = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zookeeper, topics=self.topics) self.producer = VerifiableProducer(self.test_context, 1, self.kafka, self.input_topic, throughput=1000, acks=1) def test_rolling_bounces_will_not_trigger_rebalance_under_static_membership( self): self.zookeeper.start() self.kafka.start() numThreads = 3 processor1 = StaticMemberTestService(self.test_context, self.kafka, "consumer-A", numThreads) processor2 = StaticMemberTestService(self.test_context, self.kafka, "consumer-B", numThreads) processor3 = StaticMemberTestService(self.test_context, self.kafka, "consumer-C", numThreads) processors = [processor1, processor2, processor3] self.producer.start() for processor in processors: processor.CLEAN_NODE_ENABLED = False self.set_topics(processor) verify_running(processor, self.running_message) self.verify_processing(processors) # do several rolling bounces num_bounces = 3 for i in range(0, num_bounces): for processor in processors: verify_stopped(processor, self.stopped_message) verify_running(processor, self.running_message) stable_generation = -1 for processor in processors: generations = extract_generation_from_logs(processor) num_bounce_generations = num_bounces * numThreads assert num_bounce_generations <= len(generations), \ "Expected at least %d generation messages, but got %d" % (num_bounce_generations, len(generations)) for generation in generations[-num_bounce_generations:]: generation = int(generation) if stable_generation == -1: stable_generation = generation assert stable_generation == generation, \ "Rolling bounce caused an unexpected generation bump to %d" % generation self.verify_processing(processors) stop_processors(processors, self.stopped_message) self.producer.stop() self.kafka.stop() self.zookeeper.stop() def verify_processing(self, processors): for processor in processors: with processor.node.account.monitor_log( processor.STDOUT_FILE) as monitor: monitor.wait_until( self.pattern, timeout_sec=60, err_msg="Never saw processing of %s " % self.pattern + str(processor.node.account)) def set_topics(self, processor): processor.INPUT_TOPIC = self.input_topic
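# Illustrative sketch (hypothetical log data, not from a real run) of the stable-generation
# check above: with static membership, rolling bounces should not bump the group generation,
# so the last num_bounces * numThreads generation entries should all be identical.
generations = ["5"] * (3 * 3)  # 3 bounces * 3 threads, all reporting generation 5
stable = {int(g) for g in generations[-(3 * 3):]}
assert len(stable) == 1, "rolling bounce caused an unexpected generation bump"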
class StreamsBrokerCompatibility(Test): """ These tests validate that Streams v0.10.2+ can connect to older brokers v0.10.1+ and that Streams fails fast for pre-0.10.0 brokers """ input = "brokerCompatibilitySourceTopic" output = "brokerCompatibilitySinkTopic" def __init__(self, test_context): super(StreamsBrokerCompatibility, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.input: { 'partitions': 1, 'replication-factor': 1 }, self.output: { 'partitions': 1, 'replication-factor': 1 } }) self.processor = StreamsBrokerCompatibilityService( self.test_context, self.kafka) self.consumer = VerifiableConsumer( test_context, 1, self.kafka, self.output, "stream-broker-compatibility-verify-consumer") def setUp(self): self.zk.start() @parametrize(broker_version=str(DEV_BRANCH)) @parametrize(broker_version=str(LATEST_0_10_1)) def test_compatible_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() self.processor.start() self.consumer.start() self.processor.wait() wait_until( lambda: self.consumer.total_consumed() > 0, timeout_sec=30, err_msg= "Did expect to read a message but got none within 30 seconds.") self.consumer.stop() self.kafka.stop() @parametrize(broker_version=str(LATEST_0_10_0)) def test_fail_fast_on_incompatible_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() self.processor.start() self.processor.node.account.ssh( self.processor.start_cmd(self.processor.node)) with self.processor.node.account.monitor_log( self.processor.STDERR_FILE) as monitor: monitor.wait_until( 'Exception in thread "main" org.apache.kafka.streams.errors.StreamsException: Kafka Streams requires broker version 0.10.1.x or higher.', timeout_sec=60, err_msg="Never saw 'incompatible broker' error message " + str(self.processor.node.account)) self.kafka.stop()
class StreamsBrokerCompatibility(Test): """ These tests validate that - Streams 0.11+ w/ EOS fails fast for older brokers 0.10.2 and 0.10.1 - Streams 0.11+ w/o EOS works for older brokers 0.10.2 and 0.10.1 - Streams fails fast for 0.10.0 brokers - Streams times out for pre-0.10.0 brokers """ input = "brokerCompatibilitySourceTopic" output = "brokerCompatibilitySinkTopic" def __init__(self, test_context): super(StreamsBrokerCompatibility, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.input: {'partitions': 1, 'replication-factor': 1}, self.output: {'partitions': 1, 'replication-factor': 1} }) self.consumer = VerifiableConsumer(test_context, 1, self.kafka, self.output, "stream-broker-compatibility-verify-consumer") def setUp(self): self.zk.start() @parametrize(broker_version=str(LATEST_0_10_2)) @parametrize(broker_version=str(LATEST_0_10_1)) def test_fail_fast_on_incompatible_brokers_if_eos_enabled(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, True) with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor: processor.start() monitor.wait_until('FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: Cannot create a v0 FindCoordinator request because we require features supported only in 1 or later.', timeout_sec=60, err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: Cannot create a v0 FindCoordinator request because we require features supported only in 1 or later.' error message " + str(processor.node.account)) self.kafka.stop() @parametrize(broker_version=str(LATEST_0_11_0)) @parametrize(broker_version=str(LATEST_0_10_2)) @parametrize(broker_version=str(LATEST_0_10_1)) def test_compatible_brokers_eos_disabled(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False) processor.start() self.consumer.start() processor.wait() wait_until(lambda: self.consumer.total_consumed() > 0, timeout_sec=30, err_msg="Expected to read a message but got none within 30 seconds.") self.consumer.stop() self.kafka.stop() @parametrize(broker_version=str(LATEST_0_10_0)) def test_fail_fast_on_incompatible_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False) with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor: processor.start() monitor.wait_until('FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support CREATE_TOPICS', timeout_sec=60, err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support CREATE_TOPICS' error message " + str(processor.node.account)) self.kafka.stop() @ignore @parametrize(broker_version=str(LATEST_0_9)) @parametrize(broker_version=str(LATEST_0_8_2)) def test_timeout_on_pre_010_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False) with processor.node.account.monitor_log(processor.STDERR_FILE) as
monitor: processor.start() monitor.wait_until('Exception in thread "main" org.apache.kafka.streams.errors.BrokerNotFoundException: Could not find any available broker.', timeout_sec=60, err_msg="Never saw 'no available brokers' error message " + str(processor.node.account)) self.kafka.stop()
class StreamsBrokerCompatibility(Test): """ These tests validate that - Streams 0.11+ w/ EOS fails fast for older brokers 0.10.2 and 0.10.1 - Streams 0.11+ w/o EOS works for older brokers 0.10.2 and 0.10.1 - Streams fails fast for 0.10.0 brokers - Streams times out for pre-0.10.0 brokers """ input = "brokerCompatibilitySourceTopic" output = "brokerCompatibilitySinkTopic" def __init__(self, test_context): super(StreamsBrokerCompatibility, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.input: { 'partitions': 1, 'replication-factor': 1 }, self.output: { 'partitions': 1, 'replication-factor': 1 } }) self.consumer = VerifiableConsumer( test_context, 1, self.kafka, self.output, "stream-broker-compatibility-verify-consumer") def setUp(self): self.zk.start() @parametrize(broker_version=str(LATEST_0_10_2)) @parametrize(broker_version=str(LATEST_0_10_1)) def test_fail_fast_on_incompatible_brokers_if_eos_enabled( self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, True) processor.start() processor.node.account.ssh(processor.start_cmd(processor.node)) with processor.node.account.monitor_log( processor.STDERR_FILE) as monitor: monitor.wait_until( 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support LIST_OFFSETS ', timeout_sec=60, err_msg= "Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException: The broker does not support LIST_OFFSETS ' error message " + str(processor.node.account)) self.kafka.stop() @parametrize(broker_version=str(LATEST_0_11_0)) @parametrize(broker_version=str(LATEST_0_10_2)) @parametrize(broker_version=str(LATEST_0_10_1)) def test_compatible_brokers_eos_disabled(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False) processor.start() self.consumer.start() processor.wait() wait_until( lambda: self.consumer.total_consumed() > 0, timeout_sec=30, err_msg= "Expected to read a message but got none within 30 seconds.") self.consumer.stop() self.kafka.stop() @parametrize(broker_version=str(LATEST_0_10_0)) def test_fail_fast_on_incompatible_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False) processor.start() processor.node.account.ssh(processor.start_cmd(processor.node)) with processor.node.account.monitor_log( processor.STDERR_FILE) as monitor: monitor.wait_until( 'FATAL: An unexpected exception org.apache.kafka.streams.errors.StreamsException: Could not create internal topics.', timeout_sec=60, err_msg= "Never saw 'FATAL: An unexpected exception org.apache.kafka.streams.errors.StreamsException: Could not create internal topics.' 
error message " + str(processor.node.account)) self.kafka.stop() @ignore @parametrize(broker_version=str(LATEST_0_9)) @parametrize(broker_version=str(LATEST_0_8_2)) def test_timeout_on_pre_010_brokers(self, broker_version): self.kafka.set_version(KafkaVersion(broker_version)) self.kafka.start() processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka, False) processor.start() processor.node.account.ssh(processor.start_cmd(processor.node)) with processor.node.account.monitor_log( processor.STDERR_FILE) as monitor: monitor.wait_until( 'Exception in thread "main" org.apache.kafka.streams.errors.BrokerNotFoundException: Could not find any available broker.', timeout_sec=60, err_msg="Never saw 'no available brokers' error message " + str(processor.node.account)) self.kafka.stop()
class StreamsOptimizedTest(Test): """ Test doing upgrades of a Kafka Streams application that is un-optimized initially then optimized """ input_topic = 'inputTopic' aggregation_topic = 'aggregationTopic' reduce_topic = 'reduceTopic' join_topic = 'joinTopic' operation_pattern = 'AGGREGATED\|REDUCED\|JOINED' stopped_message = 'OPTIMIZE_TEST Streams Stopped' def __init__(self, test_context): super(StreamsOptimizedTest, self).__init__(test_context) self.topics = { self.input_topic: { 'partitions': 6 }, self.aggregation_topic: { 'partitions': 6 }, self.reduce_topic: { 'partitions': 6 }, self.join_topic: { 'partitions': 6 } } self.zookeeper = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zookeeper, topics=self.topics) self.producer = VerifiableProducer(self.test_context, 1, self.kafka, self.input_topic, throughput=1000, acks=1) def test_upgrade_optimized_topology(self): self.zookeeper.start() self.kafka.start() processor1 = StreamsOptimizedUpgradeTestService( self.test_context, self.kafka) processor2 = StreamsOptimizedUpgradeTestService( self.test_context, self.kafka) processor3 = StreamsOptimizedUpgradeTestService( self.test_context, self.kafka) processors = [processor1, processor2, processor3] self.logger.info("produce records continually during the test") self.producer.start() self.logger.info("start all processors unoptimized") for processor in processors: self.set_topics(processor) processor.CLEAN_NODE_ENABLED = False self.verify_running_repartition_topic_count(processor, 4) self.logger.info("verify unoptimized") self.verify_processing(processors, verify_individual_operations=False) self.logger.info("stop unoptimized") stop_processors(processors, self.stopped_message) self.logger.info("reset") self.reset_application() for processor in processors: processor.node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + ".1", allow_fail=False) processor.node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + ".1", allow_fail=False) processor.node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + ".1", allow_fail=False) processor.node.account.ssh("mv " + processor.CONFIG_FILE + " " + processor.CONFIG_FILE + ".1", allow_fail=False) self.logger.info("start again with topology optimized") for processor in processors: processor.OPTIMIZED_CONFIG = 'all' self.verify_running_repartition_topic_count(processor, 1) self.logger.info("verify optimized") self.verify_processing(processors, verify_individual_operations=True) self.logger.info("stop optimized") stop_processors(processors, self.stopped_message) self.logger.info("teardown") self.producer.stop() self.kafka.stop() self.zookeeper.stop() def reset_application(self): resetter = StreamsResetter(self.test_context, self.kafka, topic=self.input_topic, applicationId='StreamsOptimizedTest') resetter.start() # resetter is not long-term running but it would be better to check the pid by stopping it resetter.stop() @staticmethod def verify_running_repartition_topic_count(processor, repartition_topic_count): node = processor.node with node.account.monitor_log(processor.STDOUT_FILE) as monitor: processor.start() monitor.wait_until( 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count, timeout_sec=120, err_msg= "Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message " % repartition_topic_count + str(processor.node.account)) def verify_processing(self, processors, 
verify_individual_operations): # This test previously had logic to account for skewed assignments, in which not all processors may # receive active assignments. I don't think this will happen anymore, but keep an eye out if we see # test failures here. If that does resurface, note that the prior implementation was not correct. # A better approach would be to make sure we see processing of each partition across the whole cluster # instead of just expecting to see each node perform some processing. for processor in processors: if verify_individual_operations: for operation in self.operation_pattern.split('\|'): self.do_verify(processor, operation) else: self.do_verify(processor, self.operation_pattern) def do_verify(self, processor, pattern): self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern) self.logger.info( list( processor.node.account.ssh_capture("ls -lh %s" % (processor.STDOUT_FILE), allow_fail=True))) wait_until(lambda: processor.node.account.ssh( "grep --max-count 1 '%s' %s" % (pattern, processor.STDOUT_FILE), allow_fail=True) == 0, timeout_sec=60) def set_topics(self, processor): processor.INPUT_TOPIC = self.input_topic processor.AGGREGATION_TOPIC = self.aggregation_topic processor.REDUCE_TOPIC = self.reduce_topic processor.JOIN_TOPIC = self.join_topic
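# The optimization check above boils down to the repartition-topic count reported in the
# 'REBALANCING -> RUNNING' log line: 4 repartition topics for the unoptimized topology and
# 1 once OPTIMIZED_CONFIG = 'all' is applied (values taken from the test).
expected_repartition_topic_count = {"unoptimized": 4, "optimized": 1}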