class TestUpgrade(ProduceConsumeValidateTest):

    def __init__(self, test_context):
        super(TestUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=LATEST_0_8_2,
                                  topics={self.topic: {"partitions": 3,
                                                       "replication-factor": 3,
                                                       'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput, version=LATEST_0_8_2)

        # TODO - reduce the timeout
        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=30000, message_validator=is_int, version=LATEST_0_8_2)

    def perform_upgrade(self):
        self.logger.info("First pass bounce - rolling upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = TRUNK
            node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] = "0.8.2.X"
            self.kafka.start_node(node)

        self.logger.info("Second pass bounce - remove inter.broker.protocol.version config")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION]
            self.kafka.start_node(node)

    def test_upgrade(self):
        """Test upgrade of Kafka broker cluster from 0.8.2 to 0.9.0

        - Start 3 node broker cluster on version 0.8.2
        - Start producer and consumer in the background
        - Perform two-phase rolling upgrade
            - First phase: upgrade brokers to 0.9.0 with inter.broker.protocol.version set to 0.8.2.X
            - Second phase: remove inter.broker.protocol.version config with rolling bounce
        - Finally, validate that every message acked by the producer was consumed by the consumer
        """
        self.run_produce_consume_validate(core_test_action=self.perform_upgrade)
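
# Hedged sketch, not the real ProduceConsumeValidateTest implementation: the
# produce/consume/validate flow that perform_upgrade() above plugs into as
# core_test_action. Helper names below are illustrative assumptions.
def run_produce_consume_validate_sketch(test, core_test_action):
    test.producer.start()        # produce in the background
    test.consumer.start()        # consume in the background
    core_test_action()           # e.g. the two-phase rolling upgrade above
    test.producer.stop()
    test.consumer.wait()         # consumer_timeout_ms ends consumption once idle
    acked = set(test.producer.acked)
    consumed = set(test.consumer.messages_consumed[1])
    missing = acked - consumed
    assert not missing, "Acked messages were not consumed: %s" % missing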
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than specified timeouts in configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"
    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {'partitions': 3, 'replication-factor': 1},
                                      self.outputTopic: {'partitions': 1, 'replication-factor': 1}
                                  })

    def get_consumer(self, num_messages):
        return VerifiableConsumer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=num_messages)

    def get_producer(self, num_messages):
        return VerifiableProducer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.inputTopic,
                                  max_messages=num_messages,
                                  acks=1)

    def assert_produce_consume(self, test_state, num_messages=5):
        producer = self.get_producer(num_messages)
        producer.start()

        wait_until(lambda: producer.num_acked >= num_messages,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer(num_messages)
        consumer.start()

        wait_until(lambda: consumer.total_consumed() >= num_messages,
                   timeout_sec=60,
                   err_msg="At %s streams did not process messages in 60 seconds " % test_state)

    @staticmethod
    def get_configs(extra_configs=""):
        # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # java code expects configs in key=value,key=value format
        updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms + extra_configs

        return updated_configs

    def wait_for_verification(self, processor, message, file, num_lines=1):
        wait_until(lambda: self.verify_from_file(processor, message, file) >= num_lines,
                   timeout_sec=60,
                   err_msg="Expected to read '%s' from %s" % (message, processor.node.account))

    @staticmethod
    def verify_from_file(processor, message, file):
        result = processor.node.account.ssh_output("grep '%s' %s | wc -l" % (message, file), allow_fail=False)
        return int(result)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Broker should be down over 2x of retries * timeout ms
        # So with (2 * 15000) = 30 seconds, we'll set downtime to 70 seconds
        broker_down_time_in_seconds = 70

        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, self.get_configs())
        processor.start()

        # until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down
        # After KIP-91 is merged we'll continue to send messages the duration of the test
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)

        self.kafka.stop_node(node)

        time.sleep(broker_down_time_in_seconds)

        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()

    def test_streams_runs_with_broker_down_initially(self):
        self.kafka.start()
        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        configs = self.get_configs(extra_configs=",application.id=starting_wo_broker_id")

        # start streams with broker down initially
        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_3.start()

        broker_unavailable_message = "Broker may not be available"

        # verify streams instances unable to connect to broker, kept trying
        self.wait_for_verification(processor, broker_unavailable_message, processor.LOG_FILE, 100)
        self.wait_for_verification(processor_2, broker_unavailable_message, processor_2.LOG_FILE, 100)
        self.wait_for_verification(processor_3, broker_unavailable_message, processor_3.LOG_FILE, 100)

        # now start broker
        self.kafka.start_node(node)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("running_with_broker_down_initially", num_messages=9)

        message = "processed3messages"
        # need to show all 3 instances processed messages
        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message, processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message, processor_3.STDOUT_FILE)

        self.kafka.stop()

    def test_streams_should_scale_in_while_brokers_down(self):
        self.kafka.start()

        configs = self.get_configs(extra_configs=",application.id=shutdown_with_broker_down")

        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        processor_3.start()

        # need to wait for rebalance once
        self.wait_for_verification(processor_3, "State transition from REBALANCING to RUNNING", processor_3.LOG_FILE)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("waiting for rebalance to complete", num_messages=9)

        message = "processed3messages"

        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message, processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message, processor_3.STDOUT_FILE)

        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        processor.stop()
        processor_2.stop()

        shutdown_message = "Complete shutdown of streams resilience test app now"
        self.wait_for_verification(processor, shutdown_message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, shutdown_message, processor_2.STDOUT_FILE)

        self.kafka.start_node(node)

        self.assert_produce_consume("sending_message_after_stopping_streams_instance_bouncing_broker", num_messages=9)

        self.wait_for_verification(processor_3, "processed9messages", processor_3.STDOUT_FILE)

        self.kafka.stop()
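
# Hedged helper sketch, not part of the original test: spells out the timeout
# arithmetic that get_configs() and the 70-second broker downtime rely on.
# The numbers mirror the configs above; the function name is illustrative.
def check_resilience_timeouts(max_poll_interval_ms=50000, retries=2,
                              request_timeout_ms=15000, max_block_ms=30000,
                              broker_down_time_in_seconds=70):
    # the producer gives up after min(max.block.ms, (retries + 1) * request.timeout.ms)
    producer_give_up_ms = min(max_block_ms, (retries + 1) * request_timeout_ms)
    assert max_poll_interval_ms > producer_give_up_ms                     # 50000 > 30000
    # the broker stays down for more than 2x the time the producer keeps retrying
    assert broker_down_time_in_seconds * 1000 > 2 * producer_give_up_ms   # 70000 > 60000

check_resilience_timeouts()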
class LogDirFailureTest(ProduceConsumeValidateTest): """ Note that consuming is a bit tricky, at least with console consumer. The goal is to consume all messages (foreach partition) in the topic. In this case, waiting for the last message may cause the consumer to stop too soon since console consumer is consuming multiple partitions from a single thread and therefore we lose ordering guarantees. Waiting on a count of consumed messages can be unreliable: if we stop consuming when num_consumed == num_acked, we might exit early if some messages are duplicated (though not an issue here since producer retries==0) Therefore rely here on the consumer.timeout.ms setting which times out on the interval between successively consumed messages. Since we run the producer to completion before running the consumer, this is a reliable indicator that nothing is left to consume. """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(LogDirFailureTest, self).__init__(test_context=test_context) self.topic1 = "test_topic_1" self.topic2 = "test_topic_2" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk, topics={ self.topic1: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 1}}, self.topic2: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 2}} }, # Set log.roll.ms to 3 seconds so that broker will detect disk error sooner when it creates log segment # Otherwise broker will still be able to read/write the log file even if the log directory is inaccessible. server_prop_overides=[ [config_property.OFFSETS_TOPIC_NUM_PARTITIONS, "1"], [config_property.LOG_FLUSH_INTERVAL_MESSAGE, "5"], [config_property.REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS, "60000"], [config_property.LOG_ROLL_TIME_MS, "3000"] ]) self.producer_throughput = 1000 self.num_producers = 1 self.num_consumers = 1 def setUp(self): self.zk.start() def min_cluster_size(self): """Override this since we're adding services outside of the constructor""" return super(LogDirFailureTest, self).min_cluster_size() + self.num_producers * 2 + self.num_consumers * 2 @cluster(num_nodes=9) @matrix(bounce_broker=[False, True], broker_type=["leader", "follower"], security_protocol=["PLAINTEXT"]) def test_replication_with_disk_failure(self, bounce_broker, security_protocol, broker_type): """Replication tests. These tests verify that replication provides simple durability guarantees by checking that data acked by brokers is still available for consumption in the face of various failure scenarios. 
Setup: 1 zk, 3 kafka nodes, 1 topic with partitions=3, replication-factor=3, and min.insync.replicas=2 and another topic with partitions=3, replication-factor=3, and min.insync.replicas=1 - Produce messages in the background - Consume messages in the background - Drive broker failures (shutdown, or bounce repeatedly with kill -15 or kill -9) - When done driving failures, stop producing, and finish consuming - Validate that every acked message was consumed """ self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.start() try: # Initialize producer/consumer for topic2 self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic2, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic2, group_id="test-consumer-group-1", consumer_timeout_ms=60000, message_validator=is_int) self.start_producer_and_consumer() # Get a replica of the partition of topic2 and make its log directory offline by changing the log dir's permission. # We assume that partition of topic2 is created in the second log directory of respective brokers. broker_node = select_node(self, broker_type, self.topic2) broker_idx = self.kafka.idx(broker_node) assert broker_idx in self.kafka.isr_idx_list(self.topic2), \ "Broker %d should be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic2))) # Verify that topic1 and the consumer offset topic is in the first log directory and topic2 is in the second log directory topic_1_partition_0 = KafkaService.DATA_LOG_DIR_1 + "/test_topic_1-0" topic_2_partition_0 = KafkaService.DATA_LOG_DIR_2 + "/test_topic_2-0" offset_topic_partition_0 = KafkaService.DATA_LOG_DIR_1 + "/__consumer_offsets-0" for path in [topic_1_partition_0, topic_2_partition_0, offset_topic_partition_0]: assert path_exists(broker_node, path), "%s should exist" % path self.logger.debug("Making log dir %s inaccessible" % (KafkaService.DATA_LOG_DIR_2)) cmd = "chmod a-w %s -R" % (KafkaService.DATA_LOG_DIR_2) broker_node.account.ssh(cmd, allow_fail=False) if bounce_broker: self.kafka.restart_node(broker_node, clean_shutdown=True) # Verify the following: # 1) The broker with offline log directory is not the leader of the partition of topic2 # 2) The broker with offline log directory is not in the ISR # 3) The broker with offline log directory is still online # 4) Messages can still be produced and consumed from topic2 wait_until(lambda: self.kafka.leader(self.topic2, partition=0) != broker_node, timeout_sec=60, err_msg="Broker %d should not be leader of topic %s and partition 0" % (broker_idx, self.topic2)) assert self.kafka.alive(broker_node), "Broker %d should be still online" % (broker_idx) wait_until(lambda: broker_idx not in self.kafka.isr_idx_list(self.topic2), timeout_sec=60, err_msg="Broker %d should not be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic2)))) self.stop_producer_and_consumer() self.validate() # Shutdown all other brokers so that the broker with offline log dir is the only online broker offline_nodes = [] for node in self.kafka.nodes: if broker_node != node: offline_nodes.append(node) self.logger.debug("Hard shutdown broker %d" % (self.kafka.idx(node))) self.kafka.stop_node(node) # Verify the following: # 1) The broker with offline directory is the only in-sync broker of the partition of topic1 # 2) Messages can still be produced and consumed from topic1 self.producer = 
VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic1, throughput=self.producer_throughput, offline_nodes=offline_nodes) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic1, group_id="test-consumer-group-2", consumer_timeout_ms=90000, message_validator=is_int) self.consumer_start_timeout_sec = 90 self.start_producer_and_consumer() assert self.kafka.isr_idx_list(self.topic1) == [broker_idx], \ "In-sync replicas of topic %s and partition 0 should be %s" % (self.topic1, str([broker_idx])) self.stop_producer_and_consumer() self.validate() except BaseException as e: for s in self.test_context.services: self.mark_for_collect(s) raise
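
# Hedged sketch of the path_exists(node, path) helper asserted on above; it is
# not defined in this snippet. An assumed implementation checks the path over ssh.
def path_exists(node, path):
    return node.account.ssh("ls %s" % path, allow_fail=True) == 0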
class LogDirFailureTest(ProduceConsumeValidateTest): """ Note that consuming is a bit tricky, at least with console consumer. The goal is to consume all messages (foreach partition) in the topic. In this case, waiting for the last message may cause the consumer to stop too soon since console consumer is consuming multiple partitions from a single thread and therefore we lose ordering guarantees. Waiting on a count of consumed messages can be unreliable: if we stop consuming when num_consumed == num_acked, we might exit early if some messages are duplicated (though not an issue here since producer retries==0) Therefore rely here on the consumer.timeout.ms setting which times out on the interval between successively consumed messages. Since we run the producer to completion before running the consumer, this is a reliable indicator that nothing is left to consume. """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(LogDirFailureTest, self).__init__(test_context=test_context) self.topic1 = "test_topic_1" self.topic2 = "test_topic_2" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService( test_context, num_nodes=3, zk=self.zk, topics={ self.topic1: { "partitions": 1, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.topic2: { "partitions": 1, "replication-factor": 3, "configs": { "min.insync.replicas": 1 } } }, # Set log.roll.ms to 3 seconds so that broker will detect disk error sooner when it creates log segment # Otherwise broker will still be able to read/write the log file even if the log directory is inaccessible. server_prop_overides=[ [config_property.LOG_FLUSH_INTERVAL_MESSAGE, "5"], [ config_property. REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS, "60000" ], [config_property.LOG_ROLL_TIME_MS, "3000"] ]) self.producer_throughput = 1000 self.num_producers = 1 self.num_consumers = 1 def setUp(self): self.zk.start() def min_cluster_size(self): """Override this since we're adding services outside of the constructor""" return super(LogDirFailureTest, self).min_cluster_size( ) + self.num_producers * 2 + self.num_consumers * 2 @cluster(num_nodes=9) @matrix(bounce_broker=[False, True], broker_type=["leader", "follower"], security_protocol=["PLAINTEXT"]) def test_replication_with_disk_failure(self, bounce_broker, security_protocol, broker_type): """Replication tests. These tests verify that replication provides simple durability guarantees by checking that data acked by brokers is still available for consumption in the face of various failure scenarios. 
Setup: 1 zk, 3 kafka nodes, 1 topic with partitions=3, replication-factor=3, and min.insync.replicas=2 and another topic with partitions=3, replication-factor=3, and min.insync.replicas=1 - Produce messages in the background - Consume messages in the background - Drive broker failures (shutdown, or bounce repeatedly with kill -15 or kill -9) - When done driving failures, stop producing, and finish consuming - Validate that every acked message was consumed """ self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.start() try: # Initialize producer/consumer for topic1 self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic1, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic1, group_id="test-consumer-group-1", new_consumer=False, consumer_timeout_ms=60000, message_validator=is_int) self.start_producer_and_consumer() # Get a replica of the partition of topic1 and make its first log directory offline by changing the log dir's permission. # We assume that partition of topic1 is created in the first log directory of respective brokers. broker_node = select_node(self, broker_type, self.topic1) broker_idx = self.kafka.idx(broker_node) assert broker_idx in self.kafka.isr_idx_list(self.topic1), \ "Broker %d should be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic1))) self.logger.debug("Making log dir %s inaccessible" % (KafkaService.DATA_LOG_DIR_1)) cmd = "chmod a-w %s -R" % (KafkaService.DATA_LOG_DIR_1) broker_node.account.ssh(cmd, allow_fail=False) if bounce_broker: self.kafka.restart_node(broker_node, clean_shutdown=True) # Verify the following: # 1) The broker with offline log directory is not the leader of the partition of topic1 # 2) The broker with offline log directory is not in the ISR # 3) The broker with offline log directory is still online # 4) Messages can still be produced and consumed from topic1 wait_until( lambda: self.kafka.leader(self.topic1, partition=0 ) != broker_node, timeout_sec=60, err_msg= "Broker %d should not be leader of topic %s and partition 0" % (broker_idx, self.topic1)) assert self.kafka.alive( broker_node), "Broker %d should be still online" % (broker_idx) wait_until( lambda: broker_idx not in self.kafka.isr_idx_list(self.topic1), timeout_sec=60, err_msg="Broker %d should not be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic1)))) self.stop_producer_and_consumer() self.validate() # Shutdown all other brokers so that the broker with offline log dir is the only online broker offline_nodes = [] for node in self.kafka.nodes: if broker_node != node: offline_nodes.append(node) self.logger.debug("Hard shutdown broker %d" % (self.kafka.idx(node))) self.kafka.stop_node(node) # Verify the following: # 1) The broker with offline directory is the only in-sync broker of the partition of topic2 # 2) Messages can still be produced and consumed from topic2 self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic2, throughput=self.producer_throughput, offline_nodes=offline_nodes) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic2, group_id="test-consumer-group-2", new_consumer=False, consumer_timeout_ms=60000, message_validator=is_int) self.start_producer_and_consumer() assert self.kafka.isr_idx_list(self.topic2) == [broker_idx], \ "In-sync replicas of topic %s and 
partition 0 should be %s" % (self.topic2, str([broker_idx])) self.stop_producer_and_consumer() self.validate() except BaseException as e: for s in self.test_context.services: self.mark_for_collect(s) raise
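
# Hedged sketch of the select_node(test, broker_type, topic) helper used by both
# variants of this test; it is not defined in this snippet. Assumed behaviour:
# return the leader of partition 0, or one of its followers. With 3 brokers and
# replication-factor 3 every broker hosts a replica, so any non-leader node is a
# valid follower here.
def select_node(test, broker_type, topic):
    leader = test.kafka.leader(topic, partition=0)
    if broker_type == "leader":
        return leader
    elif broker_type == "follower":
        return next(node for node in test.kafka.nodes if node != leader)
    else:
        raise Exception("Unexpected broker type %s" % broker_type)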
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 100000 self.transaction_size = 750 self.first_transactional_id = "my-first-transactional-id" self.second_transactional_id = "my-second-transactional-id" self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, topics={ self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } }) def setUp(self): self.zk.start() def seed_messages(self): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=self.input_topic, message_validator=is_int, max_messages=self.num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_output_topic(self): consumer = self.start_consumer(self.output_topic, group_id="verifying_consumer") return self.drain_consumer(consumer) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self .kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of \ hard-killed broker %s" % str(node.account)) self.kafka.start_node(node) def create_and_start_message_copier(self, input_partition, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=self.input_topic, input_partition=input_partition, output_topic=self.output_topic, max_messages=-1, transaction_size=self.transaction_size) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=30, err_msg="%s : Message copier didn't make enough progress in 30s. 
Current progress: %s" \ % (copier.transactional_id, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self): copiers = [] copiers.append( self.create_and_start_message_copier( input_partition=0, transactional_id=self.first_transactional_id)) copiers.append( self.create_and_start_message_copier( input_partition=1, transactional_id=self.second_transactional_id)) return copiers def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, new_consumer=True, message_validator=is_int, from_beginning=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" %\ 60) return consumer def drain_consumer(self, consumer): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= self.num_seed_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" %\ (len(consumer.messages_consumed[1]), self.num_seed_messages, 90)) consumer.stop() return consumer.messages_consumed[1] def copy_messages_transactionally(self, failure_mode, bounce_target): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers() concurrent_consumer = self.start_consumer( self.output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=60, err_msg="%s - Failed to copy all messages in %ds." 
%\ (copier.transactional_id, 60)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer) @cluster(num_nodes=9) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"]) def test_transactions(self, failure_mode, bounce_target): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"][ "collect_default"] = True self.kafka.start() input_messages = self.seed_messages() concurrently_consumed_messages = self.copy_messages_transactionally( failure_mode, bounce_target) output_messages = self.get_messages_from_output_topic() concurrently_consumed_message_set = set(concurrently_consumed_messages) output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs( len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\ (len(input_message_set), len(concurrently_consumed_message_set))
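
# Hedged, Python-flavoured sketch of the loop a TransactionalMessageCopier-style
# client runs; the real copier is a separate Java tool, so all names below are
# illustrative rather than its actual API.
def copy_transactionally_sketch(consumer, producer, consumer_group, transaction_size):
    producer.init_transactions()
    while True:
        records = consumer.poll()
        if not records:
            continue
        producer.begin_transaction()
        for record in records[:transaction_size]:
            producer.send(record)
        # committing the consumed offsets inside the same transaction is what makes
        # the copy exactly-once: a crash never leaves a committed batch without its
        # offsets, or committed offsets without the batch
        producer.send_offsets_to_transaction(consumer.position(), consumer_group)
        producer.commit_transaction()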
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest):
    """Tests a rolling upgrade from PLAINTEXT to a secured cluster
    """

    def __init__(self, test_context):
        super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  topics={self.topic: {
                                      "partitions": 3,
                                      "replication-factor": 3,
                                      'configs': {"min.insync.replicas": 2}}})
        self.zk.start()

        # reduce replica.lag.time.max.ms due to KAFKA-2827
        self.kafka.replica_lag = 2000

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True)

        self.consumer.group_id = "unique-test-group-" + str(random.random())

    def bounce(self):
        # Sleeps reduce the intermittent failures reported in KAFKA-2891. Should be removed once resolved.
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            time.sleep(10)
            self.kafka.start_node(node)
            time.sleep(10)

    def roll_in_secured_settings(self, upgrade_protocol):
        self.kafka.interbroker_security_protocol = upgrade_protocol

        # Roll cluster to include inter broker security protocol.
        self.kafka.open_port(upgrade_protocol)
        self.bounce()

        # Roll cluster to disable PLAINTEXT port
        self.kafka.close_port('PLAINTEXT')
        self.bounce()

    def open_secured_port(self, upgrade_protocol):
        self.kafka.security_protocol = upgrade_protocol
        self.kafka.open_port(upgrade_protocol)
        self.kafka.start_minikdc()
        self.bounce()

    @matrix(upgrade_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"])
    def test_rolling_upgrade_phase_one(self, upgrade_protocol):
        """
        Start with a PLAINTEXT cluster, open a SECURED port via a rolling upgrade, ensuring we can produce
        and consume throughout over PLAINTEXT. Finally check we can produce and consume via the new secured port.
        """
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.security_protocol = "PLAINTEXT"
        self.kafka.start()

        # Create PLAINTEXT producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, opening a secure protocol, ensuring the PLAINTEXT producer/consumer continues to run
        self.run_produce_consume_validate(self.open_secured_port, upgrade_protocol)

        # Now we can produce and consume via the secured port
        self.kafka.security_protocol = upgrade_protocol
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @matrix(upgrade_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"])
    def test_rolling_upgrade_phase_two(self, upgrade_protocol):
        """
        Start with a PLAINTEXT cluster with a second secured port open (i.e. the result of phase one).
        Start a producer and consumer via the SECURED port.
        Rolling upgrade to make the secure protocol the inter-broker protocol.
        Rolling upgrade again to disable PLAINTEXT.
        Ensure the producer and consumer ran throughout.
        """
        # Given we have a broker that has both secure and PLAINTEXT ports open
        self.kafka.security_protocol = upgrade_protocol
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.start()

        # Create secured producer and consumer
        self.create_producer_and_consumer()

        # Roll in the security protocol. Disable PLAINTEXT.
        # Ensure we can produce and consume throughout.
        self.run_produce_consume_validate(self.roll_in_secured_settings, upgrade_protocol)
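
# Hedged illustration, not part of the test: the listener states the two rolling
# phases above walk through, for a target protocol such as SASL_SSL.
upgrade_phases = [
    {"phase": "initial",           "client": "PLAINTEXT", "interbroker": "PLAINTEXT", "open_ports": ["PLAINTEXT"]},
    {"phase": "open_secured_port", "client": "SASL_SSL",  "interbroker": "PLAINTEXT", "open_ports": ["PLAINTEXT", "SASL_SSL"]},
    {"phase": "roll_interbroker",  "client": "SASL_SSL",  "interbroker": "SASL_SSL",  "open_ports": ["PLAINTEXT", "SASL_SSL"]},
    {"phase": "close_plaintext",   "client": "SASL_SSL",  "interbroker": "SASL_SSL",  "open_ports": ["SASL_SSL"]},
]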
class ReplicaScaleTest(Test):
    def __init__(self, test_context):
        super(ReplicaScaleTest, self).__init__(test_context=test_context)
        self.test_context = test_context
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=8, zk=self.zk)

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        # Need to increase the timeout due to partition count
        for node in self.kafka.nodes:
            self.kafka.stop_node(node, clean_shutdown=False, timeout_sec=60)
        self.kafka.stop()
        self.zk.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_produce_consume(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "replicas_produce_consume_%d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)
        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka)
        consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka)
        trogdor = TrogdorService(context=self.test_context,
                                 client_services=[self.kafka, producer_workload_service, consumer_workload_service])
        trogdor.start()

        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                producer_workload_service.producer_node,
                                                producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics={"replicas_produce_consume_[0-2]": {
                                                    "numPartitions": partition_count,
                                                    "replicationFactor": replication_factor
                                                }})
        produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed produce bench")

        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                consumer_workload_service.consumer_node,
                                                consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["replicas_produce_consume_[0-2]"])
        consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed consume bench")

        trogdor.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_clean_bounce(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)
        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        restart_times = []
        for node in self.kafka.nodes:
            broker_bounce_start_time = time.time()
            self.kafka.stop_node(node, clean_shutdown=True, timeout_sec=600)
            self.kafka.start_node(node, timeout_sec=600)
            broker_bounce_end_time = time.time()
            restart_times.append(broker_bounce_end_time - broker_bounce_start_time)
            self.logger.info("Time to restart %s: %d" % (node.name, broker_bounce_end_time - broker_bounce_start_time))

        self.logger.info("Restart times: %s" % restart_times)

        delete_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            self.logger.info("Deleting topic %s" % topic)
            self.kafka.delete_topic(topic)
        delete_end_time = time.time()
        self.logger.info("Time to delete topics: %d" % (delete_end_time - delete_start_time))
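
# Hedged refactoring sketch, not part of the test: the repeated start/end time
# bookkeeping above could be wrapped in a small context manager.
import time
from contextlib import contextmanager

@contextmanager
def timed(logger, label):
    start = time.time()
    yield
    logger.info("Time to %s: %d" % (label, time.time() - start))

# hypothetical usage:
#   with timed(self.logger, "create topics"):
#       for i in range(topic_count):
#           self.kafka.create_topic(topic_cfg)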
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than specified timeouts in configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"
    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {'partitions': 1, 'replication-factor': 1},
                                      self.outputTopic: {'partitions': 1, 'replication-factor': 1}
                                  })

    def get_consumer(self):
        return VerifiableConsumer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=self.num_messages)

    def get_producer(self):
        return VerifiableProducer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.inputTopic,
                                  max_messages=self.num_messages,
                                  acks=1)

    def assert_produce_consume(self, test_state):
        producer = self.get_producer()
        producer.start()

        wait_until(lambda: producer.num_acked > 0,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer()
        consumer.start()

        wait_until(lambda: consumer.total_consumed() > 0,
                   timeout_sec=120,
                   err_msg="At %s streams did not process messages in 120 seconds " % test_state)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # Broker should be down over 2x of retries * timeout ms
        # So with (2 * 15000) = 30 seconds, we'll set downtime to 70 seconds
        broker_down_time_in_seconds = 70

        # java code expects configs in key=value,key=value format
        updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms

        processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, updated_configs)
        processor.start()

        # until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down
        # After KIP-91 is merged we'll continue to send messages the duration of the test
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)

        self.kafka.stop_node(node)

        time.sleep(broker_down_time_in_seconds)

        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()
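
# Hedged sketch, not part of the test: the Java driver expects configs as
# "key=value,key=value", so the string concatenation above could equivalently be
# built from a dict. Names below are illustrative.
def format_streams_configs(configs):
    return ",".join("%s=%s" % (k, v) for k, v in configs.items())

updated_configs = format_streams_configs({
    "consumer.max.poll.interval.ms": 50000,
    "producer.retries": 2,
    "producer.request.timeout.ms": 15000,
    "producer.max.block.ms": 30000,
})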
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 100000 self.transaction_size = 750 self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk) def setUp(self): self.zk.start() def seed_messages(self, topic, num_seed_messages): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic, message_validator=is_int, max_messages=num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_topic(self, topic, num_messages): consumer = self.start_consumer(topic, group_id="verifying_consumer") return self.drain_consumer(consumer, num_messages) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown = True) else: self.kafka.stop_node(node, clean_shutdown = False) wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self.kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of \ hard-killed broker %s" % str(node.account)) self.kafka.start_node(node) def create_and_start_message_copier(self, input_topic, input_partition, output_topic, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=input_topic, input_partition=input_partition, output_topic=output_topic, max_messages=-1, transaction_size=self.transaction_size ) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=30, err_msg="%s : Message copier didn't make enough progress in 30s. 
Current progress: %s" \ % (copier.transactional_id, str(copier.progress_percent()))) self.logger.info("%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self, input_topic, output_topic, num_copiers): copiers = [] for i in range(0, num_copiers): copiers.append(self.create_and_start_message_copier( input_topic=input_topic, output_topic=output_topic, input_partition=i, transactional_id="copier-" + str(i) )) return copiers def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, message_validator=is_int, from_beginning=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" %\ 60) return consumer def drain_consumer(self, consumer, num_messages): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" %\ (len(consumer.messages_consumed[1]), num_messages, 90)) consumer.stop() return consumer.messages_consumed[1] def copy_messages_transactionally(self, failure_mode, bounce_target, input_topic, output_topic, num_copiers, num_messages_to_copy): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers(input_topic=input_topic, output_topic=output_topic, num_copiers=num_copiers) concurrent_consumer = self.start_consumer(output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=120, err_msg="%s - Failed to copy all messages in %ds." 
%\ (copier.transactional_id, 120)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer, num_messages_to_copy) def setup_topics(self): self.kafka.topics = { self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } } @cluster(num_nodes=9) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"], check_order=[True, False]) def test_transactions(self, failure_mode, bounce_target, check_order): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data_1"]["collect_default"] = True self.kafka.logs["kafka_data_2"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"]["collect_default"] = True if check_order: # To check ordering, we simply create input and output topics # with a single partition. # We reduce the number of seed messages to copy to account for the fewer output # partitions, and thus lower parallelism. This helps keep the test # time shorter. self.num_seed_messages = self.num_seed_messages / 3 self.num_input_partitions = 1 self.num_output_partitions = 1 self.setup_topics() self.kafka.start() input_messages = self.seed_messages(self.input_topic, self.num_seed_messages) concurrently_consumed_messages = self.copy_messages_transactionally( failure_mode, bounce_target, input_topic=self.input_topic, output_topic=self.output_topic, num_copiers=self.num_input_partitions, num_messages_to_copy=self.num_seed_messages) output_messages = self.get_messages_from_topic(self.output_topic, self.num_seed_messages) concurrently_consumed_message_set = set(concurrently_consumed_messages) output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs(len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\ (len(input_message_set), len(concurrently_consumed_message_set)) if check_order: assert input_messages == sorted(input_messages), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
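
# Hedged debugging aid, not part of the test: when the duplicate/equality
# assertions above fail, counts alone are hard to act on. This sketch names the
# offending messages instead.
from collections import Counter

def diff_messages(input_messages, output_messages):
    duplicates = [m for m, n in Counter(output_messages).items() if n > 1]
    missing = set(input_messages) - set(output_messages)
    unexpected = set(output_messages) - set(input_messages)
    return {"duplicates": duplicates, "missing": missing, "unexpected": unexpected}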
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic. """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 100000 self.transaction_size = 750 # The transaction timeout should be lower than the progress timeout, but at # least as high as the request timeout (which is 30s by default). When the # client is hard-bounced, progress may depend on the previous transaction # being aborted. When the broker is hard-bounced, we may have to wait as # long as the request timeout to get a `Produce` response and we do not # want the coordinator timing out the transaction. self.transaction_timeout = 40000 self.progress_timeout_sec = 60 self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, controller_num_nodes_override=1) def setUp(self): if self.zk: self.zk.start() def seed_messages(self, topic, num_seed_messages): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic, message_validator=is_int, max_messages=num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." 
%\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_topic(self, topic, num_messages): consumer = self.start_consumer(topic, group_id="verifying_consumer") return self.drain_consumer(consumer, num_messages) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) gracePeriodSecs = 5 if self.zk: wait_until( lambda: len(self.kafka.pids( node)) == 0 and not self.kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + gracePeriodSecs, err_msg= "Failed to see timely deregistration of hard-killed broker %s" % str(node.account)) else: brokerSessionTimeoutSecs = 18 wait_until( lambda: len(self.kafka.pids(node)) == 0, timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs, err_msg= "Failed to see timely disappearance of process for hard-killed broker %s" % str(node.account)) time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs) self.kafka.start_node(node) def create_and_start_message_copier(self, input_topic, input_partition, output_topic, transactional_id, use_group_metadata): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=input_topic, input_partition=input_partition, output_topic=output_topic, max_messages=-1, transaction_size=self.transaction_size, transaction_timeout=self.transaction_timeout, use_group_metadata=use_group_metadata) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=self.progress_timeout_sec, err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \ % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self, input_topic, output_topic, num_copiers, use_group_metadata): copiers = [] for i in range(0, num_copiers): copiers.append( self.create_and_start_message_copier( input_topic=input_topic, output_topic=output_topic, input_partition=i, transactional_id="copier-" + str(i), use_group_metadata=use_group_metadata)) return copiers def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, message_validator=is_int, from_beginning=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" %\ 60) return consumer def drain_consumer(self, consumer, num_messages): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. 
wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" %\ (len(consumer.messages_consumed[1]), num_messages, 90)) consumer.stop() return consumer.messages_consumed[1] def copy_messages_transactionally(self, failure_mode, bounce_target, input_topic, output_topic, num_copiers, num_messages_to_copy, use_group_metadata): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers( input_topic=input_topic, output_topic=output_topic, num_copiers=num_copiers, use_group_metadata=use_group_metadata) concurrent_consumer = self.start_consumer( output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) copier_timeout_sec = 120 for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=copier_timeout_sec, err_msg="%s - Failed to copy all messages in %ds." %\ (copier.transactional_id, copier_timeout_sec)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer, num_messages_to_copy) def setup_topics(self): self.kafka.topics = { self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } } @cluster(num_nodes=9) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"], check_order=[True, False], use_group_metadata=[True, False]) def test_transactions(self, failure_mode, bounce_target, check_order, use_group_metadata, metadata_quorum=quorum.all): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data_1"]["collect_default"] = True self.kafka.logs["kafka_data_2"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"][ "collect_default"] = True if check_order: # To check ordering, we simply create input and output topics # with a single partition. # We reduce the number of seed messages to copy to account for the fewer output # partitions, and thus lower parallelism. This helps keep the test # time shorter. 
self.num_seed_messages = self.num_seed_messages // 3 self.num_input_partitions = 1 self.num_output_partitions = 1 self.setup_topics() self.kafka.start() input_messages = self.seed_messages(self.input_topic, self.num_seed_messages) concurrently_consumed_messages = self.copy_messages_transactionally( failure_mode, bounce_target, input_topic=self.input_topic, output_topic=self.output_topic, num_copiers=self.num_input_partitions, num_messages_to_copy=self.num_seed_messages, use_group_metadata=use_group_metadata) output_messages = self.get_messages_from_topic(self.output_topic, self.num_seed_messages) concurrently_consumed_message_set = set(concurrently_consumed_messages) output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs( len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\ (len(input_message_set), len(concurrently_consumed_message_set)) if check_order: assert input_messages == sorted( input_messages ), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
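
# Hedged sanity check, not part of the test: the timeout relationship described in
# __init__ above, i.e. request timeout (30s client default) <= transaction timeout
# < progress timeout.
request_timeout_ms = 30000        # assumed client default referenced by the comment
transaction_timeout_ms = 40000    # self.transaction_timeout
progress_timeout_sec = 60         # self.progress_timeout_sec
assert request_timeout_ms <= transaction_timeout_ms < progress_timeout_sec * 1000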
class RoundTripFaultTest(Test): topic_name_index = 0 def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(RoundTripFaultTest, self).__init__(test_context) self.zk = ZookeeperService(test_context, num_nodes=3) self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk) self.workload_service = RoundTripWorkloadService( test_context, self.kafka) self.trogdor = TrogdorService( context=self.test_context, client_services=[self.zk, self.kafka, self.workload_service]) topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1 active_topics = { topic_name: { "partitionAssignments": { "0": [0, 1, 2] } } } self.round_trip_spec = RoundTripWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.workload_service.client_node, self.workload_service.bootstrap_servers, target_messages_per_sec=10000, max_messages=100000, active_topics=active_topics) def setUp(self): self.zk.start() self.kafka.start() self.trogdor.start() def teardown(self): self.trogdor.stop() self.kafka.stop() self.zk.stop() def test_round_trip_workload(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) workload1.wait_for_done(timeout_sec=600) def test_round_trip_workload_with_broker_partition(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) part1 = [self.kafka.nodes[0]] part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0] ] + self.zk.nodes partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2]) partition1 = self.trogdor.create_task("partition1", partition1_spec) workload1.wait_for_done(timeout_sec=600) partition1.stop() partition1.wait_for_done() def test_produce_consume_with_broker_pause(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]], self.kafka.java_class_name()) stop1 = self.trogdor.create_task("stop1", stop1_spec) workload1.wait_for_done(timeout_sec=600) stop1.stop() stop1.wait_for_done() self.kafka.stop_node(self.kafka.nodes[0], False) def test_produce_consume_with_client_partition(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) part1 = [self.workload_service.nodes[0]] part2 = self.kafka.nodes + self.zk.nodes partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2]) stop1 = self.trogdor.create_task("stop1", partition1_spec) workload1.wait_for_done(timeout_sec=600) stop1.stop() stop1.wait_for_done()
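# Illustrative sketch (not part of the test suite): a NetworkPartitionFaultSpec-style fault
# takes a list of node groups, and the tests above always split the cluster into two
# disjoint halves. The hypothetical helper and node names below only demonstrate the
# "every node in exactly one group" property that such a split relies on.
def check_partition_groups(all_nodes, groups):
    seen = [node for group in groups for node in group]
    assert len(seen) == len(set(seen)), "A node appears in more than one group"
    assert set(seen) == set(all_nodes), "Groups do not cover every node exactly once"

nodes = ["kafka1", "kafka2", "kafka3", "kafka4", "workload1", "zk1"]  # hypothetical names
check_partition_groups(nodes, [["kafka1"], ["kafka2", "kafka3", "kafka4", "workload1", "zk1"]])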
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.acls = ACLs(self.test_context) self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) self.zk.start() def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int) self.consumer.group_id = "group" def bounce(self): self.kafka.start_minikdc() for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, client_protocol, broker_protocol): # Roll cluster to include inter broker security protocol. self.kafka.setup_interbroker_listener(broker_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port(SecurityConfig.PLAINTEXT) self.set_authorizer_and_bounce(client_protocol, broker_protocol) def set_authorizer_and_bounce(self, client_protocol, broker_protocol): self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(client_protocol, self.kafka, self.topic, self.group) self.acls.set_acls(broker_protocol, self.kafka, self.topic, self.group) self.bounce() def open_secured_port(self, client_protocol): self.kafka.security_protocol = client_protocol self.kafka.open_port(client_protocol) self.kafka.start_minikdc() self.bounce() def add_sasl_mechanism(self, new_client_sasl_mechanism): self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.kafka.start_minikdc() self.bounce() def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism): # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism. 
self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism self.bounce() # Bounce again with ACLs for new mechanism self.set_authorizer_and_bounce(security_protocol, security_protocol) def add_separate_broker_listener(self, broker_security_protocol, broker_sasl_mechanism): self.kafka.setup_interbroker_listener(broker_security_protocol, True) self.kafka.interbroker_sasl_mechanism = broker_sasl_mechanism # kafka opens interbroker port automatically in start() but not in bounce() self.kafka.open_port(self.kafka.INTERBROKER_LISTENER_NAME) self.bounce() def remove_separate_broker_listener(self, client_security_protocol, client_sasl_mechanism): # separate interbroker listener port will be closed automatically in setup_interbroker_listener # if not using separate interbroker listener self.kafka.setup_interbroker_listener(client_security_protocol, False) self.kafka.interbroker_sasl_mechanism = client_sasl_mechanism self.bounce() @cluster(num_nodes=8) @matrix(client_protocol=[SecurityConfig.SSL]) @cluster(num_nodes=9) @matrix(client_protocol=[ SecurityConfig.SASL_PLAINTEXT, SecurityConfig.SASL_SSL ]) def test_rolling_upgrade_phase_one(self, client_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. """ self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT) self.kafka.security_protocol = SecurityConfig.PLAINTEXT self.kafka.start() # Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, client_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = client_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @cluster(num_nodes=8) @matrix(client_protocol=[ SecurityConfig.SASL_SSL, SecurityConfig.SSL, SecurityConfig.SASL_PLAINTEXT ], broker_protocol=[ SecurityConfig.SASL_SSL, SecurityConfig.SSL, SecurityConfig.SASL_PLAINTEXT ]) def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). A third secure port is also open if inter-broker and client protocols are different. Start a Producer and Consumer via the SECURED client port Incrementally upgrade to add inter-broker be the secure broker protocol Incrementally upgrade again to add ACLs as well as disabling the PLAINTEXT port Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = client_protocol self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT, use_separate_listener=False) self.kafka.open_port(broker_protocol) self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. 
Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol) @cluster(num_nodes=9) @matrix(new_client_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN]) def test_rolling_upgrade_sasl_mechanism_phase_one( self, new_client_sasl_mechanism): """ Start with a SASL/GSSAPI cluster, add new SASL mechanism, via a rolling upgrade, ensuring we could produce and consume throughout over SASL/GSSAPI. Finally check we can produce and consume using new mechanism. """ self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL, use_separate_listener=False) self.kafka.security_protocol = SecurityConfig.SASL_SSL self.kafka.client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.start() # Create SASL/GSSAPI producer and consumer self.create_producer_and_consumer() # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run self.run_produce_consume_validate(self.add_sasl_mechanism, new_client_sasl_mechanism) # Now we can produce and consume using the new SASL mechanism self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @cluster(num_nodes=8) @matrix(new_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN]) def test_rolling_upgrade_sasl_mechanism_phase_two(self, new_sasl_mechanism): """ Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism for clients (i.e. result of phase one). Start Producer and Consumer using the second mechanism Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI Incrementally upgrade again to add ACLs Ensure the producer and consumer run throughout """ #Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients self.kafka.security_protocol = SecurityConfig.SASL_SSL self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL, use_separate_listener=False) self.kafka.client_sasl_mechanism = new_sasl_mechanism self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.start() #Create Producer and Consumer using second mechanism self.create_producer_and_consumer() #Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout self.run_produce_consume_validate(self.roll_in_sasl_mechanism, self.kafka.security_protocol, new_sasl_mechanism) @cluster(num_nodes=9) def test_enable_separate_interbroker_listener(self): """ Start with a cluster that has a single PLAINTEXT listener. Start producing/consuming on PLAINTEXT port. While doing that, do a rolling restart to enable separate secured interbroker port """ self.kafka.security_protocol = SecurityConfig.PLAINTEXT self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT, use_separate_listener=False) self.kafka.start() self.create_producer_and_consumer() self.run_produce_consume_validate(self.add_separate_broker_listener, SecurityConfig.SASL_SSL, SecurityConfig.SASL_MECHANISM_PLAIN) @cluster(num_nodes=9) def test_disable_separate_interbroker_listener(self): """ Start with a cluster that has two listeners, one on SSL (clients), another on SASL_SSL (broker-to-broker). Start producer and consumer on SSL listener. Close dedicated interbroker listener via rolling restart. Ensure we can produce and consume via SSL listener throughout. 
""" client_protocol = SecurityConfig.SSL client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.security_protocol = client_protocol self.kafka.client_sasl_mechanism = client_sasl_mechanism self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL, use_separate_listener=True) self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.start() # create producer and consumer via client security protocol self.create_producer_and_consumer() # run produce/consume/validate loop while disabling a separate interbroker listener via rolling restart self.run_produce_consume_validate(self.remove_separate_broker_listener, client_protocol, client_sasl_mechanism)
class ZooKeeperSecurityUpgradeTest(ProduceConsumeValidateTest): """Tests a rolling upgrade for zookeeper. """ def __init__(self, test_context): super(ZooKeeperSecurityUpgradeTest, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.acls = ACLs() self.zk = ZookeeperService(self.test_context, num_nodes=3) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: { "partitions": 3, "replication-factor": 3, 'configs': {"min.insync.replicas": 2}}}) def create_producer_and_consumer(self): self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer( self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = self.group @property def no_sasl(self): return self.kafka.security_protocol == "PLAINTEXT" or self.kafka.security_protocol == "SSL" @property def is_secure(self): return self.kafka.security_protocol == "SASL_PLAINTEXT" \ or self.kafka.security_protocol == "SSL" \ or self.kafka.security_protocol == "SASL_SSL" def run_zk_migration(self): # change zk config (auth provider + jaas login) self.zk.kafka_opts = self.zk.security_system_properties self.zk.zk_sasl = True if self.no_sasl: self.kafka.start_minikdc(self.zk.zk_principals) # restart zk for node in self.zk.nodes: self.zk.stop_node(node) self.zk.start_node(node) # restart broker with jaas login for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) # run migration tool for node in self.zk.nodes: self.zk.zookeeper_migration(node, "secure") # restart broker with zookeeper.set.acl=true and acls self.kafka.zk_set_acl = "true" for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) @matrix(security_protocol=["PLAINTEXT","SSL","SASL_SSL","SASL_PLAINTEXT"]) def test_zk_security_upgrade(self, security_protocol): self.zk.start() self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol # set acls if self.is_secure: self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(security_protocol, self.kafka, self.zk, self.topic, self.group) if(self.no_sasl): self.kafka.start() else: self.kafka.start(self.zk.zk_principals) #Create Producer and Consumer self.create_producer_and_consumer() #Run upgrade self.run_produce_consume_validate(self.run_zk_migration)
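# Illustrative sketch (not part of the test suite): the no_sasl and is_secure properties
# above are simple string predicates on the security protocol name. Standalone equivalents,
# checked against the four protocols the test matrix covers:
def no_sasl(security_protocol):
    return security_protocol in ("PLAINTEXT", "SSL")

def is_secure(security_protocol):
    return security_protocol in ("SSL", "SASL_PLAINTEXT", "SASL_SSL")

assert no_sasl("PLAINTEXT") and not is_secure("PLAINTEXT")
assert no_sasl("SSL") and is_secure("SSL")
assert not no_sasl("SASL_SSL") and is_secure("SASL_SSL")
assert not no_sasl("SASL_PLAINTEXT") and is_secure("SASL_PLAINTEXT")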
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) self.zk.start() def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = "unique-test-group-" + str(random.random()) def bounce(self): self.kafka.start_minikdc() for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, client_protocol, broker_protocol): # Roll cluster to include inter broker security protocol. self.kafka.interbroker_security_protocol = broker_protocol self.kafka.open_port(client_protocol) self.kafka.open_port(broker_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port('PLAINTEXT') self.bounce() def open_secured_port(self, client_protocol): self.kafka.security_protocol = client_protocol self.kafka.open_port(client_protocol) self.kafka.start_minikdc() self.bounce() @matrix(client_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"]) def test_rolling_upgrade_phase_one(self, client_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. """ self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.security_protocol = "PLAINTEXT" self.kafka.start() # Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, client_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = client_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @matrix(client_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"], broker_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"]) def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). Start an Producer and Consumer via the SECURED port Rolling upgrade to add inter-broker be the secure protocol Rolling upgrade again to disable PLAINTEXT Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = client_protocol self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. 
Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol)
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 20000 self.transaction_size = 500 self.first_transactional_id = "my-first-transactional-id" self.second_transactional_id = "my-second-transactional-id" self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, topics={ self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } }) def setUp(self): self.zk.start() def seed_messages(self): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=self.input_topic, message_validator=is_int, max_messages=self.num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_output_topic(self): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=self.output_topic, new_consumer=True, message_validator=is_int, from_beginning=True, consumer_timeout_ms=5000, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: consumer.alive(consumer.nodes[0]) == True, timeout_sec=60, err_msg="Consumer failed to start for %ds" %\ 60) # wait until the consumer closes, which will be 5 seconds after # receiving the last message. 
wait_until(lambda: consumer.alive(consumer.nodes[0]) == False, timeout_sec=60, err_msg="Consumer failed to consume %d messages in %ds" %\ (self.num_seed_messages, 60)) return consumer.messages_consumed[1] def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self .kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of \ hard-killed broker %s" % str(node.account)) self.kafka.start_node(node) def create_and_start_message_copier(self, input_partition, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=self.input_topic, input_partition=input_partition, output_topic=self.output_topic, max_messages=-1, transaction_size=self.transaction_size) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=30, err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \ % (copier.transactional_id, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self): copiers = [] copiers.append( self.create_and_start_message_copier( input_partition=0, transactional_id=self.first_transactional_id)) copiers.append( self.create_and_start_message_copier( input_partition=1, transactional_id=self.second_transactional_id)) return copiers def copy_messages_transactionally(self, failure_mode, bounce_target): copiers = self.create_and_start_copiers() clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=60, err_msg="%s - Failed to copy all messages in %ds." %\ (copier.transactional_id, 60)) self.logger.info("finished copying messages") @cluster(num_nodes=8) @matrix(failure_mode=["clean_bounce", "hard_bounce"], bounce_target=["brokers", "clients"]) def test_transactions(self, failure_mode, bounce_target): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.start() input_messages = self.seed_messages() self.copy_messages_transactionally(failure_mode, bounce_target) output_messages = self.get_messages_from_output_topic() output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set))
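# Illustrative sketch (not part of the test suite): the tests above lean heavily on
# ducktape's wait_until(condition, timeout_sec, err_msg=...). The function below is a
# minimal stand-in, not the real implementation: it polls a zero-argument condition until
# it returns truthy or the timeout expires, then raises with the supplied message.
import time

def wait_until_sketch(condition, timeout_sec, err_msg="", backoff_sec=0.1):
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if condition():
            return
        time.sleep(backoff_sec)
    raise AssertionError(err_msg or "Condition not met within %ds" % timeout_sec)

wait_until_sketch(lambda: True, timeout_sec=1, err_msg="a true condition never times out")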
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.acls = ACLs(self.test_context) self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: { "partitions": 3, "replication-factor": 3, 'configs': {"min.insync.replicas": 2}}}) self.zk.start() def create_producer_and_consumer(self): self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer( self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = "group" def bounce(self): self.kafka.start_minikdc() for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, client_protocol, broker_protocol): # Roll cluster to include inter broker security protocol. self.kafka.interbroker_security_protocol = broker_protocol self.kafka.open_port(client_protocol) self.kafka.open_port(broker_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port('PLAINTEXT') self.set_authorizer_and_bounce(client_protocol, broker_protocol) def set_authorizer_and_bounce(self, client_protocol, broker_protocol): self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(client_protocol, self.kafka, self.zk, self.topic, self.group) self.acls.set_acls(broker_protocol, self.kafka, self.zk, self.topic, self.group) self.bounce() def open_secured_port(self, client_protocol): self.kafka.security_protocol = client_protocol self.kafka.open_port(client_protocol) self.kafka.start_minikdc() self.bounce() def add_sasl_mechanism(self, new_client_sasl_mechanism): self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.kafka.start_minikdc() self.bounce() def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism): # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism. self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism self.bounce() # Bounce again with ACLs for new mechanism self.set_authorizer_and_bounce(security_protocol, security_protocol) @matrix(client_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"]) def test_rolling_upgrade_phase_one(self, client_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. 
""" self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.security_protocol = "PLAINTEXT" self.kafka.start() # Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, client_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = client_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @matrix(client_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"], broker_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"]) def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). Start an Producer and Consumer via the SECURED port Incrementally upgrade to add inter-broker be the secure protocol Incrementally upgrade again to add ACLs as well as disabling the PLAINTEXT port Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = client_protocol self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol) @parametrize(new_client_sasl_mechanism='PLAIN') def test_rolling_upgrade_sasl_mechanism_phase_one(self, new_client_sasl_mechanism): """ Start with a SASL/GSSAPI cluster, add new SASL mechanism, via a rolling upgrade, ensuring we could produce and consume throughout over SASL/GSSAPI. Finally check we can produce and consume using new mechanism. """ self.kafka.interbroker_security_protocol = "SASL_SSL" self.kafka.security_protocol = "SASL_SSL" self.kafka.client_sasl_mechanism = "GSSAPI" self.kafka.interbroker_sasl_mechanism = "GSSAPI" self.kafka.start() # Create SASL/GSSAPI producer and consumer self.create_producer_and_consumer() # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run self.run_produce_consume_validate(self.add_sasl_mechanism, new_client_sasl_mechanism) # Now we can produce and consume using the new SASL mechanism self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @parametrize(new_sasl_mechanism='PLAIN') def test_rolling_upgrade_sasl_mechanism_phase_two(self, new_sasl_mechanism): """ Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism for clients (i.e. result of phase one). 
Start Producer and Consumer using the second mechanism Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI Incrementally upgrade again to add ACLs Ensure the producer and consumer run throughout """ #Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients self.kafka.security_protocol = "SASL_SSL" self.kafka.interbroker_security_protocol = "SASL_SSL" self.kafka.client_sasl_mechanism = new_sasl_mechanism self.kafka.interbroker_sasl_mechanism = "GSSAPI" self.kafka.start() #Create Producer and Consumer using second mechanism self.create_producer_and_consumer() #Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout self.run_produce_consume_validate(self.roll_in_sasl_mechanism, self.kafka.security_protocol, new_sasl_mechanism)
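# Illustrative sketch (not part of the test suite): roll_in_sasl_mechanism above changes
# broker settings in two bounces -- first the inter-broker SASL mechanism, then the
# authorizer (with ACLs added alongside). The plain dicts below are hypothetical broker
# configs that only illustrate the ordering of those two steps.
def roll_in_sasl_mechanism_sketch(broker_config, new_mechanism, authorizer):
    # Bounce 1: switch the inter-broker mechanism while clients keep using their own.
    after_first_bounce = dict(broker_config,
                              **{"sasl.mechanism.inter.broker.protocol": new_mechanism})
    # Bounce 2: enable the authorizer once ACLs for the new mechanism are in place.
    after_second_bounce = dict(after_first_bounce,
                               **{"authorizer.class.name": authorizer})
    return after_first_bounce, after_second_bounce

first, second = roll_in_sasl_mechanism_sketch(
    {"sasl.mechanism.inter.broker.protocol": "GSSAPI"}, "PLAIN",
    "kafka.security.auth.SimpleAclAuthorizer")
assert first["sasl.mechanism.inter.broker.protocol"] == "PLAIN"
assert "authorizer.class.name" not in first and "authorizer.class.name" in second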
class RoundTripFaultTest(Test): topic_name_index = 0 def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(RoundTripFaultTest, self).__init__(test_context) self.zk = ZookeeperService(test_context, num_nodes=3) self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk) self.workload_service = RoundTripWorkloadService(test_context, self.kafka) self.trogdor = TrogdorService(context=self.test_context, client_services=[self.zk, self.kafka, self.workload_service]) topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1 active_topics={topic_name : {"partitionAssignments":{"0": [0,1,2]}}} self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS, self.workload_service.client_node, self.workload_service.bootstrap_servers, target_messages_per_sec=10000, max_messages=100000, active_topics=active_topics) def setUp(self): self.zk.start() self.kafka.start() self.trogdor.start() def teardown(self): self.trogdor.stop() self.kafka.stop() self.zk.stop() def test_round_trip_workload(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) workload1.wait_for_done(timeout_sec=600) def test_round_trip_workload_with_broker_partition(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) part1 = [self.kafka.nodes[0]] part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2]) partition1 = self.trogdor.create_task("partition1", partition1_spec) workload1.wait_for_done(timeout_sec=600) partition1.stop() partition1.wait_for_done() def test_produce_consume_with_broker_pause(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]], self.kafka.java_class_name()) stop1 = self.trogdor.create_task("stop1", stop1_spec) workload1.wait_for_done(timeout_sec=600) stop1.stop() stop1.wait_for_done() self.kafka.stop_node(self.kafka.nodes[0], False) def test_produce_consume_with_client_partition(self): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) part1 = [self.workload_service.nodes[0]] part2 = self.kafka.nodes + self.zk.nodes partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2]) stop1 = self.trogdor.create_task("stop1", partition1_spec) workload1.wait_for_done(timeout_sec=600) stop1.stop() stop1.wait_for_done()
class ZooKeeperSecurityUpgradeTest(ProduceConsumeValidateTest): """Tests a rolling upgrade for zookeeper. """ def __init__(self, test_context): super(ZooKeeperSecurityUpgradeTest, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.acls = ACLs() self.zk = ZookeeperService(self.test_context, num_nodes=3) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = self.group @property def no_sasl(self): return self.kafka.security_protocol == "PLAINTEXT" or self.kafka.security_protocol == "SSL" @property def is_secure(self): return self.kafka.security_protocol == "SASL_PLAINTEXT" \ or self.kafka.security_protocol == "SSL" \ or self.kafka.security_protocol == "SASL_SSL" def run_zk_migration(self): # change zk config (auth provider + jaas login) self.zk.kafka_opts = self.zk.security_system_properties self.zk.zk_sasl = True if self.no_sasl: self.kafka.start_minikdc(self.zk.zk_principals) # restart zk for node in self.zk.nodes: self.zk.stop_node(node) self.zk.start_node(node) # restart broker with jaas login for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) # run migration tool for node in self.zk.nodes: self.zk.zookeeper_migration(node, "secure") # restart broker with zookeeper.set.acl=true and acls self.kafka.zk_set_acl = "true" for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) @matrix( security_protocol=["PLAINTEXT", "SSL", "SASL_SSL", "SASL_PLAINTEXT"]) def test_zk_security_upgrade(self, security_protocol): self.zk.start() self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol # set acls if self.is_secure: self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(security_protocol, self.kafka, self.zk, self.topic, self.group) if (self.no_sasl): self.kafka.start() else: self.kafka.start(self.zk.zk_principals) #Create Producer and Consumer self.create_producer_and_consumer() #Run upgrade self.run_produce_consume_validate(self.run_zk_migration)
class ZookeeperTlsTest(ProduceConsumeValidateTest): """Tests TLS connectivity to zookeeper. """ def __init__(self, test_context): super(ZookeeperTlsTest, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=3) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int) self.consumer.group_id = self.group def perform_produce_consume_validation(self): self.create_producer_and_consumer() self.run_produce_consume_validate() self.producer.free() self.consumer.free() def enable_zk_tls(self): self.test_context.logger.debug( "Enabling the TLS port in Zookeeper (we won't use it from Kafka yet)" ) # change zk config (enable TLS, but also keep non-TLS) self.zk.zk_client_secure_port = True self.zk.restart_cluster() # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check self.kafka.stop_node(self.kafka.nodes[0]) self.kafka.start_node(self.kafka.nodes[0]) def enable_kafka_zk_tls(self): self.test_context.logger.debug( "Configuring Kafka to use the TLS port in Zookeeper") # change Kafka config (enable TLS to Zookeeper) and restart the Kafka cluster self.kafka.zk_client_secure = True self.kafka.restart_cluster() def disable_zk_non_tls(self): self.test_context.logger.debug( "Disabling the non-TLS port in Zookeeper (as a simple sanity check)" ) # change zk config (disable non-TLS, keep TLS) and restart the ZooKeeper cluster self.zk.zk_client_port = False self.zk.restart_cluster() # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check self.kafka.stop_node(self.kafka.nodes[0]) self.kafka.start_node(self.kafka.nodes[0]) @cluster(num_nodes=9) def test_zk_tls(self): self.zk.start() self.kafka.security_protocol = self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() # Enable TLS port in Zookeeper in addition to the regular non-TLS port # Bounces the ZooKeeper cluster (and a single broker as a sanity check) self.enable_zk_tls() # Leverage ZooKeeper TLS port in Kafka # Bounces the Kafka cluster self.enable_kafka_zk_tls() self.perform_produce_consume_validation() # Disable ZooKeeper non-TLS port to make sure we aren't using it # Bounces the ZooKeeper cluster (and a single broker as a sanity check) self.disable_zk_non_tls() # Make sure the ZooKeeper command line is able to talk to a TLS-enabled ZooKeeper quorum # Test both create() and query(), each of which leverages the ZooKeeper command line # This tests the code in org.apache.zookeeper.ZooKeeperMainWithTlsSupportForKafka path = "/foo" value = "{\"bar\": 0}" self.zk.create(path, value=value) if self.zk.query(path) != value: raise Exception( "Error creating and then querying a znode using the CLI with a TLS-enabled ZooKeeper quorum" ) # Make sure the ConfigCommand CLI is able to talk to a TLS-enabled ZooKeeper quorum # This is necessary for the bootstrap use case despite direct ZooKeeper connectivity being deprecated self.zk.describe(self.topic) # 
Make sure the AclCommand CLI is able to talk to a TLS-enabled ZooKeeper quorum # This is necessary for the bootstrap use case despite direct ZooKeeper connectivity being deprecated self.zk.list_acls(self.topic) # # Test zookeeper.set.acl with just TLS mutual authentication (no SASL) # # Step 1: run migration tool self.zk.zookeeper_migration(self.zk.nodes[0], "secure") # Step 2: restart brokers with zookeeper.set.acl=true and acls (with TLS but no SASL) self.kafka.zk_set_acl = True self.kafka.restart_cluster() self.perform_produce_consume_validation() # # Test zookeeper.set.acl with both SASL and TLS mutual authentication # # Step 1: remove ACLs created previously self.kafka.zk_set_acl = False self.kafka.restart_cluster() self.zk.zookeeper_migration(self.zk.nodes[0], "unsecure") # Step 2: enable ZooKeeper SASL authentication, but don't take advantage of it in Kafka yet self.zk.zk_sasl = True self.kafka.start_minikdc_if_necessary(self.zk.zk_principals) self.zk.restart_cluster() # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check self.kafka.stop_node(self.kafka.nodes[0]) self.kafka.start_node(self.kafka.nodes[0]) # Step 3: run migration tool self.zk.zookeeper_migration(self.zk.nodes[0], "secure") # Step 4: restart brokers with zookeeper.set.acl=true and acls (with both TLS and SASL) self.kafka.zk_set_acl = True self.kafka.restart_cluster() self.perform_produce_consume_validation()
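# Illustrative sketch (not part of the test suite): the CLI sanity check in test_zk_tls
# creates a znode and then reads it back, raising if the value does not round-trip.
# The in-memory store below is a hypothetical stand-in for the ZooKeeper CLI calls.
class InMemoryZnodeStore(object):
    def __init__(self):
        self._znodes = {}

    def create(self, path, value):
        self._znodes[path] = value

    def query(self, path):
        return self._znodes.get(path)

def check_znode_round_trip(store, path, value):
    store.create(path, value)
    if store.query(path) != value:
        raise Exception("Error creating and then querying znode %s" % path)

check_znode_round_trip(InMemoryZnodeStore(), "/foo", "{\"bar\": 0}")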
class RoundTripFaultTest(Test): topic_name_index = 0 def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(RoundTripFaultTest, self).__init__(test_context) self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk) self.workload_service = RoundTripWorkloadService( test_context, self.kafka) if quorum.for_test(test_context) == quorum.zk: trogdor_client_services = [ self.zk, self.kafka, self.workload_service ] elif quorum.for_test(test_context) == quorum.remote_kraft: trogdor_client_services = [ self.kafka.controller_quorum, self.kafka, self.workload_service ] else: #co-located case, which we currently don't test but handle here for completeness in case we do test it trogdor_client_services = [self.kafka, self.workload_service] self.trogdor = TrogdorService(context=self.test_context, client_services=trogdor_client_services) topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1 active_topics = { topic_name: { "partitionAssignments": { "0": [0, 1, 2] } } } self.round_trip_spec = RoundTripWorkloadSpec( 0, TaskSpec.MAX_DURATION_MS, self.workload_service.client_node, self.workload_service.bootstrap_servers, target_messages_per_sec=10000, max_messages=100000, active_topics=active_topics) def setUp(self): if self.zk: self.zk.start() self.kafka.start() self.trogdor.start() def teardown(self): self.trogdor.stop() self.kafka.stop() if self.zk: self.zk.stop() def remote_quorum_nodes(self): if quorum.for_test(self.test_context) == quorum.zk: return self.zk.nodes elif quorum.for_test(self.test_context) == quorum.remote_kraft: return self.kafka.controller_quorum.nodes else: # co-located case, which we currently don't test but handle here for completeness in case we do test it return [] @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_round_trip_workload(self, metadata_quorum=quorum.zk): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) workload1.wait_for_done(timeout_sec=600) @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_round_trip_workload_with_broker_partition( self, metadata_quorum=quorum.zk): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) part1 = [self.kafka.nodes[0]] part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0] ] + self.remote_quorum_nodes() partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2]) partition1 = self.trogdor.create_task("partition1", partition1_spec) workload1.wait_for_done(timeout_sec=600) partition1.stop() partition1.wait_for_done() @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_produce_consume_with_broker_pause(self, metadata_quorum=quorum.zk): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]], self.kafka.java_class_name()) stop1 = self.trogdor.create_task("stop1", stop1_spec) workload1.wait_for_done(timeout_sec=600) stop1.stop() stop1.wait_for_done() self.kafka.stop_node(self.kafka.nodes[0], False) @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_produce_consume_with_client_partition(self, metadata_quorum=quorum.zk): workload1 = self.trogdor.create_task("workload1", 
self.round_trip_spec) time.sleep(2) part1 = [self.workload_service.nodes[0]] part2 = self.kafka.nodes + self.remote_quorum_nodes() partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2]) stop1 = self.trogdor.create_task("stop1", partition1_spec) workload1.wait_for_done(timeout_sec=600) stop1.stop() stop1.wait_for_done() @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_non_upgrade) def test_produce_consume_with_latency(self, metadata_quorum=quorum.zk): workload1 = self.trogdor.create_task("workload1", self.round_trip_spec) time.sleep(2) spec = DegradedNetworkFaultSpec(0, 60000) for node in self.kafka.nodes + self.remote_quorum_nodes(): spec.add_node_spec(node.name, "eth0", latencyMs=100, rateLimitKbit=3000) slow1 = self.trogdor.create_task("slow1", spec) workload1.wait_for_done(timeout_sec=600) slow1.stop() slow1.wait_for_done()
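# Illustrative sketch (not part of the test suite): test_produce_consume_with_latency adds
# one per-node entry to a DegradedNetworkFaultSpec, pairing a network device with the
# latency and rate limit to inject. The dict shape and field names below are hypothetical,
# shown only to make the per-node structure concrete.
def build_degraded_network_specs(node_names, device="eth0", latency_ms=100,
                                 rate_limit_kbit=3000):
    return {
        name: {"device": device, "latencyMs": latency_ms, "rateLimitKbit": rate_limit_kbit}
        for name in node_names
    }

specs = build_degraded_network_specs(["kafka1", "kafka2", "zk1"])  # hypothetical node names
assert specs["kafka1"]["latencyMs"] == 100 and specs["zk1"]["rateLimitKbit"] == 3000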
class GroupModeTransactionsTest(Test): """Essentially testing the same functionality as TransactionsTest by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. The major difference is that we choose to work as a collaborated group with same topic subscription instead of individual copiers. In the end we verify that the final output topic contains exactly one committed copy of each message from the original producer. """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(GroupModeTransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 9 self.num_output_partitions = 9 self.num_copiers = 3 self.num_seed_messages = 100000 self.transaction_size = 750 # The transaction timeout should be lower than the progress timeout, but at # least as high as the request timeout (which is 30s by default). When the # client is hard-bounced, progress may depend on the previous transaction # being aborted. When the broker is hard-bounced, we may have to wait as # long as the request timeout to get a `Produce` response and we do not # want the coordinator timing out the transaction. self.transaction_timeout = 40000 self.progress_timeout_sec = 60 self.consumer_group = "grouped-transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, controller_num_nodes_override=1) def setUp(self): if self.zk: self.zk.start() def seed_messages(self, topic, num_seed_messages): seed_timeout_sec = 10000 seed_producer = VerifiableProducer( context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic, message_validator=is_int, max_messages=num_seed_messages, enable_idempotence=True, repeating_keys=self.num_input_partitions) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." 
% \ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked_by_partition def get_messages_from_topic(self, topic, num_messages): consumer = self.start_consumer(topic, group_id="verifying_consumer") return self.drain_consumer(consumer, num_messages) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) gracePeriodSecs = 5 if self.zk: wait_until( lambda: len(self.kafka.pids( node)) == 0 and not self.kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + gracePeriodSecs, err_msg= "Failed to see timely deregistration of hard-killed broker %s" % str(node.account)) else: brokerSessionTimeoutSecs = 18 wait_until( lambda: len(self.kafka.pids(node)) == 0, timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs, err_msg= "Failed to see timely disappearance of process for hard-killed broker %s" % str(node.account)) time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs) self.kafka.start_node(node) def create_and_start_message_copier(self, input_topic, output_topic, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=input_topic, input_partition=-1, output_topic=output_topic, max_messages=-1, transaction_size=self.transaction_size, transaction_timeout=self.transaction_timeout, use_group_metadata=True, group_mode=True) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown, timeout_sec=240): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=self.progress_timeout_sec, err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \ % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self, input_topic, output_topic, num_copiers): copiers = [] for i in range(0, num_copiers): copiers.append( self.create_and_start_message_copier( input_topic=input_topic, output_topic=output_topic, transactional_id="copier-" + str(i))) return copiers @staticmethod def valid_value_and_partition(msg): """Method used to check whether the given message is a valid tab separated value + partition return value and partition as a size-two array represented tuple: [value, partition] """ try: splitted_msg = msg.split('\t') value = int(splitted_msg[1]) partition = int(splitted_msg[0].split(":")[1]) return [value, partition] except ValueError: raise Exception( "Unexpected message format (expected a tab separated [value, partition] tuple). Message: %s" % (msg)) def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer( context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, message_validator=self.valid_value_and_partition, from_beginning=True, print_partition=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. 
wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" % \ 60) return consumer @staticmethod def split_by_partition(messages_consumed): messages_by_partition = {} for msg in messages_consumed: partition = msg[1] if partition not in messages_by_partition: messages_by_partition[partition] = [] messages_by_partition[partition].append(msg[0]) return messages_by_partition def drain_consumer(self, consumer, num_messages): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" % \ (len(consumer.messages_consumed[1]), num_messages, 90)) consumer.stop() return self.split_by_partition(consumer.messages_consumed[1]) def copy_messages_transactionally(self, failure_mode, bounce_target, input_topic, output_topic, num_copiers, num_messages_to_copy): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers(input_topic=input_topic, output_topic=output_topic, num_copiers=num_copiers) concurrent_consumer = self.start_consumer( output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) copier_timeout_sec = 240 for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=copier_timeout_sec, err_msg="%s - Failed to copy all messages in %ds." 
% \ (copier.transactional_id, copier_timeout_sec)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer, num_messages_to_copy) def setup_topics(self): self.kafka.topics = { self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } } @cluster(num_nodes=10) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"]) def test_transactions(self, failure_mode, bounce_target, metadata_quorum=quorum.zk): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data_1"]["collect_default"] = True self.kafka.logs["kafka_data_2"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"][ "collect_default"] = True self.setup_topics() self.kafka.start() input_messages_by_partition = self.seed_messages( self.input_topic, self.num_seed_messages) concurrently_consumed_message_by_partition = self.copy_messages_transactionally( failure_mode, bounce_target, input_topic=self.input_topic, output_topic=self.output_topic, num_copiers=self.num_copiers, num_messages_to_copy=self.num_seed_messages) output_messages_by_partition = self.get_messages_from_topic( self.output_topic, self.num_seed_messages) assert len(input_messages_by_partition) == \ len(concurrently_consumed_message_by_partition), "Partition counts do not match: " \ "input partitions count %d, " \ "concurrently consumed partitions count %d" % \ (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition)) assert len(input_messages_by_partition) == \ len(output_messages_by_partition), "Partition counts do not match: " \ "input partitions count %d, " \ "output partitions count %d" % \ (len(input_messages_by_partition), len(output_messages_by_partition)) for p in range(self.num_input_partitions): if p not in input_messages_by_partition: continue assert p in output_messages_by_partition, "Partition %d not in output messages" % p assert p in concurrently_consumed_message_by_partition, "Partition %d not in concurrently consumed messages" % p output_messages = output_messages_by_partition[p] input_messages = input_messages_by_partition[p] concurrently_consumed_messages = concurrently_consumed_message_by_partition[p] output_message_set = set(output_messages) input_message_set = set(input_messages) concurrently_consumed_message_set = set(concurrently_consumed_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs( len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" % \ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. 
Num concurrently_consumed_messages: %d" % \ (len(input_message_set), len(concurrently_consumed_message_set)) assert input_messages == sorted( input_messages ), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
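# Illustrative sketch (not part of the test suite): the group-mode validation above parses
# each consumed record into a [value, partition] pair and then groups values by partition
# before comparing the per-partition lists. Assuming a hypothetical record format of
# "<key>:<partition>\t<value>", a standalone equivalent of that parse-and-group pipeline:
def parse_value_and_partition(msg):
    key_part, value_part = msg.split("\t")
    return [int(value_part), int(key_part.split(":")[1])]

def group_values_by_partition(parsed_messages):
    by_partition = {}
    for value, partition in parsed_messages:
        by_partition.setdefault(partition, []).append(value)
    return by_partition

records = ["p:0\t10", "p:1\t11", "p:0\t12"]  # hypothetical records
grouped = group_values_by_partition([parse_value_and_partition(m) for m in records])
assert grouped == {0: [10, 12], 1: [11]}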