class TestUpgrade(ProduceConsumeValidateTest): def __init__(self, test_context): super(TestUpgrade, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=LATEST_0_8_2, topics={self.topic: { "partitions": 3, "replication-factor": 3, 'configs': {"min.insync.replicas": 2}}}) self.zk.start() self.kafka.start() # Producer and consumer self.producer_throughput = 10000 self.num_producers = 1 self.num_consumers = 1 self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput, version=LATEST_0_8_2) # TODO - reduce the timeout self.consumer = ConsoleConsumer( self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=30000, message_validator=is_int, version=LATEST_0_8_2) def perform_upgrade(self): self.logger.info("First pass bounce - rolling upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = TRUNK node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] = "0.8.2.X" self.kafka.start_node(node) self.logger.info("Second pass bounce - remove inter.broker.protocol.version config") for node in self.kafka.nodes: self.kafka.stop_node(node) del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] self.kafka.start_node(node) def test_upgrade(self): """Test upgrade of Kafka broker cluster from 0.8.2 to 0.9.0 - Start 3 node broker cluster on version 0.8.2 - Start producer and consumer in the background - Perform two-phase rolling upgrade - First phase: upgrade brokers to 0.9.0 with inter.broker.protocol.version set to 0.8.2.X - Second phase: remove inter.broker.protocol.version config with rolling bounce - Finally, validate that every message acked by the producer was consumed by the consumer """ self.run_produce_consume_validate(core_test_action=self.perform_upgrade)
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.acls = ACLs(self.test_context) self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: { "partitions": 3, "replication-factor": 3, 'configs': {"min.insync.replicas": 2}}}) self.zk.start() def create_producer_and_consumer(self): self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer( self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = "group" def bounce(self): self.kafka.start_minikdc() for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, client_protocol, broker_protocol): # Roll cluster to include inter broker security protocol. self.kafka.interbroker_security_protocol = broker_protocol self.kafka.open_port(client_protocol) self.kafka.open_port(broker_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port('PLAINTEXT') self.set_authorizer_and_bounce(client_protocol, broker_protocol) def set_authorizer_and_bounce(self, client_protocol, broker_protocol): self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(client_protocol, self.kafka, self.zk, self.topic, self.group) self.acls.set_acls(broker_protocol, self.kafka, self.zk, self.topic, self.group) self.bounce() def open_secured_port(self, client_protocol): self.kafka.security_protocol = client_protocol self.kafka.open_port(client_protocol) self.kafka.start_minikdc() self.bounce() def add_sasl_mechanism(self, new_client_sasl_mechanism): self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.kafka.start_minikdc() self.bounce() def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism): # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism. self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism self.bounce() # Bounce again with ACLs for new mechanism self.set_authorizer_and_bounce(security_protocol, security_protocol) @matrix(client_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"]) def test_rolling_upgrade_phase_one(self, client_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. """ self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.security_protocol = "PLAINTEXT" self.kafka.start() # Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, client_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = client_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @matrix(client_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"], broker_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"]) def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). Start an Producer and Consumer via the SECURED port Incrementally upgrade to add inter-broker be the secure protocol Incrementally upgrade again to add ACLs as well as disabling the PLAINTEXT port Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = client_protocol self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol) @parametrize(new_client_sasl_mechanism='PLAIN') def test_rolling_upgrade_sasl_mechanism_phase_one(self, new_client_sasl_mechanism): """ Start with a SASL/GSSAPI cluster, add new SASL mechanism, via a rolling upgrade, ensuring we could produce and consume throughout over SASL/GSSAPI. Finally check we can produce and consume using new mechanism. """ self.kafka.interbroker_security_protocol = "SASL_SSL" self.kafka.security_protocol = "SASL_SSL" self.kafka.client_sasl_mechanism = "GSSAPI" self.kafka.interbroker_sasl_mechanism = "GSSAPI" self.kafka.start() # Create SASL/GSSAPI producer and consumer self.create_producer_and_consumer() # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run self.run_produce_consume_validate(self.add_sasl_mechanism, new_client_sasl_mechanism) # Now we can produce and consume using the new SASL mechanism self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @parametrize(new_sasl_mechanism='PLAIN') def test_rolling_upgrade_sasl_mechanism_phase_two(self, new_sasl_mechanism): """ Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism for clients (i.e. result of phase one). Start Producer and Consumer using the second mechanism Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI Incrementally upgrade again to add ACLs Ensure the producer and consumer run throughout """ #Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients self.kafka.security_protocol = "SASL_SSL" self.kafka.interbroker_security_protocol = "SASL_SSL" self.kafka.client_sasl_mechanism = new_sasl_mechanism self.kafka.interbroker_sasl_mechanism = "GSSAPI" self.kafka.start() #Create Producer and Consumer using second mechanism self.create_producer_and_consumer() #Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout self.run_produce_consume_validate(self.roll_in_sasl_mechanism, self.kafka.security_protocol, new_sasl_mechanism)
class StreamsBrokerDownResilience(Test): """ This test validates that Streams is resilient to a broker being down longer than specified timeouts in configs """ inputTopic = "streamsResilienceSource" outputTopic = "streamsResilienceSink" num_messages = 5 def __init__(self, test_context): super(StreamsBrokerDownResilience, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.inputTopic: { 'partitions': 3, 'replication-factor': 1 }, self.outputTopic: { 'partitions': 1, 'replication-factor': 1 } }) def get_consumer(self, num_messages): return VerifiableConsumer(self.test_context, 1, self.kafka, self.outputTopic, "stream-broker-resilience-verify-consumer", max_messages=num_messages) def get_producer(self, num_messages): return VerifiableProducer(self.test_context, 1, self.kafka, self.inputTopic, max_messages=num_messages, acks=1) def assert_produce_consume(self, test_state, num_messages=5): producer = self.get_producer(num_messages) producer.start() wait_until(lambda: producer.num_acked >= num_messages, timeout_sec=30, err_msg="At %s failed to send messages " % test_state) consumer = self.get_consumer(num_messages) consumer.start() wait_until( lambda: consumer.total_consumed() >= num_messages, timeout_sec=60, err_msg="At %s streams did not process messages in 60 seconds " % test_state) @staticmethod def get_configs(extra_configs=""): # Consumer max.poll.interval > min(max.block.ms, ((retries + 1) * request.timeout) consumer_poll_ms = "consumer.max.poll.interval.ms=50000" retries_config = "producer.retries=2" request_timeout = "producer.request.timeout.ms=15000" max_block_ms = "producer.max.block.ms=30000" # java code expects configs in key=value,key=value format updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms + extra_configs return updated_configs def wait_for_verification(self, processor, message, file, num_lines=1): wait_until(lambda: self.verify_from_file(processor, message, file ) >= num_lines, timeout_sec=60, err_msg="Did expect to read '%s' from %s" % (message, processor.node.account)) @staticmethod def verify_from_file(processor, message, file): result = processor.node.account.ssh_output("grep '%s' %s | wc -l" % (message, file), allow_fail=False) return int(result) def setUp(self): self.zk.start() def test_streams_resilient_to_broker_down(self): self.kafka.start() # Broker should be down over 2x of retries * timeout ms # So with (2 * 15000) = 30 seconds, we'll set downtime to 70 seconds broker_down_time_in_seconds = 70 processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, self.get_configs()) processor.start() # until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down # After KIP-91 is merged we'll continue to send messages the duration of the test self.assert_produce_consume("before_broker_stop") node = self.kafka.leader(self.inputTopic) self.kafka.stop_node(node) time.sleep(broker_down_time_in_seconds) self.kafka.start_node(node) self.assert_produce_consume("after_broker_stop") self.kafka.stop() def test_streams_runs_with_broker_down_initially(self): self.kafka.start() node = self.kafka.leader(self.inputTopic) self.kafka.stop_node(node) configs = self.get_configs( extra_configs=",application.id=starting_wo_broker_id") # start streams with broker down initially processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs) processor.start() processor_2 = StreamsBrokerDownResilienceService( self.test_context, self.kafka, configs) processor_2.start() processor_3 = StreamsBrokerDownResilienceService( self.test_context, self.kafka, configs) processor_3.start() broker_unavailable_message = "Broker may not be available" # verify streams instances unable to connect to broker, kept trying self.wait_for_verification(processor, broker_unavailable_message, processor.LOG_FILE, 100) self.wait_for_verification(processor_2, broker_unavailable_message, processor_2.LOG_FILE, 100) self.wait_for_verification(processor_3, broker_unavailable_message, processor_3.LOG_FILE, 100) # now start broker self.kafka.start_node(node) # assert streams can process when starting with broker down self.assert_produce_consume("running_with_broker_down_initially", num_messages=9) message = "processed3messages" # need to show all 3 instances processed messages self.wait_for_verification(processor, message, processor.STDOUT_FILE) self.wait_for_verification(processor_2, message, processor_2.STDOUT_FILE) self.wait_for_verification(processor_3, message, processor_3.STDOUT_FILE) self.kafka.stop() def test_streams_should_scale_in_while_brokers_down(self): self.kafka.start() configs = self.get_configs( extra_configs=",application.id=shutdown_with_broker_down") processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs) processor.start() processor_2 = StreamsBrokerDownResilienceService( self.test_context, self.kafka, configs) processor_2.start() processor_3 = StreamsBrokerDownResilienceService( self.test_context, self.kafka, configs) processor_3.start() # need to wait for rebalance once self.wait_for_verification( processor_3, "State transition from REBALANCING to RUNNING", processor_3.LOG_FILE) # assert streams can process when starting with broker down self.assert_produce_consume("waiting for rebalance to complete", num_messages=9) message = "processed3messages" self.wait_for_verification(processor, message, processor.STDOUT_FILE) self.wait_for_verification(processor_2, message, processor_2.STDOUT_FILE) self.wait_for_verification(processor_3, message, processor_3.STDOUT_FILE) node = self.kafka.leader(self.inputTopic) self.kafka.stop_node(node) processor.stop() processor_2.stop() shutdown_message = "Complete shutdown of streams resilience test app now" self.wait_for_verification(processor, shutdown_message, processor.STDOUT_FILE) self.wait_for_verification(processor_2, shutdown_message, processor_2.STDOUT_FILE) self.kafka.start_node(node) self.assert_produce_consume( "sending_message_after_stopping_streams_instance_bouncing_broker", num_messages=9) self.wait_for_verification(processor_3, "processed9messages", processor_3.STDOUT_FILE) self.kafka.stop()
class TestSnapshots(ProduceConsumeValidateTest): TOPIC_NAME_PREFIX = "test_topic_" def __init__(self, test_context): super(TestSnapshots, self).__init__(test_context=test_context) self.topics_created = 0 self.topic = "test_topic" self.partitions = 3 self.replication_factor = 3 self.num_nodes = 3 # Producer and consumer self.producer_throughput = 1000 self.num_producers = 1 self.num_consumers = 1 security_protocol = 'PLAINTEXT' # Setup Custom Config to ensure snapshot will be generated deterministically self.kafka = KafkaService( self.test_context, self.num_nodes, zk=None, topics={ self.topic: { "partitions": self.partitions, "replication-factor": self.replication_factor, 'configs': { "min.insync.replicas": 2 } } }, server_prop_overrides=[ [ config_property.METADATA_LOG_DIR, KafkaService.METADATA_LOG_DIR ], [config_property.METADATA_LOG_SEGMENT_MS, "10000"], [config_property.METADATA_LOG_RETENTION_BYTES, "2048"], [config_property.METADATA_LOG_BYTES_BETWEEN_SNAPSHOTS, "2048"] ]) self.kafka.interbroker_security_protocol = security_protocol self.kafka.security_protocol = security_protocol def setUp(self): # Start the cluster and ensure that a snapshot is generated self.logger.info( "Starting the cluster and running until snapshot creation") assert quorum.for_test(self.test_context) in quorum.all_kraft, \ "Snapshot test should be run Kraft Modes only" self.kafka.start() topic_count = 10 self.topics_created += self.create_n_topics(topic_count) if self.kafka.remote_controller_quorum: self.controller_nodes = self.kafka.remote_controller_quorum.nodes else: self.controller_nodes = self.kafka.nodes[:self.kafka. num_nodes_controller_role] # Waiting for snapshot creation and first log segment # cleanup on all controller nodes for node in self.controller_nodes: self.logger.debug("Waiting for snapshot on: %s" % self.kafka.who_am_i(node)) self.wait_for_log_segment_delete(node) self.wait_for_snapshot(node) self.logger.debug("Verified Snapshots exist on controller nodes") def create_n_topics(self, topic_count): for i in range(self.topics_created, topic_count): topic = "%s%d" % (TestSnapshots.TOPIC_NAME_PREFIX, i) self.logger.debug("Creating topic %s" % topic) topic_cfg = { "topic": topic, "partitions": self.partitions, "replication-factor": self.replication_factor, "configs": { "min.insync.replicas": 2 } } self.kafka.create_topic(topic_cfg) self.logger.debug("Created %d more topics" % topic_count) return topic_count def wait_for_log_segment_delete(self, node): file_path = self.kafka.METADATA_FIRST_LOG # Wait until the first log segment in metadata log is marked for deletion wait_until( lambda: not self.file_exists(node, file_path), timeout_sec=100, backoff_sec=1, err_msg= "Not able to verify cleanup of log file %s in a reasonable amount of time" % file_path) def wait_for_snapshot(self, node): # Wait for a snapshot file to show up file_path = self.kafka.METADATA_SNAPSHOT_SEARCH_STR wait_until( lambda: self.file_exists(node, file_path), timeout_sec=100, backoff_sec=1, err_msg= "Not able to verify snapshot existence in a reasonable amount of time" ) def file_exists(self, node, file_path): # Check if the first log segment is cleaned up self.logger.debug("Checking if file %s exists" % file_path) cmd = "ls %s" % file_path files = node.account.ssh_output(cmd, allow_fail=True, combine_stderr=False) if len(files) is 0: self.logger.debug("File %s does not exist" % file_path) return False else: self.logger.debug("File %s was found" % file_path) return True def validate_success(self, topic=None): if topic is None: # Create a new topic topic = "%s%d" % (TestSnapshots.TOPIC_NAME_PREFIX, self.topics_created) self.topics_created += self.create_n_topics(topic_count=1) # Produce to the newly created topic to ensure broker has caught up self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, topic, throughput=self.producer_throughput, message_validator=is_int) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, topic, consumer_timeout_ms=30000, message_validator=is_int) self.start_producer_and_consumer() self.stop_producer_and_consumer() self.validate() @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_kraft) def test_broker(self, metadata_quorum=quorum.colocated_kraft): """ Test the ability of a broker to consume metadata snapshots and to recover the cluster metadata state using them The test ensures that that there is atleast one snapshot created on the controller quorum during the setup phase and that at least the first log segment in the metadata log has been marked for deletion, thereby ensuring that any observer of the log needs to always load a snapshot to catch up to the current metadata state. Each scenario is a progression over the previous one. The scenarios build on top of each other by: * Loading a snapshot * Loading and snapshot and some delta records * Loading a snapshot and delta and ensuring that the most recent metadata state has been caught up. Even though a subsequent scenario covers the previous one, they are all left in the test to make debugging a failure of the test easier e.g. if the first scenario passes and the second fails, it hints towards a problem with the application of delta records while catching up """ # Scenario -- Re-init broker after cleaning up all persistent state node = random.choice(self.kafka.nodes) self.logger.debug("Scenario: kill-clean-start on broker node %s", self.kafka.who_am_i(node)) self.kafka.clean_node(node) self.kafka.start_node(node) # Scenario -- Re-init broker after cleaning up all persistent state # Create some metadata changes for the broker to consume as well. node = random.choice(self.kafka.nodes) self.logger.debug( "Scenario: kill-clean-create_topics-start on broker node %s", self.kafka.who_am_i(node)) self.kafka.clean_node(node) # Now modify the cluster to create more metadata changes self.topics_created += self.create_n_topics(topic_count=10) self.kafka.start_node(node) # Scenario -- Re-init broker after cleaning up all persistent state # And ensure that the broker has replicated the metadata log node = random.choice(self.kafka.nodes) self.logger.debug( "Scenario: kill-clean-start-verify-produce on broker node %s", self.kafka.who_am_i(node)) self.kafka.clean_node(node) self.kafka.start_node(node) # Create a topic where the affected broker must be the leader broker_topic = "%s%d" % (TestSnapshots.TOPIC_NAME_PREFIX, self.topics_created) self.topics_created += 1 self.logger.debug("Creating topic %s" % broker_topic) topic_cfg = { "topic": broker_topic, "replica-assignment": self.kafka.idx(node), "configs": { "min.insync.replicas": 1 } } self.kafka.create_topic(topic_cfg) # Produce to the newly created topic and make sure it works. self.validate_success(broker_topic) @cluster(num_nodes=9) @matrix(metadata_quorum=quorum.all_kraft) def test_controller(self, metadata_quorum=quorum.colocated_kraft): """ Test the ability of controllers to consume metadata snapshots and to recover the cluster metadata state using them The test ensures that that there is atleast one snapshot created on the controller quorum during the setup phase and that at least the first log segment in the metadata log has been marked for deletion, thereby ensuring that any observer of the log needs to always load a snapshot to catch up to the current metadata state. Each scenario is a progression over the previous one. The scenarios build on top of each other by: * Loading a snapshot * Loading and snapshot and some delta records * Loading a snapshot and delta and ensuring that the most recent metadata state has been caught up. Even though a subsequent scenario covers the previous one, they are all left in the test to make debugging a failure of the test easier e.g. if the first scenario passes and the second fails, it hints towards a problem with the application of delta records while catching up """ # Scenario -- Re-init controllers with a clean kafka dir self.logger.debug("Scenario: kill-clean-start controller node") for node in self.controller_nodes: self.logger.debug("Restarting node: %s", self.kafka.controller_quorum.who_am_i(node)) self.kafka.controller_quorum.clean_node(node) self.kafka.controller_quorum.start_node(node) # Scenario -- Re-init controllers with a clean kafka dir and # make metadata changes while they are down. # This will force the entire quorum to load from snapshots # and verify the quorum's ability to catch up to the latest metadata self.logger.debug( "Scenario: kill-clean-create_topics-start on controller node %s") for node in self.controller_nodes: self.logger.debug("Restarting node: %s", self.kafka.controller_quorum.who_am_i(node)) self.kafka.controller_quorum.clean_node(node) # Now modify the cluster to create more metadata changes self.topics_created += self.create_n_topics(topic_count=5) self.kafka.controller_quorum.start_node(node) # Produce to a newly created topic and make sure it works. self.validate_success()
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: { "partitions": 3, "replication-factor": 3, 'configs': {"min.insync.replicas": 2}}}) self.zk.start() #reduce replica.lag.time.max.ms due to KAFKA-2827 self.kafka.replica_lag = 2000 def create_producer_and_consumer(self): self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer( self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = "unique-test-group-" + str(random.random()) def bounce(self): #Sleeps reduce the intermittent failures reported in KAFKA-2891. Should be removed once resolved. for node in self.kafka.nodes: self.kafka.stop_node(node) time.sleep(10) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, upgrade_protocol): self.kafka.interbroker_security_protocol = upgrade_protocol # Roll cluster to include inter broker security protocol. self.kafka.open_port(upgrade_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port('PLAINTEXT') self.bounce() def open_secured_port(self, upgrade_protocol): self.kafka.security_protocol = upgrade_protocol self.kafka.open_port(upgrade_protocol) self.kafka.start_minikdc() self.bounce() @matrix(upgrade_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"]) def test_rolling_upgrade_phase_one(self, upgrade_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. """ self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.security_protocol = "PLAINTEXT" self.kafka.start() #Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, upgrade_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = upgrade_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @matrix(upgrade_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"]) def test_rolling_upgrade_phase_two(self, upgrade_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). Start an Producer and Consumer via the SECURED port Rolling upgrade to add inter-broker be the secure protocol Rolling upgrade again to disable PLAINTEXT Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = upgrade_protocol self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, upgrade_protocol)
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 100000 self.transaction_size = 750 self.first_transactional_id = "my-first-transactional-id" self.second_transactional_id = "my-second-transactional-id" self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, topics={ self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } }) def setUp(self): self.zk.start() def seed_messages(self): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=self.input_topic, message_validator=is_int, max_messages=self.num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_output_topic(self): consumer = self.start_consumer(self.output_topic, group_id="verifying_consumer") return self.drain_consumer(consumer) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self .kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of \ hard-killed broker %s" % str(node.account)) self.kafka.start_node(node) def create_and_start_message_copier(self, input_partition, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=self.input_topic, input_partition=input_partition, output_topic=self.output_topic, max_messages=-1, transaction_size=self.transaction_size) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=30, err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \ % (copier.transactional_id, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self): copiers = [] copiers.append( self.create_and_start_message_copier( input_partition=0, transactional_id=self.first_transactional_id)) copiers.append( self.create_and_start_message_copier( input_partition=1, transactional_id=self.second_transactional_id)) return copiers def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, new_consumer=True, message_validator=is_int, from_beginning=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" %\ 60) return consumer def drain_consumer(self, consumer): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= self.num_seed_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" %\ (len(consumer.messages_consumed[1]), self.num_seed_messages, 90)) consumer.stop() return consumer.messages_consumed[1] def copy_messages_transactionally(self, failure_mode, bounce_target): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers() concurrent_consumer = self.start_consumer( self.output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=60, err_msg="%s - Failed to copy all messages in %ds." %\ (copier.transactional_id, 60)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer) @cluster(num_nodes=9) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"]) def test_transactions(self, failure_mode, bounce_target): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"][ "collect_default"] = True self.kafka.start() input_messages = self.seed_messages() concurrently_consumed_messages = self.copy_messages_transactionally( failure_mode, bounce_target) output_messages = self.get_messages_from_output_topic() concurrently_consumed_message_set = set(concurrently_consumed_messages) output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs( len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\ (len(input_message_set), len(concurrently_consumed_message_set))
class StreamsBrokerDownResilience(Test): """ This test validates that Streams is resilient to a broker being down longer than specified timeouts in configs """ inputTopic = "streamsResilienceSource" outputTopic = "streamsResilienceSink" num_messages = 5 def __init__(self, test_context): super(StreamsBrokerDownResilience, self).__init__(test_context=test_context) self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={ self.inputTopic: { 'partitions': 1, 'replication-factor': 1 }, self.outputTopic: { 'partitions': 1, 'replication-factor': 1 } }) def get_consumer(self): return VerifiableConsumer(self.test_context, 1, self.kafka, self.outputTopic, "stream-broker-resilience-verify-consumer", max_messages=self.num_messages) def get_producer(self): return VerifiableProducer(self.test_context, 1, self.kafka, self.inputTopic, max_messages=self.num_messages, acks=1) def assert_produce_consume(self, test_state): producer = self.get_producer() producer.start() wait_until(lambda: producer.num_acked > 0, timeout_sec=30, err_msg="At %s failed to send messages " % test_state) consumer = self.get_consumer() consumer.start() wait_until( lambda: consumer.total_consumed() > 0, timeout_sec=120, err_msg="At %s streams did not process messages in 120 seconds " % test_state) def setUp(self): self.zk.start() def test_streams_resilient_to_broker_down(self): self.kafka.start() # Consumer max.poll.interval > min(max.block.ms, ((retries + 1) * request.timeout) consumer_poll_ms = "consumer.max.poll.interval.ms=50000" retries_config = "producer.retries=2" request_timeout = "producer.request.timeout.ms=15000" max_block_ms = "producer.max.block.ms=30000" # Broker should be down over 2x of retries * timeout ms # So with (2 * 15000) = 30 seconds, we'll set downtime to 70 seconds broker_down_time_in_seconds = 70 # java code expects configs in key=value,key=value format updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, updated_configs) processor.start() # until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down # After KIP-91 is merged we'll continue to send messages the duration of the test self.assert_produce_consume("before_broker_stop") node = self.kafka.leader(self.inputTopic) self.kafka.stop_node(node) time.sleep(broker_down_time_in_seconds) self.kafka.start_node(node) self.assert_produce_consume("after_broker_stop") self.kafka.stop()
class ZookeeperTlsTest(ProduceConsumeValidateTest): """Tests TLS connectivity to zookeeper. """ def __init__(self, test_context): super(ZookeeperTlsTest, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=3) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int) self.consumer.group_id = self.group def perform_produce_consume_validation(self): self.create_producer_and_consumer() self.run_produce_consume_validate() self.producer.free() self.consumer.free() def enable_zk_tls(self): self.test_context.logger.debug( "Enabling the TLS port in Zookeeper (we won't use it from Kafka yet)" ) # change zk config (enable TLS, but also keep non-TLS) self.zk.zk_client_secure_port = True self.zk.restart_cluster() # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check self.kafka.stop_node(self.kafka.nodes[0]) self.kafka.start_node(self.kafka.nodes[0]) def enable_kafka_zk_tls(self): self.test_context.logger.debug( "Configuring Kafka to use the TLS port in Zookeeper") # change Kafka config (enable TLS to Zookeeper) and restart the Kafka cluster self.kafka.zk_client_secure = True self.kafka.restart_cluster() def disable_zk_non_tls(self): self.test_context.logger.debug( "Disabling the non-TLS port in Zookeeper (as a simple sanity check)" ) # change zk config (disable non-TLS, keep TLS) and restart the ZooKeeper cluster self.zk.zk_client_port = False self.zk.restart_cluster() # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check self.kafka.stop_node(self.kafka.nodes[0]) self.kafka.start_node(self.kafka.nodes[0]) @cluster(num_nodes=9) def test_zk_tls(self): self.zk.start() self.kafka.security_protocol = self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() # Enable TLS port in Zookeeper in addition to the regular non-TLS port # Bounces the ZooKeeper cluster (and a single broker as a sanity check) self.enable_zk_tls() # Leverage ZooKeeper TLS port in Kafka # Bounces the Kafka cluster self.enable_kafka_zk_tls() self.perform_produce_consume_validation() # Disable ZooKeeper non-TLS port to make sure we aren't using it # Bounces the ZooKeeper cluster (and a single broker as a sanity check) self.disable_zk_non_tls() # Make sure the ZooKeeper command line is able to talk to a TLS-enabled ZooKeeper quorum # Test both create() and query(), each of which leverages the ZooKeeper command line # This tests the code in org.apache.zookeeper.ZooKeeperMainWithTlsSupportForKafka path = "/foo" value = "{\"bar\": 0}" self.zk.create(path, value=value) if self.zk.query(path) != value: raise Exception( "Error creating and then querying a znode using the CLI with a TLS-enabled ZooKeeper quorum" ) # Make sure the ConfigCommand CLI is able to talk to a TLS-enabled ZooKeeper quorum # This is necessary for the bootstrap use case despite direct ZooKeeper connectivity being deprecated self.zk.describe(self.topic) # Make sure the AclCommand CLI is able to talk to a TLS-enabled ZooKeeper quorum # This is necessary for the bootstrap use case despite direct ZooKeeper connectivity being deprecated self.zk.list_acls(self.topic) # # Test zookeeper.set.acl with just TLS mutual authentication (no SASL) # # Step 1: run migration tool self.zk.zookeeper_migration(self.zk.nodes[0], "secure") # Step 2: restart brokers with zookeeper.set.acl=true and acls (with TLS but no SASL) self.kafka.zk_set_acl = True self.kafka.restart_cluster() self.perform_produce_consume_validation() # # Test zookeeper.set.acl with both SASL and TLS mutual authentication # # Step 1: remove ACLs created previously self.kafka.zk_set_acl = False self.kafka.restart_cluster() self.zk.zookeeper_migration(self.zk.nodes[0], "unsecure") # Step 2: enable ZooKeeper SASL authentication, but don't take advantage of it in Kafka yet self.zk.zk_sasl = True self.kafka.start_minikdc_if_necessary(self.zk.zk_principals) self.zk.restart_cluster() # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check self.kafka.stop_node(self.kafka.nodes[0]) self.kafka.start_node(self.kafka.nodes[0]) # Step 3: run migration tool self.zk.zookeeper_migration(self.zk.nodes[0], "secure") # Step 4: restart brokers with zookeeper.set.acl=true and acls (with both TLS and SASL) self.kafka.zk_set_acl = True self.kafka.restart_cluster() self.perform_produce_consume_validation()
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 100000 self.transaction_size = 750 self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk) def setUp(self): self.zk.start() def seed_messages(self, topic, num_seed_messages): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic, message_validator=is_int, max_messages=num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_topic(self, topic, num_messages): consumer = self.start_consumer(topic, group_id="verifying_consumer") return self.drain_consumer(consumer, num_messages) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown = True) else: self.kafka.stop_node(node, clean_shutdown = False) wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self.kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of \ hard-killed broker %s" % str(node.account)) self.kafka.start_node(node) def create_and_start_message_copier(self, input_topic, input_partition, output_topic, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=input_topic, input_partition=input_partition, output_topic=output_topic, max_messages=-1, transaction_size=self.transaction_size ) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=30, err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \ % (copier.transactional_id, str(copier.progress_percent()))) self.logger.info("%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self, input_topic, output_topic, num_copiers): copiers = [] for i in range(0, num_copiers): copiers.append(self.create_and_start_message_copier( input_topic=input_topic, output_topic=output_topic, input_partition=i, transactional_id="copier-" + str(i) )) return copiers def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, message_validator=is_int, from_beginning=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" %\ 60) return consumer def drain_consumer(self, consumer, num_messages): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" %\ (len(consumer.messages_consumed[1]), num_messages, 90)) consumer.stop() return consumer.messages_consumed[1] def copy_messages_transactionally(self, failure_mode, bounce_target, input_topic, output_topic, num_copiers, num_messages_to_copy): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers(input_topic=input_topic, output_topic=output_topic, num_copiers=num_copiers) concurrent_consumer = self.start_consumer(output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=120, err_msg="%s - Failed to copy all messages in %ds." %\ (copier.transactional_id, 120)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer, num_messages_to_copy) def setup_topics(self): self.kafka.topics = { self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } } @cluster(num_nodes=9) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"], check_order=[True, False]) def test_transactions(self, failure_mode, bounce_target, check_order): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data_1"]["collect_default"] = True self.kafka.logs["kafka_data_2"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"]["collect_default"] = True if check_order: # To check ordering, we simply create input and output topics # with a single partition. # We reduce the number of seed messages to copy to account for the fewer output # partitions, and thus lower parallelism. This helps keep the test # time shorter. self.num_seed_messages = self.num_seed_messages / 3 self.num_input_partitions = 1 self.num_output_partitions = 1 self.setup_topics() self.kafka.start() input_messages = self.seed_messages(self.input_topic, self.num_seed_messages) concurrently_consumed_messages = self.copy_messages_transactionally( failure_mode, bounce_target, input_topic=self.input_topic, output_topic=self.output_topic, num_copiers=self.num_input_partitions, num_messages_to_copy=self.num_seed_messages) output_messages = self.get_messages_from_topic(self.output_topic, self.num_seed_messages) concurrently_consumed_message_set = set(concurrently_consumed_messages) output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs(len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\ (len(input_message_set), len(concurrently_consumed_message_set)) if check_order: assert input_messages == sorted(input_messages), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.acls = ACLs(self.test_context) self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) self.zk.start() def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int) self.consumer.group_id = "group" def bounce(self): self.kafka.start_minikdc() for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, client_protocol, broker_protocol): # Roll cluster to include inter broker security protocol. self.kafka.setup_interbroker_listener(broker_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port(SecurityConfig.PLAINTEXT) self.set_authorizer_and_bounce(client_protocol, broker_protocol) def set_authorizer_and_bounce(self, client_protocol, broker_protocol): self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(client_protocol, self.kafka, self.topic, self.group) self.acls.set_acls(broker_protocol, self.kafka, self.topic, self.group) self.bounce() def open_secured_port(self, client_protocol): self.kafka.security_protocol = client_protocol self.kafka.open_port(client_protocol) self.kafka.start_minikdc() self.bounce() def add_sasl_mechanism(self, new_client_sasl_mechanism): self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.kafka.start_minikdc() self.bounce() def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism): # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism. self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism self.bounce() # Bounce again with ACLs for new mechanism self.set_authorizer_and_bounce(security_protocol, security_protocol) def add_separate_broker_listener(self, broker_security_protocol, broker_sasl_mechanism): self.kafka.setup_interbroker_listener(broker_security_protocol, True) self.kafka.interbroker_sasl_mechanism = broker_sasl_mechanism # kafka opens interbroker port automatically in start() but not in bounce() self.kafka.open_port(self.kafka.INTERBROKER_LISTENER_NAME) self.bounce() def remove_separate_broker_listener(self, client_security_protocol, client_sasl_mechanism): # separate interbroker listener port will be closed automatically in setup_interbroker_listener # if not using separate interbroker listener self.kafka.setup_interbroker_listener(client_security_protocol, False) self.kafka.interbroker_sasl_mechanism = client_sasl_mechanism self.bounce() @cluster(num_nodes=8) @matrix(client_protocol=[SecurityConfig.SSL]) @cluster(num_nodes=9) @matrix(client_protocol=[ SecurityConfig.SASL_PLAINTEXT, SecurityConfig.SASL_SSL ]) def test_rolling_upgrade_phase_one(self, client_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. """ self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT) self.kafka.security_protocol = SecurityConfig.PLAINTEXT self.kafka.start() # Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, client_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = client_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @cluster(num_nodes=8) @matrix(client_protocol=[ SecurityConfig.SASL_SSL, SecurityConfig.SSL, SecurityConfig.SASL_PLAINTEXT ], broker_protocol=[ SecurityConfig.SASL_SSL, SecurityConfig.SSL, SecurityConfig.SASL_PLAINTEXT ]) def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). A third secure port is also open if inter-broker and client protocols are different. Start a Producer and Consumer via the SECURED client port Incrementally upgrade to add inter-broker be the secure broker protocol Incrementally upgrade again to add ACLs as well as disabling the PLAINTEXT port Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = client_protocol self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT, use_separate_listener=False) self.kafka.open_port(broker_protocol) self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol) @cluster(num_nodes=9) @matrix(new_client_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN]) def test_rolling_upgrade_sasl_mechanism_phase_one( self, new_client_sasl_mechanism): """ Start with a SASL/GSSAPI cluster, add new SASL mechanism, via a rolling upgrade, ensuring we could produce and consume throughout over SASL/GSSAPI. Finally check we can produce and consume using new mechanism. """ self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL, use_separate_listener=False) self.kafka.security_protocol = SecurityConfig.SASL_SSL self.kafka.client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.start() # Create SASL/GSSAPI producer and consumer self.create_producer_and_consumer() # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run self.run_produce_consume_validate(self.add_sasl_mechanism, new_client_sasl_mechanism) # Now we can produce and consume using the new SASL mechanism self.kafka.client_sasl_mechanism = new_client_sasl_mechanism self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @cluster(num_nodes=8) @matrix(new_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN]) def test_rolling_upgrade_sasl_mechanism_phase_two(self, new_sasl_mechanism): """ Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism for clients (i.e. result of phase one). Start Producer and Consumer using the second mechanism Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI Incrementally upgrade again to add ACLs Ensure the producer and consumer run throughout """ #Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients self.kafka.security_protocol = SecurityConfig.SASL_SSL self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL, use_separate_listener=False) self.kafka.client_sasl_mechanism = new_sasl_mechanism self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.start() #Create Producer and Consumer using second mechanism self.create_producer_and_consumer() #Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout self.run_produce_consume_validate(self.roll_in_sasl_mechanism, self.kafka.security_protocol, new_sasl_mechanism) @cluster(num_nodes=9) def test_enable_separate_interbroker_listener(self): """ Start with a cluster that has a single PLAINTEXT listener. Start producing/consuming on PLAINTEXT port. While doing that, do a rolling restart to enable separate secured interbroker port """ self.kafka.security_protocol = SecurityConfig.PLAINTEXT self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT, use_separate_listener=False) self.kafka.start() self.create_producer_and_consumer() self.run_produce_consume_validate(self.add_separate_broker_listener, SecurityConfig.SASL_SSL, SecurityConfig.SASL_MECHANISM_PLAIN) @cluster(num_nodes=9) def test_disable_separate_interbroker_listener(self): """ Start with a cluster that has two listeners, one on SSL (clients), another on SASL_SSL (broker-to-broker). Start producer and consumer on SSL listener. Close dedicated interbroker listener via rolling restart. Ensure we can produce and consume via SSL listener throughout. """ client_protocol = SecurityConfig.SSL client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.security_protocol = client_protocol self.kafka.client_sasl_mechanism = client_sasl_mechanism self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL, use_separate_listener=True) self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI self.kafka.start() # create producer and consumer via client security protocol self.create_producer_and_consumer() # run produce/consume/validate loop while disabling a separate interbroker listener via rolling restart self.run_produce_consume_validate(self.remove_separate_broker_listener, client_protocol, client_sasl_mechanism)
class ZooKeeperSecurityUpgradeTest(ProduceConsumeValidateTest): """Tests a rolling upgrade for zookeeper. """ def __init__(self, test_context): super(ZooKeeperSecurityUpgradeTest, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.acls = ACLs() self.zk = ZookeeperService(self.test_context, num_nodes=3) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: { "partitions": 3, "replication-factor": 3, 'configs': {"min.insync.replicas": 2}}}) def create_producer_and_consumer(self): self.producer = VerifiableProducer( self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer( self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = self.group @property def no_sasl(self): return self.kafka.security_protocol == "PLAINTEXT" or self.kafka.security_protocol == "SSL" @property def is_secure(self): return self.kafka.security_protocol == "SASL_PLAINTEXT" \ or self.kafka.security_protocol == "SSL" \ or self.kafka.security_protocol == "SASL_SSL" def run_zk_migration(self): # change zk config (auth provider + jaas login) self.zk.kafka_opts = self.zk.security_system_properties self.zk.zk_sasl = True if self.no_sasl: self.kafka.start_minikdc(self.zk.zk_principals) # restart zk for node in self.zk.nodes: self.zk.stop_node(node) self.zk.start_node(node) # restart broker with jaas login for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) # run migration tool for node in self.zk.nodes: self.zk.zookeeper_migration(node, "secure") # restart broker with zookeeper.set.acl=true and acls self.kafka.zk_set_acl = "true" for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) @matrix(security_protocol=["PLAINTEXT","SSL","SASL_SSL","SASL_PLAINTEXT"]) def test_zk_security_upgrade(self, security_protocol): self.zk.start() self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol # set acls if self.is_secure: self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(security_protocol, self.kafka, self.zk, self.topic, self.group) if(self.no_sasl): self.kafka.start() else: self.kafka.start(self.zk.zk_principals) #Create Producer and Consumer self.create_producer_and_consumer() #Run upgrade self.run_produce_consume_validate(self.run_zk_migration)
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest): """Tests a rolling upgrade from PLAINTEXT to a secured cluster """ def __init__(self, test_context): super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.zk = ZookeeperService(self.test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) self.zk.start() def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = "unique-test-group-" + str(random.random()) def bounce(self): self.kafka.start_minikdc() for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) time.sleep(10) def roll_in_secured_settings(self, client_protocol, broker_protocol): # Roll cluster to include inter broker security protocol. self.kafka.interbroker_security_protocol = broker_protocol self.kafka.open_port(client_protocol) self.kafka.open_port(broker_protocol) self.bounce() # Roll cluster to disable PLAINTEXT port self.kafka.close_port('PLAINTEXT') self.bounce() def open_secured_port(self, client_protocol): self.kafka.security_protocol = client_protocol self.kafka.open_port(client_protocol) self.kafka.start_minikdc() self.bounce() @matrix(client_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"]) def test_rolling_upgrade_phase_one(self, client_protocol): """ Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade, ensuring we could produce and consume throughout over PLAINTEXT. Finally check we can produce and consume the new secured port. """ self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.security_protocol = "PLAINTEXT" self.kafka.start() # Create PLAINTEXT producer and consumer self.create_producer_and_consumer() # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run self.run_produce_consume_validate(self.open_secured_port, client_protocol) # Now we can produce and consume via the secured port self.kafka.security_protocol = client_protocol self.create_producer_and_consumer() self.run_produce_consume_validate(lambda: time.sleep(1)) @matrix(client_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"], broker_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"]) def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol): """ Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one). Start an Producer and Consumer via the SECURED port Rolling upgrade to add inter-broker be the secure protocol Rolling upgrade again to disable PLAINTEXT Ensure the producer and consumer ran throughout """ #Given we have a broker that has both secure and PLAINTEXT ports open self.kafka.security_protocol = client_protocol self.kafka.interbroker_security_protocol = "PLAINTEXT" self.kafka.start() #Create Secured Producer and Consumer self.create_producer_and_consumer() #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol)
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 20000 self.transaction_size = 500 self.first_transactional_id = "my-first-transactional-id" self.second_transactional_id = "my-second-transactional-id" self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, topics={ self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } }) def setUp(self): self.zk.start() def seed_messages(self): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=self.input_topic, message_validator=is_int, max_messages=self.num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_output_topic(self): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=self.output_topic, new_consumer=True, message_validator=is_int, from_beginning=True, consumer_timeout_ms=5000, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: consumer.alive(consumer.nodes[0]) == True, timeout_sec=60, err_msg="Consumer failed to start for %ds" %\ 60) # wait until the consumer closes, which will be 5 seconds after # receiving the last message. wait_until(lambda: consumer.alive(consumer.nodes[0]) == False, timeout_sec=60, err_msg="Consumer failed to consume %d messages in %ds" %\ (self.num_seed_messages, 60)) return consumer.messages_consumed[1] def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self .kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + 5, err_msg="Failed to see timely deregistration of \ hard-killed broker %s" % str(node.account)) self.kafka.start_node(node) def create_and_start_message_copier(self, input_partition, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=self.input_topic, input_partition=input_partition, output_topic=self.output_topic, max_messages=-1, transaction_size=self.transaction_size) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=30, err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \ % (copier.transactional_id, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self): copiers = [] copiers.append( self.create_and_start_message_copier( input_partition=0, transactional_id=self.first_transactional_id)) copiers.append( self.create_and_start_message_copier( input_partition=1, transactional_id=self.second_transactional_id)) return copiers def copy_messages_transactionally(self, failure_mode, bounce_target): copiers = self.create_and_start_copiers() clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=60, err_msg="%s - Failed to copy all messages in %ds." %\ (copier.transactional_id, 60)) self.logger.info("finished copying messages") @cluster(num_nodes=8) @matrix(failure_mode=["clean_bounce", "hard_bounce"], bounce_target=["brokers", "clients"]) def test_transactions(self, failure_mode, bounce_target): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.start() input_messages = self.seed_messages() self.copy_messages_transactionally(failure_mode, bounce_target) output_messages = self.get_messages_from_output_topic() output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set))
class ReplicaScaleTest(Test): def __init__(self, test_context): super(ReplicaScaleTest, self).__init__(test_context=test_context) self.test_context = test_context self.zk = ZookeeperService(test_context, num_nodes=1) self.kafka = KafkaService(self.test_context, num_nodes=8, zk=self.zk) def setUp(self): self.zk.start() self.kafka.start() def teardown(self): # Need to increase the timeout due to partition count for node in self.kafka.nodes: self.kafka.stop_node(node, clean_shutdown=False, timeout_sec=60) self.kafka.stop() self.zk.stop() @cluster(num_nodes=12) @parametrize(topic_count=500, partition_count=34, replication_factor=3) def test_produce_consume(self, topic_count, partition_count, replication_factor): topics_create_start_time = time.time() for i in range(topic_count): topic = "replicas_produce_consume_%d" % i print("Creating topic %s" % topic) # Force some stdout for Jenkins topic_cfg = { "topic": topic, "partitions": partition_count, "replication-factor": replication_factor, "configs": {"min.insync.replicas": 2} } self.kafka.create_topic(topic_cfg) topics_create_end_time = time.time() self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time)) producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka) consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka) trogdor = TrogdorService(context=self.test_context, client_services=[self.kafka, producer_workload_service, consumer_workload_service]) trogdor.start() produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS, producer_workload_service.producer_node, producer_workload_service.bootstrap_servers, target_messages_per_sec=10000, max_messages=3400000, producer_conf={}, admin_client_conf={}, common_client_conf={}, inactive_topics={}, active_topics={"replicas_produce_consume_[0-2]": { "numPartitions": partition_count, "replicationFactor": replication_factor }}) produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec) produce_workload.wait_for_done(timeout_sec=600) self.logger.info("Completed produce bench") consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS, consumer_workload_service.consumer_node, consumer_workload_service.bootstrap_servers, target_messages_per_sec=10000, max_messages=3400000, consumer_conf={}, admin_client_conf={}, common_client_conf={}, active_topics=["replicas_produce_consume_[0-2]"]) consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec) consume_workload.wait_for_done(timeout_sec=600) self.logger.info("Completed consume bench") trogdor.stop() @cluster(num_nodes=12) @parametrize(topic_count=500, partition_count=34, replication_factor=3) def test_clean_bounce(self, topic_count, partition_count, replication_factor): topics_create_start_time = time.time() for i in range(topic_count): topic = "topic-%04d" % i print("Creating topic %s" % topic) # Force some stdout for Jenkins topic_cfg = { "topic": topic, "partitions": partition_count, "replication-factor": replication_factor, "configs": {"min.insync.replicas": 2} } self.kafka.create_topic(topic_cfg) topics_create_end_time = time.time() self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time)) restart_times = [] for node in self.kafka.nodes: broker_bounce_start_time = time.time() self.kafka.stop_node(node, clean_shutdown=True, timeout_sec=600) self.kafka.start_node(node, timeout_sec=600) broker_bounce_end_time = time.time() restart_times.append(broker_bounce_end_time - broker_bounce_start_time) self.logger.info("Time to restart %s: %d" % (node.name, broker_bounce_end_time - broker_bounce_start_time)) self.logger.info("Restart times: %s" % restart_times) delete_start_time = time.time() for i in range(topic_count): topic = "topic-%04d" % i self.logger.info("Deleting topic %s" % topic) self.kafka.delete_topic(topic) delete_end_time = time.time() self.logger.info("Time to delete topics: %d" % (delete_end_time - delete_start_time))
class ZooKeeperSecurityUpgradeTest(ProduceConsumeValidateTest): """Tests a rolling upgrade for zookeeper. """ def __init__(self, test_context): super(ZooKeeperSecurityUpgradeTest, self).__init__(test_context=test_context) def setUp(self): self.topic = "test_topic" self.group = "group" self.producer_throughput = 100 self.num_producers = 1 self.num_consumers = 1 self.acls = ACLs() self.zk = ZookeeperService(self.test_context, num_nodes=3) self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={ self.topic: { "partitions": 3, "replication-factor": 3, 'configs': { "min.insync.replicas": 2 } } }) def create_producer_and_consumer(self): self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput) self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True) self.consumer.group_id = self.group @property def no_sasl(self): return self.kafka.security_protocol == "PLAINTEXT" or self.kafka.security_protocol == "SSL" @property def is_secure(self): return self.kafka.security_protocol == "SASL_PLAINTEXT" \ or self.kafka.security_protocol == "SSL" \ or self.kafka.security_protocol == "SASL_SSL" def run_zk_migration(self): # change zk config (auth provider + jaas login) self.zk.kafka_opts = self.zk.security_system_properties self.zk.zk_sasl = True if self.no_sasl: self.kafka.start_minikdc(self.zk.zk_principals) # restart zk for node in self.zk.nodes: self.zk.stop_node(node) self.zk.start_node(node) # restart broker with jaas login for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) # run migration tool for node in self.zk.nodes: self.zk.zookeeper_migration(node, "secure") # restart broker with zookeeper.set.acl=true and acls self.kafka.zk_set_acl = "true" for node in self.kafka.nodes: self.kafka.stop_node(node) self.kafka.start_node(node) @matrix( security_protocol=["PLAINTEXT", "SSL", "SASL_SSL", "SASL_PLAINTEXT"]) def test_zk_security_upgrade(self, security_protocol): self.zk.start() self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol # set acls if self.is_secure: self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER self.acls.set_acls(security_protocol, self.kafka, self.zk, self.topic, self.group) if (self.no_sasl): self.kafka.start() else: self.kafka.start(self.zk.zk_principals) #Create Producer and Consumer self.create_producer_and_consumer() #Run upgrade self.run_produce_consume_validate(self.run_zk_migration)
class TransactionsTest(Test): """Tests transactions by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. In the end we verify that the final output topic contains exactly one committed copy of each message in the input topic. """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(TransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 2 self.num_output_partitions = 3 self.num_seed_messages = 100000 self.transaction_size = 750 # The transaction timeout should be lower than the progress timeout, but at # least as high as the request timeout (which is 30s by default). When the # client is hard-bounced, progress may depend on the previous transaction # being aborted. When the broker is hard-bounced, we may have to wait as # long as the request timeout to get a `Produce` response and we do not # want the coordinator timing out the transaction. self.transaction_timeout = 40000 self.progress_timeout_sec = 60 self.consumer_group = "transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, controller_num_nodes_override=1) def setUp(self): if self.zk: self.zk.start() def seed_messages(self, topic, num_seed_messages): seed_timeout_sec = 10000 seed_producer = VerifiableProducer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic, message_validator=is_int, max_messages=num_seed_messages, enable_idempotence=True) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." %\ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked def get_messages_from_topic(self, topic, num_messages): consumer = self.start_consumer(topic, group_id="verifying_consumer") return self.drain_consumer(consumer, num_messages) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) gracePeriodSecs = 5 if self.zk: wait_until( lambda: len(self.kafka.pids( node)) == 0 and not self.kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + gracePeriodSecs, err_msg= "Failed to see timely deregistration of hard-killed broker %s" % str(node.account)) else: brokerSessionTimeoutSecs = 18 wait_until( lambda: len(self.kafka.pids(node)) == 0, timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs, err_msg= "Failed to see timely disappearance of process for hard-killed broker %s" % str(node.account)) time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs) self.kafka.start_node(node) def create_and_start_message_copier(self, input_topic, input_partition, output_topic, transactional_id, use_group_metadata): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=input_topic, input_partition=input_partition, output_topic=output_topic, max_messages=-1, transaction_size=self.transaction_size, transaction_timeout=self.transaction_timeout, use_group_metadata=use_group_metadata) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=self.progress_timeout_sec, err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \ % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self, input_topic, output_topic, num_copiers, use_group_metadata): copiers = [] for i in range(0, num_copiers): copiers.append( self.create_and_start_message_copier( input_topic=input_topic, output_topic=output_topic, input_partition=i, transactional_id="copier-" + str(i), use_group_metadata=use_group_metadata)) return copiers def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer(context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, message_validator=is_int, from_beginning=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" %\ 60) return consumer def drain_consumer(self, consumer, num_messages): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" %\ (len(consumer.messages_consumed[1]), num_messages, 90)) consumer.stop() return consumer.messages_consumed[1] def copy_messages_transactionally(self, failure_mode, bounce_target, input_topic, output_topic, num_copiers, num_messages_to_copy, use_group_metadata): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers( input_topic=input_topic, output_topic=output_topic, num_copiers=num_copiers, use_group_metadata=use_group_metadata) concurrent_consumer = self.start_consumer( output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) copier_timeout_sec = 120 for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=copier_timeout_sec, err_msg="%s - Failed to copy all messages in %ds." %\ (copier.transactional_id, copier_timeout_sec)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer, num_messages_to_copy) def setup_topics(self): self.kafka.topics = { self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } } @cluster(num_nodes=9) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"], check_order=[True, False], use_group_metadata=[True, False]) def test_transactions(self, failure_mode, bounce_target, check_order, use_group_metadata, metadata_quorum=quorum.all): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data_1"]["collect_default"] = True self.kafka.logs["kafka_data_2"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"][ "collect_default"] = True if check_order: # To check ordering, we simply create input and output topics # with a single partition. # We reduce the number of seed messages to copy to account for the fewer output # partitions, and thus lower parallelism. This helps keep the test # time shorter. self.num_seed_messages = self.num_seed_messages // 3 self.num_input_partitions = 1 self.num_output_partitions = 1 self.setup_topics() self.kafka.start() input_messages = self.seed_messages(self.input_topic, self.num_seed_messages) concurrently_consumed_messages = self.copy_messages_transactionally( failure_mode, bounce_target, input_topic=self.input_topic, output_topic=self.output_topic, num_copiers=self.num_input_partitions, num_messages_to_copy=self.num_seed_messages, use_group_metadata=use_group_metadata) output_messages = self.get_messages_from_topic(self.output_topic, self.num_seed_messages) concurrently_consumed_message_set = set(concurrently_consumed_messages) output_message_set = set(output_messages) input_message_set = set(input_messages) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs( len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\ (len(input_message_set), len(concurrently_consumed_message_set)) if check_order: assert input_messages == sorted( input_messages ), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
class GroupModeTransactionsTest(Test): """Essentially testing the same functionality as TransactionsTest by transactionally copying data from a source topic to a destination topic and killing the copy process as well as the broker randomly through the process. The major difference is that we choose to work as a collaborated group with same topic subscription instead of individual copiers. In the end we verify that the final output topic contains exactly one committed copy of each message from the original producer. """ def __init__(self, test_context): """:type test_context: ducktape.tests.test.TestContext""" super(GroupModeTransactionsTest, self).__init__(test_context=test_context) self.input_topic = "input-topic" self.output_topic = "output-topic" self.num_brokers = 3 # Test parameters self.num_input_partitions = 9 self.num_output_partitions = 9 self.num_copiers = 3 self.num_seed_messages = 100000 self.transaction_size = 750 # The transaction timeout should be lower than the progress timeout, but at # least as high as the request timeout (which is 30s by default). When the # client is hard-bounced, progress may depend on the previous transaction # being aborted. When the broker is hard-bounced, we may have to wait as # long as the request timeout to get a `Produce` response and we do not # want the coordinator timing out the transaction. self.transaction_timeout = 40000 self.progress_timeout_sec = 60 self.consumer_group = "grouped-transactions-test-consumer-group" self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test( test_context) == quorum.zk else None self.kafka = KafkaService(test_context, num_nodes=self.num_brokers, zk=self.zk, controller_num_nodes_override=1) def setUp(self): if self.zk: self.zk.start() def seed_messages(self, topic, num_seed_messages): seed_timeout_sec = 10000 seed_producer = VerifiableProducer( context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic, message_validator=is_int, max_messages=num_seed_messages, enable_idempotence=True, repeating_keys=self.num_input_partitions) seed_producer.start() wait_until(lambda: seed_producer.num_acked >= num_seed_messages, timeout_sec=seed_timeout_sec, err_msg="Producer failed to produce messages %d in %ds." % \ (self.num_seed_messages, seed_timeout_sec)) return seed_producer.acked_by_partition def get_messages_from_topic(self, topic, num_messages): consumer = self.start_consumer(topic, group_id="verifying_consumer") return self.drain_consumer(consumer, num_messages) def bounce_brokers(self, clean_shutdown): for node in self.kafka.nodes: if clean_shutdown: self.kafka.restart_node(node, clean_shutdown=True) else: self.kafka.stop_node(node, clean_shutdown=False) gracePeriodSecs = 5 if self.zk: wait_until( lambda: len(self.kafka.pids( node)) == 0 and not self.kafka.is_registered(node), timeout_sec=self.kafka.zk_session_timeout + gracePeriodSecs, err_msg= "Failed to see timely deregistration of hard-killed broker %s" % str(node.account)) else: brokerSessionTimeoutSecs = 18 wait_until( lambda: len(self.kafka.pids(node)) == 0, timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs, err_msg= "Failed to see timely disappearance of process for hard-killed broker %s" % str(node.account)) time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs) self.kafka.start_node(node) def create_and_start_message_copier(self, input_topic, output_topic, transactional_id): message_copier = TransactionalMessageCopier( context=self.test_context, num_nodes=1, kafka=self.kafka, transactional_id=transactional_id, consumer_group=self.consumer_group, input_topic=input_topic, input_partition=-1, output_topic=output_topic, max_messages=-1, transaction_size=self.transaction_size, transaction_timeout=self.transaction_timeout, use_group_metadata=True, group_mode=True) message_copier.start() wait_until(lambda: message_copier.alive(message_copier.nodes[0]), timeout_sec=10, err_msg="Message copier failed to start after 10 s") return message_copier def bounce_copiers(self, copiers, clean_shutdown, timeout_sec=240): for _ in range(3): for copier in copiers: wait_until(lambda: copier.progress_percent() >= 20.0, timeout_sec=self.progress_timeout_sec, err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \ % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent()))) self.logger.info( "%s - progress: %s" % (copier.transactional_id, str(copier.progress_percent()))) copier.restart(clean_shutdown) def create_and_start_copiers(self, input_topic, output_topic, num_copiers): copiers = [] for i in range(0, num_copiers): copiers.append( self.create_and_start_message_copier( input_topic=input_topic, output_topic=output_topic, transactional_id="copier-" + str(i))) return copiers @staticmethod def valid_value_and_partition(msg): """Method used to check whether the given message is a valid tab separated value + partition return value and partition as a size-two array represented tuple: [value, partition] """ try: splitted_msg = msg.split('\t') value = int(splitted_msg[1]) partition = int(splitted_msg[0].split(":")[1]) return [value, partition] except ValueError: raise Exception( "Unexpected message format (expected a tab separated [value, partition] tuple). Message: %s" % (msg)) def start_consumer(self, topic_to_read, group_id): consumer = ConsoleConsumer( context=self.test_context, num_nodes=1, kafka=self.kafka, topic=topic_to_read, group_id=group_id, message_validator=self.valid_value_and_partition, from_beginning=True, print_partition=True, isolation_level="read_committed") consumer.start() # ensure that the consumer is up. wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True, timeout_sec=60, err_msg="Consumer failed to consume any messages for %ds" % \ 60) return consumer @staticmethod def split_by_partition(messages_consumed): messages_by_partition = {} for msg in messages_consumed: partition = msg[1] if partition not in messages_by_partition: messages_by_partition[partition] = [] messages_by_partition[partition].append(msg[0]) return messages_by_partition def drain_consumer(self, consumer, num_messages): # wait until we read at least the expected number of messages. # This is a safe check because both failure modes will be caught: # 1. If we have 'num_seed_messages' but there are duplicates, then # this is checked for later. # # 2. If we never reach 'num_seed_messages', then this will cause the # test to fail. wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages, timeout_sec=90, err_msg="Consumer consumed only %d out of %d messages in %ds" % \ (len(consumer.messages_consumed[1]), num_messages, 90)) consumer.stop() return self.split_by_partition(consumer.messages_consumed[1]) def copy_messages_transactionally(self, failure_mode, bounce_target, input_topic, output_topic, num_copiers, num_messages_to_copy): """Copies messages transactionally from the seeded input topic to the output topic, either bouncing brokers or clients in a hard and soft way as it goes. This method also consumes messages in read_committed mode from the output topic while the bounces and copy is going on. It returns the concurrently consumed messages. """ copiers = self.create_and_start_copiers(input_topic=input_topic, output_topic=output_topic, num_copiers=num_copiers) concurrent_consumer = self.start_consumer( output_topic, group_id="concurrent_consumer") clean_shutdown = False if failure_mode == "clean_bounce": clean_shutdown = True if bounce_target == "brokers": self.bounce_brokers(clean_shutdown) elif bounce_target == "clients": self.bounce_copiers(copiers, clean_shutdown) copier_timeout_sec = 240 for copier in copiers: wait_until(lambda: copier.is_done, timeout_sec=copier_timeout_sec, err_msg="%s - Failed to copy all messages in %ds." % \ (copier.transactional_id, copier_timeout_sec)) self.logger.info("finished copying messages") return self.drain_consumer(concurrent_consumer, num_messages_to_copy) def setup_topics(self): self.kafka.topics = { self.input_topic: { "partitions": self.num_input_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } }, self.output_topic: { "partitions": self.num_output_partitions, "replication-factor": 3, "configs": { "min.insync.replicas": 2 } } } @cluster(num_nodes=10) @matrix(failure_mode=["hard_bounce", "clean_bounce"], bounce_target=["brokers", "clients"]) def test_transactions(self, failure_mode, bounce_target, metadata_quorum=quorum.zk): security_protocol = 'PLAINTEXT' self.kafka.security_protocol = security_protocol self.kafka.interbroker_security_protocol = security_protocol self.kafka.logs["kafka_data_1"]["collect_default"] = True self.kafka.logs["kafka_data_2"]["collect_default"] = True self.kafka.logs["kafka_operational_logs_debug"][ "collect_default"] = True self.setup_topics() self.kafka.start() input_messages_by_partition = self.seed_messages( self.input_topic, self.num_seed_messages) concurrently_consumed_message_by_partition = self.copy_messages_transactionally( failure_mode, bounce_target, input_topic=self.input_topic, output_topic=self.output_topic, num_copiers=self.num_copiers, num_messages_to_copy=self.num_seed_messages) output_messages_by_partition = self.get_messages_from_topic( self.output_topic, self.num_seed_messages) assert len(input_messages_by_partition) == \ len(concurrently_consumed_message_by_partition), "The lengths of partition count doesn't match: " \ "input partitions count %d, " \ "concurrently consumed partitions count %d" % \ (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition)) assert len(input_messages_by_partition) == \ len(output_messages_by_partition), "The lengths of partition count doesn't match: " \ "input partitions count %d, " \ "output partitions count %d" % \ (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition)) for p in range(self.num_input_partitions): if p not in input_messages_by_partition: continue assert p in output_messages_by_partition, "Partition %d not in output messages" assert p in concurrently_consumed_message_by_partition, "Partition %d not in concurrently consumed messages" output_message_set = set(output_messages_by_partition[p]) input_message_set = set(input_messages_by_partition[p]) concurrently_consumed_message_set = set( concurrently_consumed_message_by_partition[p]) num_dups = abs(len(output_messages) - len(output_message_set)) num_dups_in_concurrent_consumer = abs( len(concurrently_consumed_messages) - len(concurrently_consumed_message_set)) assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" % \ (len(input_message_set), len(output_message_set)) assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer assert input_message_set == concurrently_consumed_message_set, \ "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" % \ (len(input_message_set), len(concurrently_consumed_message_set)) assert input_messages == sorted( input_messages ), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"