class KafkaVersionTest(Test):
    """Sanity checks on kafka versioning."""

    def __init__(self, test_context):
        super(KafkaVersionTest, self).__init__(test_context)

        self.topic = "topic"
        # A single ZooKeeper node suffices for these 1- and 2-broker checks.
        self.zk = ZookeeperService(test_context, num_nodes=1)

    def setUp(self):
        # Kafka is created per-test so each test can pin node versions before start().
        self.zk.start()

    @cluster(num_nodes=2)
    def test_0_8_2(self):
        """Test kafka service node-versioning api - verify that we can bring up a single-node 0.8.2.X cluster."""
        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})
        node = self.kafka.nodes[0]
        # Pin the broker version before start() so the 0.8.2.X binaries are launched.
        node.version = LATEST_0_8_2
        self.kafka.start()
        assert is_version(node, [LATEST_0_8_2], logger=self.logger)

    @cluster(num_nodes=3)
    def test_multi_version(self):
        """Test kafka service node-versioning api - ensure we can bring up a 2-node cluster, one on version
        0.8.2.X, the other on the current development branch."""
        self.kafka = KafkaService(self.test_context, num_nodes=2, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 2}})
        self.kafka.nodes[1].version = LATEST_0_8_2
        # The old broker must use the older inter-broker protocol to coexist with
        # the development-branch broker in the same cluster.
        self.kafka.nodes[1].config[config_property.INTER_BROKER_PROTOCOL_VERSION] = "0.8.2.X"
        self.kafka.start()

        assert is_version(self.kafka.nodes[0], [DEV_BRANCH.vstring], logger=self.logger)
        assert is_version(self.kafka.nodes[1], [LATEST_0_8_2], logger=self.logger)
class ProduceBenchTest(Test):
    """Runs a Trogdor ProduceBench workload against a 3-node Kafka cluster and logs the task status."""

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])

    def setUp(self):
        # NOTE(review): trogdor is started before its client services (kafka, the
        # workload nodes); this presumably works because tasks are only scheduled
        # later, inside the test — confirm trogdor agents tolerate kafka being down
        # at startup before relying on this ordering elsewhere.
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        # Stop in reverse dependency order: workload coordinator first, ZK last.
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        # Active topics receive produced messages; inactive topics are created but unused.
        active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000,
                                        max_messages=100000,
                                        producer_conf={},
                                        inactive_topics=inactive_topics,
                                        active_topics=active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        # Block until the task completes (up to 6 minutes).
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
class GetOffsetShellTest(Test):
    """
    Tests GetOffsetShell tool
    """
    def __init__(self, test_context):
        super(GetOffsetShellTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.messages_received_count = 0
        self.topics = {
            TOPIC: {'partitions': NUM_PARTITIONS, 'replication-factor': REPLICATION_FACTOR}
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        # Creates and starts the broker; called from the test so each run can pick protocols.
        self.kafka = KafkaService(
            self.test_context, self.num_brokers,
            self.zk, security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol, topics=self.topics)
        self.kafka.start()

    def start_producer(self):
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka, topic=TOPIC,
                                           throughput=1000, max_messages=MAX_MESSAGES)
        self.producer.start()
        current_acked = self.producer.num_acked
        wait_until(lambda: self.producer.num_acked >= current_acked + MAX_MESSAGES, timeout_sec=10,
                   err_msg="Timeout awaiting messages to be produced and acked")

    def start_consumer(self):
        self.consumer = ConsoleConsumer(self.test_context, num_nodes=self.num_brokers, kafka=self.kafka, topic=TOPIC,
                                        consumer_timeout_ms=1000)
        self.consumer.start()

    @cluster(num_nodes=4)
    def test_get_offset_shell(self, security_protocol='PLAINTEXT'):
        """
        Tests if GetOffsetShell is getting offsets correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer()

        # Assert that offset fetched without any consumers consuming is 0
        # NOTE(review): as written this only asserts that the shell output is truthy;
        # the "topic:partition:offset" string after the comma is the assert *message*,
        # not a containment check like the wait_until below. Confirm whether an
        # `in`-membership check was intended here.
        assert self.kafka.get_offset_shell(TOPIC, None, 1000, 1, -1), "%s:%s:%s" % (TOPIC, NUM_PARTITIONS - 1, 0)

        self.start_consumer()

        node = self.consumer.nodes[0]
        wait_until(lambda: self.consumer.alive(node), timeout_sec=20, backoff_sec=.2,
                   err_msg="Consumer was too slow to start")

        # Assert that offset is correctly indicated by GetOffsetShell tool
        wait_until(lambda: "%s:%s:%s" % (TOPIC, NUM_PARTITIONS - 1, MAX_MESSAGES) in
                   self.kafka.get_offset_shell(TOPIC, None, 1000, 1, -1),
                   timeout_sec=10, err_msg="Timed out waiting to reach expected offset.")
class CamusTest(Test):
    """Base class that provisions the full Camus stack:
    ZooKeeper, Kafka, Hadoop, Schema Registry and the Kafka REST proxy.
    """

    def __init__(self, test_context, num_zk, num_brokers, num_hadoop, num_schema_registry, num_rest,
                 hadoop_distro='cdh', hadoop_version=2, topics=None):
        super(CamusTest, self).__init__(test_context)
        self.num_zk = num_zk
        self.num_brokers = num_brokers
        self.num_hadoop = num_hadoop
        self.num_schema_registry = num_schema_registry
        self.num_rest = num_rest
        self.topics = topics
        self.hadoop_distro = hadoop_distro
        self.hadoop_version = hadoop_version

        self.zk = ZookeeperService(test_context, self.num_zk)
        self.kafka = KafkaService(test_context, self.num_brokers, self.zk, topics=self.topics)
        # hadoop_distro/hadoop_version select which Hadoop flavor the factory builds ('cdh' 2 by default).
        self.hadoop = create_hadoop_service(test_context, self.num_hadoop, self.hadoop_distro, self.hadoop_version)
        self.schema_registry = SchemaRegistryService(test_context, self.num_schema_registry, self.zk, self.kafka)
        self.rest = KafkaRestService(test_context, self.num_rest, self.zk, self.kafka, self.schema_registry)

    def setUp(self):
        # Start services in dependency order.
        self.zk.start()
        self.kafka.start()
        self.hadoop.start()
        self.schema_registry.start()
        self.rest.start()
class CompressionTest(ProduceConsumeValidateTest):
    """
    These tests validate produce / consume for compressed topics.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(CompressionTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={self.topic: {
            "partitions": 10,
            "replication-factor": 1}})
        self.num_partitions = 10
        self.timeout_sec = 60
        self.producer_throughput = 1000
        self.num_producers = 4
        self.messages_per_producer = 1000
        self.num_consumers = 1

    def setUp(self):
        # Kafka is started in the test body, after its security protocol is chosen.
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(CompressionTest, self).min_cluster_size() + self.num_producers + self.num_consumers

    @parametrize(compression_types=["snappy","gzip","lz4","none"], new_consumer=True)
    @parametrize(compression_types=["snappy","gzip","lz4","none"], new_consumer=False)
    def test_compressed_topic(self, compression_types, new_consumer):
        """Test produce => consume => validate for compressed topics
        Setup: 1 zk, 1 kafka node, 1 topic with partitions=10, replication-factor=1

        compression_types parameter gives a list of compression types (or no compression if "none").
        Each producer in a VerifiableProducer group (num_producers = 4) will use a compression type
        from the list based on producer's index in the group.

        - Produce messages in the background
        - Consume messages in the background
        - Stop producing, and finish consuming
        - Validate that every acked message was consumed
        """
        self.kafka.security_protocol = "PLAINTEXT"
        self.kafka.interbroker_security_protocol = self.kafka.security_protocol
        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic,
                                           throughput=self.producer_throughput,
                                           message_validator=is_int_with_prefix,
                                           compression_types=compression_types)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic,
                                        new_consumer=new_consumer, consumer_timeout_ms=60000,
                                        message_validator=is_int_with_prefix)
        self.kafka.start()
        self.run_produce_consume_validate(lambda: wait_until(
            lambda: self.producer.each_produced_at_least(self.messages_per_producer) == True,
            timeout_sec=120, backoff_sec=1,
            err_msg="Producer did not produce all messages in reasonable amount of time"))
class ClientCompatibilityTestNewBroker(ProduceConsumeValidateTest):
    """Validates that clients of assorted older versions still produce/consume
    correctly against brokers running the current development branch."""

    def __init__(self, test_context):
        super(ClientCompatibilityTestNewBroker, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1
        self.messages_per_producer = 1000

    @cluster(num_nodes=6)
    @parametrize(producer_version=str(DEV_BRANCH), consumer_version=str(DEV_BRANCH), compression_types=["snappy"], timestamp_type=str("LogAppendTime"))
    @parametrize(producer_version=str(DEV_BRANCH), consumer_version=str(DEV_BRANCH), compression_types=["none"], timestamp_type=str("LogAppendTime"))
    @parametrize(producer_version=str(DEV_BRANCH), consumer_version=str(LATEST_0_9), compression_types=["none"], new_consumer=False, timestamp_type=None)
    @parametrize(producer_version=str(DEV_BRANCH), consumer_version=str(LATEST_0_9), compression_types=["snappy"], timestamp_type=str("CreateTime"))
    @parametrize(producer_version=str(LATEST_1_1), consumer_version=str(LATEST_1_1), compression_types=["lz4"], timestamp_type=str("CreateTime"))
    @parametrize(producer_version=str(LATEST_1_0), consumer_version=str(LATEST_1_0), compression_types=["none"], timestamp_type=str("CreateTime"))
    @parametrize(producer_version=str(LATEST_0_11_0), consumer_version=str(LATEST_0_11_0), compression_types=["gzip"], timestamp_type=str("CreateTime"))
    @parametrize(producer_version=str(LATEST_0_10_2), consumer_version=str(LATEST_0_10_2), compression_types=["lz4"], timestamp_type=str("CreateTime"))
    @parametrize(producer_version=str(LATEST_0_10_1), consumer_version=str(LATEST_0_10_1), compression_types=["snappy"], timestamp_type=str("LogAppendTime"))
    @parametrize(producer_version=str(LATEST_0_10_0), consumer_version=str(LATEST_0_10_0), compression_types=["snappy"], timestamp_type=str("LogAppendTime"))
    @parametrize(producer_version=str(LATEST_0_9), consumer_version=str(DEV_BRANCH), compression_types=["none"], timestamp_type=None)
    @parametrize(producer_version=str(LATEST_0_9), consumer_version=str(DEV_BRANCH), compression_types=["snappy"], timestamp_type=None)
    @parametrize(producer_version=str(LATEST_0_9), consumer_version=str(LATEST_0_9), compression_types=["snappy"], timestamp_type=str("LogAppendTime"))
    @parametrize(producer_version=str(LATEST_0_8_2), consumer_version=str(LATEST_0_8_2), compression_types=["none"], new_consumer=False, timestamp_type=None)
    def test_compatibility(self, producer_version, consumer_version, compression_types, new_consumer=True, timestamp_type=None):
        """Produce with `producer_version`, consume with `consumer_version`, against DEV_BRANCH brokers,
        then validate that every acked message was consumed."""
        # Brokers always run the development branch; only client versions vary per parametrization.
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=DEV_BRANCH, topics={self.topic: {
            "partitions": 3,
            "replication-factor": 3,
            'configs': {"min.insync.replicas": 2}}})
        for node in self.kafka.nodes:
            if timestamp_type is not None:
                node.config[config_property.MESSAGE_TIMESTAMP_TYPE] = timestamp_type
        self.kafka.start()

        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic,
                                           throughput=self.producer_throughput,
                                           message_validator=is_int,
                                           compression_types=compression_types,
                                           version=KafkaVersion(producer_version))
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic,
                                        consumer_timeout_ms=30000, new_consumer=new_consumer,
                                        message_validator=is_int, version=KafkaVersion(consumer_version))

        self.run_produce_consume_validate(lambda: wait_until(
            lambda: self.producer.each_produced_at_least(self.messages_per_producer) == True,
            timeout_sec=120, backoff_sec=1,
            err_msg="Producer did not produce all messages in reasonable amount of time"))
class Log4jAppenderTest(Test):
    """
    Tests KafkaLog4jAppender using VerifiableKafkaLog4jAppender that appends increasing ints to a Kafka topic
    """
    def __init__(self, test_context):
        super(Log4jAppenderTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1

        self.topics = {
            TOPIC: {'partitions': 1, 'replication-factor': 1}
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        # Created here (not in __init__) so the parametrized security protocol can be applied.
        self.kafka = KafkaService(
            self.test_context, self.num_brokers,
            self.zk, security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol, topics=self.topics)
        self.kafka.start()

    def start_appender(self, security_protocol):
        self.appender = KafkaLog4jAppender(self.test_context, self.num_brokers, self.kafka, TOPIC, MAX_MESSAGES,
                                           security_protocol=security_protocol)
        self.appender.start()

    def start_consumer(self, security_protocol):
        # New consumer is enabled only for SSL runs — presumably because the old
        # consumer path does not support SSL; confirm against ConsoleConsumer.
        enable_new_consumer = security_protocol == SecurityConfig.SSL
        self.consumer = ConsoleConsumer(self.test_context, num_nodes=self.num_brokers, kafka=self.kafka, topic=TOPIC,
                                        consumer_timeout_ms=1000, new_consumer=enable_new_consumer)
        self.consumer.start()

    @matrix(security_protocol=['PLAINTEXT', 'SSL'])
    def test_log4j_appender(self, security_protocol='PLAINTEXT'):
        """
        Tests if KafkaLog4jAppender is producing to Kafka topic
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_appender(security_protocol)
        self.appender.wait()

        self.start_consumer(security_protocol)
        node = self.consumer.nodes[0]

        wait_until(lambda: self.consumer.alive(node), timeout_sec=10, backoff_sec=.2,
                   err_msg="Consumer was too slow to start")

        # Verify consumed messages count
        expected_lines_count = MAX_MESSAGES * 2  # two times to account for new lines introduced by log4j
        wait_until(lambda: len(self.consumer.messages_consumed[1]) == expected_lines_count, timeout_sec=10,
                   err_msg="Timed out waiting to consume expected number of messages.")

        self.consumer.stop()
class MiniTest(Test):
    """Smallest possible smoke test: boot one ZooKeeper node and one Kafka broker."""

    def __init__(self, test_context):
        super(MiniTest, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, 1)
        self.kafka = KafkaService(test_context, 1, self.zk)

    def test(self):
        # Bring the services up in dependency order: ZooKeeper, then Kafka.
        for service in (self.zk, self.kafka):
            service.start()
class ClientCompatibilityFeaturesTest(Test):
    """
    Tests clients for the presence or absence of specific features when communicating
    with brokers with various versions. Relies on ClientCompatibilityTest.java for much
    of the functionality.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ClientCompatibilityFeaturesTest, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)

        # Generate a unique topic name so repeated runs never collide.
        topic_name = "client_compat_features_topic_%d%d" % (int(time.time()), randint(0, 2147483647))
        self.topics = {
            topic_name: {
                "partitions": 1,  # Use only one partition to avoid worrying about ordering
                "replication-factor": 3
            }}
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk, topics=self.topics)

    def invoke_compatibility_program(self, features):
        """Run ClientCompatibilityTest.java against the running cluster.

        :param features: mapping of feature-flag name -> expected value; each entry
                         is forwarded as a "--<name> <value>" CLI argument.
        :raises Exception: re-raised when the remote command fails (log path is logged first).
        """
        # Run the compatibility test on the first Kafka node.
        node = self.zk.nodes[0]
        # Bug fix: dict.keys()[0] is Python-2-only (keys() is a view in Python 3);
        # next(iter(...)) grabs the single topic name portably.
        topic = next(iter(self.topics))
        cmd = ("%s org.apache.kafka.tools.ClientCompatibilityTest "
               "--bootstrap-server %s "
               "--num-cluster-nodes %d "
               "--topic %s " % (self.zk.path.script("kafka-run-class.sh", node),
                                self.kafka.bootstrap_servers(),
                                len(self.kafka.nodes),
                                topic))
        # Bug fix: dict.iteritems() was removed in Python 3; items() works on both.
        for k, v in features.items():
            cmd = cmd + ("--%s %s " % (k, v))
        results_dir = TestContext.results_dir(self.test_context, 0)
        os.makedirs(results_dir)
        ssh_log_file = "%s/%s" % (results_dir, "client_compatibility_test_output.txt")
        try:
            self.logger.info("Running %s" % cmd)
            run_command(node, cmd, ssh_log_file)
        except Exception as e:
            self.logger.info("** Command failed. See %s for log messages." % ssh_log_file)
            raise

    @parametrize(broker_version=str(DEV_BRANCH))
    @parametrize(broker_version=str(LATEST_0_10_0))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_11_0))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_1_1))
    def run_compatibility_test(self, broker_version):
        """Boot a cluster at `broker_version` and verify the expected client feature set."""
        self.zk.start()
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()
        features = get_broker_features(broker_version)
        self.invoke_compatibility_program(features)
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""

    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})

        self.num_messages = 1000
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=self.num_messages, throughput=self.num_messages/5)

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    @cluster(num_nodes=3)
    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(LATEST_0_10_0))
    @parametrize(producer_version=str(LATEST_0_10_1))
    @parametrize(producer_version=str(DEV_BRANCH))
    def test_simple_run(self, producer_version=DEV_BRANCH):
        """
        Test that we can start VerifiableProducer on the current branch snapshot version or against
        the 0.8.2 jar, and verify that we can produce a small number of messages.
        """
        node = self.producer.nodes[0]
        node.version = KafkaVersion(producer_version)
        self.producer.start()
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=5,
                   err_msg="Producer failed to start in a reasonable amount of time.")

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with DEV_BRANCH
        # When running VerifiableProducer 0.8.X, both the current branch version and 0.8.X should show up
        # because of the way verifiable producer pulls in some development directories into its classpath
        #
        # If the test fails here because 'ps .. | grep' couldn't find the process it means
        # the login and grep that is_version() performs is slower than
        # the time it takes the producer to produce its messages.
        # Easy fix is to decrease throughput= above, the good fix is to make the producer
        # not terminate until explicitly killed in this case.
        if node.version <= LATEST_0_8_2:
            assert is_version(node, [node.version.vstring, DEV_BRANCH.vstring], logger=self.logger)
        else:
            assert is_version(node, [node.version.vstring], logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)
class TestUpgrade(ProduceConsumeValidateTest):
    """Validates a rolling broker upgrade from 0.8.2 while producing and consuming throughout."""

    def __init__(self, test_context):
        super(TestUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        # Cluster initially runs 0.8.2; perform_upgrade moves nodes to TRUNK.
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=LATEST_0_8_2,
                                  topics={self.topic: {
                                      "partitions": 3,
                                      "replication-factor": 3,
                                      'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput, version=LATEST_0_8_2)

        # TODO - reduce the timeout
        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=30000, message_validator=is_int, version=LATEST_0_8_2)

    def perform_upgrade(self):
        self.logger.info("First pass bounce - rolling upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = TRUNK
            # Keep speaking the old inter-broker protocol until all nodes run the new binaries.
            node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] = "0.8.2.X"
            self.kafka.start_node(node)

        self.logger.info("Second pass bounce - remove inter.broker.protocol.version config")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION]
            self.kafka.start_node(node)

    def test_upgrade(self):
        """Test upgrade of Kafka broker cluster from 0.8.2 to 0.9.0

        - Start 3 node broker cluster on version 0.8.2
        - Start producer and consumer in the background
        - Perform two-phase rolling upgrade
            - First phase: upgrade brokers to 0.9.0 with inter.broker.protocol.version set to 0.8.2.X
            - Second phase: remove inter.broker.protocol.version config with rolling bounce
        - Finally, validate that every message acked by the producer was consumed by the consumer
        """
        self.run_produce_consume_validate(core_test_action=self.perform_upgrade)
class ClientCompatibilityProduceConsumeTest(ProduceConsumeValidateTest):
    """
    These tests validate that we can use a new client to produce and consume from older brokers.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ClientCompatibilityProduceConsumeTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk, topics={self.topic: {
            "partitions": 10,
            "replication-factor": 2}})
        self.num_partitions = 10
        self.timeout_sec = 60
        self.producer_throughput = 1000
        self.num_producers = 2
        self.messages_per_producer = 1000
        self.num_consumers = 1

    def setUp(self):
        # Kafka is started in the test after its version is set for the parametrization.
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(ClientCompatibilityProduceConsumeTest, self).min_cluster_size() + self.num_producers + self.num_consumers

    @parametrize(broker_version=str(DEV_BRANCH))
    @parametrize(broker_version=str(LATEST_0_10_0))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_11_0))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_1_1))
    def test_produce_consume(self, broker_version):
        """Current-branch clients produce/consume against brokers at `broker_version`."""
        print("running producer_consumer_compat with broker_version = %s" % broker_version)
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.security_protocol = "PLAINTEXT"
        self.kafka.interbroker_security_protocol = self.kafka.security_protocol
        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic,
                                           throughput=self.producer_throughput,
                                           message_validator=is_int_with_prefix)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic,
                                        consumer_timeout_ms=60000, message_validator=is_int_with_prefix)
        self.kafka.start()
        self.run_produce_consume_validate(lambda: wait_until(
            lambda: self.producer.each_produced_at_least(self.messages_per_producer) == True,
            timeout_sec=120, backoff_sec=1,
            err_msg="Producer did not produce all messages in reasonable amount of time"))
class SimpleConsumerShellTest(Test):
    """
    Tests SimpleConsumerShell tool
    """
    def __init__(self, test_context):
        super(SimpleConsumerShellTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.messages_received_count = 0
        self.topics = {TOPIC: {"partitions": NUM_PARTITIONS, "replication-factor": REPLICATION_FACTOR}}

        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        self.zk.start()

    def start_kafka(self):
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)
        self.kafka.start()

    def run_producer(self):
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(
            self.test_context, num_nodes=1, kafka=self.kafka, topic=TOPIC, throughput=1000,
            max_messages=MAX_MESSAGES
        )
        self.producer.start()
        wait_until(
            lambda: self.producer.num_acked == MAX_MESSAGES,
            timeout_sec=10,
            err_msg="Timeout awaiting messages to be produced and acked",
        )

    def start_simple_consumer_shell(self):
        self.simple_consumer_shell = SimpleConsumerShell(self.test_context, 1, self.kafka, TOPIC)
        self.simple_consumer_shell.start()

    def test_simple_consumer_shell(self):
        """
        Tests if SimpleConsumerShell is fetching expected records
        :return: None
        """
        self.start_kafka()
        self.run_producer()
        self.start_simple_consumer_shell()

        # Assert that SimpleConsumerShell is fetching expected number of messages
        # (MAX_MESSAGES + 1 newlines: one per message plus a trailing one in the tool output).
        wait_until(
            lambda: self.simple_consumer_shell.get_output().count("\n") == (MAX_MESSAGES + 1),
            timeout_sec=10,
            err_msg="Timed out waiting to receive expected number of messages.",
        )
def test_compatibility(self, producer_version, consumer_version, compression_types, new_consumer=True, timestamp_type=None):
    """Produce with `producer_version` and consume with `consumer_version` against TRUNK brokers,
    then validate that every acked message was consumed.

    :param compression_types: list of compression codecs, one per producer in the group.
    :param new_consumer: whether the consumer uses the new (broker-coordinated) consumer API.
    :param timestamp_type: optional message.timestamp.type broker config ("CreateTime"/"LogAppendTime").
    """
    # Brokers always run TRUNK; only the client versions vary per parametrization.
    self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=TRUNK, topics={self.topic: {
        "partitions": 3,
        "replication-factor": 3,
        'configs': {"min.insync.replicas": 2}}})
    for node in self.kafka.nodes:
        if timestamp_type is not None:
            node.config[config_property.MESSAGE_TIMESTAMP_TYPE] = timestamp_type
    self.kafka.start()

    self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic,
                                       throughput=self.producer_throughput,
                                       message_validator=is_int,
                                       compression_types=compression_types,
                                       version=KafkaVersion(producer_version))
    self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic,
                                    consumer_timeout_ms=30000, new_consumer=new_consumer,
                                    message_validator=is_int, version=KafkaVersion(consumer_version))

    self.run_produce_consume_validate(lambda: wait_until(
        lambda: self.producer.each_produced_at_least(self.messages_per_producer) == True,
        timeout_sec=120, backoff_sec=1,
        err_msg="Producer did not produce all messages in reasonable amount of time"))
def test_compatibility(self, producer_version, consumer_version):
    """
    This tests performs the following checks:
    The workload is a mix of 0.9.x and 0.10.x producers and consumers
    that produce to and consume from a 0.10.x cluster
    1. initially the topic is using message format 0.9.0
    2. change the message format version for topic to 0.10.0 on the fly.
    3. change the message format version for topic back to 0.9.0 on the fly.
    - The producers and consumers should not have any issue.
    - Note that for 0.9.x consumers/producers we only do steps 1 and 2
    """
    self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=TRUNK, topics={self.topic: {
        "partitions": 3,
        "replication-factor": 3,
        'configs': {"min.insync.replicas": 2}}})
    self.kafka.start()
    self.logger.info("First format change to 0.9.0")
    self.kafka.alter_message_format(self.topic, str(LATEST_0_9))
    self.produce_and_consume(producer_version, consumer_version, "group1")

    self.logger.info("Second format change to 0.10.0")
    self.kafka.alter_message_format(self.topic, str(LATEST_0_10))
    self.produce_and_consume(producer_version, consumer_version, "group2")

    # Step 3 only applies when both clients are on TRUNK — older clients cannot
    # handle the downgrade back to the 0.9.0 message format.
    if producer_version == str(TRUNK) and consumer_version == str(TRUNK):
        self.logger.info("Third format change back to 0.9.0")
        self.kafka.alter_message_format(self.topic, str(LATEST_0_9))
        self.produce_and_consume(producer_version, consumer_version, "group3")
def __init__(self, test_context):
    """Set up a SASL cluster plus producer/consumer that authenticate via delegation tokens."""
    super(DelegationTokenTest, self).__init__(test_context)

    self.test_context = test_context
    self.topic = "topic"
    self.zk = ZookeeperService(test_context, num_nodes=1)
    # Broker overrides enable delegation tokens (master key + lifetimes) and
    # both GSSAPI and SCRAM-SHA-256 SASL mechanisms.
    self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, zk_chroot="/kafka",
                              topics={self.topic: {"partitions": 1, "replication-factor": 1}},
                              server_prop_overides=[
                                  [config_property.DELEGATION_TOKEN_MAX_LIFETIME_MS, "604800000"],
                                  [config_property.DELEGATION_TOKEN_EXPIRY_TIME_MS, "86400000"],
                                  [config_property.DELEGATION_TOKEN_MASTER_KEY, "test12345"],
                                  [config_property.SASL_ENABLED_MECHANISMS, "GSSAPI,SCRAM-SHA-256"]
                              ])
    self.jaas_deleg_conf_path = "/tmp/jaas_deleg.conf"
    self.jaas_deleg_conf = ""
    # Client properties pointing at SCRAM over SASL_PLAINTEXT; written to the
    # clients via client_prop_file_override.
    self.client_properties_content = """
security.protocol=SASL_PLAINTEXT
sasl.mechanism=SCRAM-SHA-256
sasl.kerberos.service.name=kafka
client.id=console-consumer
"""
    # Clients load the delegation-token JAAS config through KAFKA_OPTS.
    self.client_kafka_opts = ' -Djava.security.auth.login.config=' + self.jaas_deleg_conf_path

    self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                       max_messages=1, throughput=1,
                                       kafka_opts_override=self.client_kafka_opts,
                                       client_prop_file_override=self.client_properties_content)

    self.consumer = ConsoleConsumer(self.test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                    kafka_opts_override=self.client_kafka_opts,
                                    client_prop_file_override=self.client_properties_content)

    self.kafka.security_protocol = 'SASL_PLAINTEXT'
    self.kafka.client_sasl_mechanism = 'GSSAPI,SCRAM-SHA-256'
    self.kafka.interbroker_sasl_mechanism = 'GSSAPI'
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(LogDirFailureTest, self).__init__(test_context=test_context)

    self.topic1 = "test_topic_1"
    self.topic2 = "test_topic_2"
    self.zk = ZookeeperService(test_context, num_nodes=1)
    self.kafka = KafkaService(test_context,
                              num_nodes=3,
                              zk=self.zk,
                              topics={
                                  self.topic1: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 1}},
                                  self.topic2: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 2}}
                              },
                              # Set log.roll.ms to 3 seconds so that broker will detect disk error sooner when it creates log segment
                              # Otherwise broker will still be able to read/write the log file even if the log directory is inaccessible.
                              server_prop_overides=[
                                  [config_property.OFFSETS_TOPIC_NUM_PARTITIONS, "1"],
                                  [config_property.LOG_FLUSH_INTERVAL_MESSAGE, "5"],
                                  [config_property.REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS, "60000"],
                                  [config_property.LOG_ROLL_TIME_MS, "3000"]
                              ])

    self.producer_throughput = 1000
    self.num_producers = 1
    self.num_consumers = 1
def start_kafka(self, security_protocol, interbroker_security_protocol):
    """Create and start the Kafka cluster with the given client/inter-broker security protocols."""
    kafka = KafkaService(self.test_context,
                         self.num_brokers,
                         self.zk,
                         security_protocol=security_protocol,
                         interbroker_security_protocol=interbroker_security_protocol,
                         topics=self.topics)
    # DEBUG logging is too noisy for this test; INFO is sufficient.
    kafka.log_level = "INFO"
    self.kafka = kafka
    self.kafka.start()
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ReassignPartitionsTest, self).__init__(test_context=test_context)

    self.topic = "test_topic"
    self.num_partitions = 20
    self.zk = ZookeeperService(test_context, num_nodes=1)
    # We set the min.insync.replicas to match the replication factor because
    # it makes the test more stringent. If min.isr = 2 and
    # replication.factor=3, then the test would tolerate the failure of
    # reassignment for upto one replica per partition, which is not
    # desirable for this test in particular.
    self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk,
                              server_prop_overides=[
                                  [config_property.LOG_ROLL_TIME_MS, "5000"],
                                  [config_property.LOG_RETENTION_CHECK_INTERVAL_MS, "5000"]
                              ],
                              topics={self.topic: {
                                  "partitions": self.num_partitions,
                                  "replication-factor": 3,
                                  'configs': {
                                      "min.insync.replicas": 3,
                                  }}
                              })
    self.timeout_sec = 60
    self.producer_throughput = 1000
    self.num_producers = 1
    self.num_consumers = 1
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(QuotaTest, self).__init__(test_context=test_context)

    self.topic = 'test_topic'
    self.logger.info('use topic ' + self.topic)

    # quota related parameters
    # Defaults apply to all clients; the *_overrides entries give the client id
    # "overridden_id" its own higher byte-rate quotas.
    self.quota_config = {'quota_producer_default': 2500000,
                         'quota_consumer_default': 2000000,
                         'quota_producer_bytes_per_second_overrides': 'overridden_id=3750000',
                         'quota_consumer_bytes_per_second_overrides': 'overridden_id=3000000'}
    self.maximum_client_deviation_percentage = 100.0
    self.maximum_broker_deviation_percentage = 5.0
    self.num_records = 100000
    self.record_size = 3000

    self.zk = ZookeeperService(test_context, num_nodes=1)
    # JMX byte-rate metrics are sampled so broker-side throughput can be
    # compared against the configured quotas.
    self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                              security_protocol='PLAINTEXT',
                              interbroker_security_protocol='PLAINTEXT',
                              topics={self.topic: {'partitions': 6, 'replication-factor': 1, 'configs': {'min.insync.replicas': 1}}},
                              quota_config=self.quota_config,
                              jmx_object_names=['kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec',
                                                'kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec'],
                              jmx_attributes=['OneMinuteRate'])
    self.num_producers = 1
    self.num_consumers = 2
def __init__(self, test_context):
    """Wire up ZooKeeper, a single broker chrooted under /kafka, and a console consumer."""
    super(ConsoleConsumerTest, self).__init__(test_context)

    self.topic = "topic"
    single_partition_topic = {self.topic: {"partitions": 1, "replication-factor": 1}}

    self.zk = ZookeeperService(test_context, num_nodes=1)
    self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk,
                              zk_chroot="/kafka", topics=single_partition_topic)
    self.consumer = ConsoleConsumer(self.test_context, num_nodes=1,
                                    kafka=self.kafka, topic=self.topic)
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ProduceBenchTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
    # Trogdor workload that produces against the cluster above.
    self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
    # Trogdor is wired to both the brokers and the workload node via client_services.
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.kafka, self.workload_service])
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""

    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})

        # Total number of messages the producer sends before exiting.
        self.num_messages = 1000
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=self.num_messages, throughput=1000)

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(TRUNK))
    def test_simple_run(self, producer_version=TRUNK):
        """
        Test that we can start VerifiableProducer on trunk or against the 0.8.2 jar, and verify that we can produce a
        small number of messages.
        """
        node = self.producer.nodes[0]
        node.version = KafkaVersion(producer_version)
        self.producer.start()
        # A handful of acks is enough to conclude that the producer came up.
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=5,
                   err_msg="Producer failed to start in a reasonable amount of time.")

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with TRUNK
        # When running VerifiableProducer 0.8.X, both trunk version and 0.8.X should show up because of the way
        # verifiable producer pulls in some trunk directories into its classpath
        if node.version <= LATEST_0_8_2:
            assert is_version(node, [node.version.vstring, TRUNK.vstring])
        else:
            assert is_version(node, [node.version.vstring])

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)
def test_0_8_2(self):
    """Bring up a single 0.8.2.X broker and check that the reported version matches."""
    topic_config = {self.topic: {"partitions": 1, "replication-factor": 1}}
    self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=topic_config)

    # Pin the lone broker to the 0.8.2.X release before starting it.
    broker = self.kafka.nodes[0]
    broker.version = LATEST_0_8_2
    self.kafka.start()

    assert is_version(broker, [LATEST_0_8_2], logger=self.logger)
def test_version_probing_upgrade(self):
    """
    Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version"
    """
    self.zk = ZookeeperService(self.test_context, num_nodes=1)
    self.zk.start()

    self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
    self.kafka.start()

    self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
    self.driver.disable_auto_terminate()
    self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
    self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
    self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

    self.driver.start()
    self.start_all_nodes_with("")  # run with TRUNK

    self.processors = [self.processor1, self.processor2, self.processor3]
    # Track which instances are still on the old version vs. already upgraded.
    self.old_processors = [self.processor1, self.processor2, self.processor3]
    self.upgraded_processors = []

    # NOTE(review): leader_counter/update_leader/leader are maintained elsewhere
    # in this class; the counts below seed the expected leader election state.
    for p in self.processors:
        self.leader_counter[p] = 2
    self.update_leader()
    for p in self.processors:
        self.leader_counter[p] = 0
    self.leader_counter[self.leader] = 3

    counter = 1
    current_generation = 3

    random.seed()
    # Bounce the instances in random order; each bounce returns the new
    # rebalance generation, which is threaded into the next bounce.
    random.shuffle(self.processors)
    for p in self.processors:
        p.CLEAN_NODE_ENABLED = False
        current_generation = self.do_rolling_bounce(p, counter, current_generation)
        counter = counter + 1

    # shutdown
    self.driver.stop()
    self.driver.wait()

    random.shuffle(self.processors)
    for p in self.processors:
        node = p.node
        with node.account.monitor_log(p.STDOUT_FILE) as monitor:
            p.stop()
            monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                               timeout_sec=60,
                               err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

    self.driver.stop()
def start_kafka(self, security_protocol, interbroker_security_protocol):
    """Create the Kafka cluster with the given security settings and start it."""
    cluster = KafkaService(self.test_context,
                           self.num_brokers,
                           self.zk,
                           security_protocol=security_protocol,
                           interbroker_security_protocol=interbroker_security_protocol,
                           topics=self.topics)
    self.kafka = cluster
    cluster.start()
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ProduceBenchTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
    # Trogdor workload that produces against the cluster above.
    self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
    # Trogdor is wired to both the brokers and the workload node via client_services.
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.kafka, self.workload_service])
    # Topic name ranges handed to the ProduceBench spec as active vs. inactive sets.
    self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
    self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
def setup_services(self, security_protocol=SecurityConfig.PLAINTEXT):
    """Build the broker cluster and a 3-node Connect distributed cluster, then start ZK and Kafka.

    The same protocol is used for client and inter-broker traffic.
    """
    self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                              security_protocol=security_protocol,
                              interbroker_security_protocol=security_protocol,
                              topics=self.topics)

    self.cc = ConnectDistributedService(self.test_context, 3, self.kafka,
                                        [self.INPUT_FILE, self.OUTPUT_FILE])
    # Collect verbose logs from the Connect workers.
    self.cc.log_level = "DEBUG"

    self.zk.start()
    self.kafka.start()
def __init__(self, test_context):
    super(TestMirrorMakerService, self).__init__(test_context)

    self.topic = "topic"
    # Separate single-node ZK/Kafka pairs for the mirroring source and target.
    self.source_zk = ZookeeperService(test_context, num_nodes=1)
    self.target_zk = ZookeeperService(test_context, num_nodes=1)

    self.source_kafka = KafkaService(test_context, num_nodes=1, zk=self.source_zk,
                                     topics={self.topic: {"partitions": 1, "replication-factor": 1}})
    self.target_kafka = KafkaService(test_context, num_nodes=1, zk=self.target_zk,
                                     topics={self.topic: {"partitions": 1, "replication-factor": 1}})
    # This will produce to source kafka cluster
    self.producer = VerifiableProducer(test_context, num_nodes=1, kafka=self.source_kafka, topic=self.topic,
                                       throughput=1000)
    # MirrorMaker whitelists self.topic and copies it from source to target.
    self.mirror_maker = MirrorMaker(test_context, num_nodes=1, source=self.source_kafka, target=self.target_kafka,
                                    whitelist=self.topic, offset_commit_interval_ms=1000)
    # This will consume from target kafka cluster
    self.consumer = ConsoleConsumer(test_context, num_nodes=1, kafka=self.target_kafka, topic=self.topic,
                                    message_validator=is_int, consumer_timeout_ms=60000)
def __init__(self, test_context, num_zk, num_brokers, topics=None):
    """Base test fixture: builds a ZooKeeper ensemble and a Kafka cluster of the requested sizes.

    :param num_zk: number of ZooKeeper nodes
    :param num_brokers: number of Kafka broker nodes
    :param topics: optional topic dict passed through to KafkaService
    """
    super(KafkaTest, self).__init__(test_context)
    self.num_zk = num_zk
    self.num_brokers = num_brokers
    self.topics = topics

    self.zk = ZookeeperService(test_context, self.num_zk)

    self.kafka = KafkaService(
        test_context, self.num_brokers,
        self.zk, topics=self.topics)
class LogDirFailureTest(ProduceConsumeValidateTest):
    """
    Note that consuming is a bit tricky, at least with console consumer. The goal is to consume all messages
    (foreach partition) in the topic. In this case, waiting for the last message may cause the consumer to stop
    too soon since console consumer is consuming multiple partitions from a single thread and therefore we lose
    ordering guarantees.

    Waiting on a count of consumed messages can be unreliable: if we stop consuming when num_consumed == num_acked,
    we might exit early if some messages are duplicated (though not an issue here since producer retries==0)

    Therefore rely here on the consumer.timeout.ms setting which times out on the interval between successively
    consumed messages. Since we run the producer to completion before running the consumer, this is a reliable
    indicator that nothing is left to consume.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(LogDirFailureTest, self).__init__(test_context=test_context)

        # topic1 requires 2 in-sync replicas; topic2 tolerates a single one.
        self.topic1 = "test_topic_1"
        self.topic2 = "test_topic_2"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic1: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 2}},
                                      self.topic2: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 1}}
                                  },
                                  # Set log.roll.ms to 3 seconds so that broker will detect disk error sooner when it creates log segment
                                  # Otherwise broker will still be able to read/write the log file even if the log directory is inaccessible.
                                  server_prop_overides=[
                                      [config_property.LOG_FLUSH_INTERVAL_MESSAGE, "5"],
                                      [config_property.REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS, "60000"],
                                      [config_property.LOG_ROLL_TIME_MS, "3000"]
                                  ])

        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        """Override this since we're adding services outside of the constructor"""
        return super(LogDirFailureTest, self).min_cluster_size() + self.num_producers * 2 + self.num_consumers * 2

    @cluster(num_nodes=9)
    @matrix(bounce_broker=[False, True], broker_type=["leader", "follower"], security_protocol=["PLAINTEXT"])
    def test_replication_with_disk_failure(self, bounce_broker, security_protocol, broker_type):
        """Replication tests.
        These tests verify that replication provides simple durability guarantees by checking that data acked by
        brokers is still available for consumption in the face of various failure scenarios.

        Setup: 1 zk, 3 kafka nodes, 1 topic with partitions=3, replication-factor=3, and min.insync.replicas=2
               and another topic with partitions=3, replication-factor=3, and min.insync.replicas=1

            - Produce messages in the background
            - Consume messages in the background
            - Drive broker failures (shutdown, or bounce repeatedly with kill -15 or kill -9)
            - When done driving failures, stop producing, and finish consuming
            - Validate that every acked message was consumed
        """
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()

        try:
            # Initialize producer/consumer for topic1
            self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic1,
                                               throughput=self.producer_throughput)
            self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic1,
                                            group_id="test-consumer-group-1", new_consumer=False,
                                            consumer_timeout_ms=60000, message_validator=is_int)
            self.start_producer_and_consumer()

            # Get a replica of the partition of topic1 and make its first log directory offline by changing the log dir's permission.
            # We assume that partition of topic1 is created in the first log directory of respective brokers.
            broker_node = select_node(self, broker_type, self.topic1)
            broker_idx = self.kafka.idx(broker_node)
            assert broker_idx in self.kafka.isr_idx_list(self.topic1), \
                "Broker %d should be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic1)))

            # Revoke write permission on the first data dir to simulate a disk failure.
            self.logger.debug("Making log dir %s inaccessible" % (KafkaService.DATA_LOG_DIR_1))
            cmd = "chmod a-w %s -R" % (KafkaService.DATA_LOG_DIR_1)
            broker_node.account.ssh(cmd, allow_fail=False)

            if bounce_broker:
                self.kafka.restart_node(broker_node, clean_shutdown=True)

            # Verify the following:
            # 1) The broker with offline log directory is not the leader of the partition of topic1
            # 2) The broker with offline log directory is not in the ISR
            # 3) The broker with offline log directory is still online
            # 4) Messages can still be produced and consumed from topic1
            wait_until(lambda: self.kafka.leader(self.topic1, partition=0) != broker_node, timeout_sec=60,
                       err_msg="Broker %d should not be leader of topic %s and partition 0" % (broker_idx, self.topic1))
            assert self.kafka.alive(broker_node), "Broker %d should be still online" % (broker_idx)
            wait_until(lambda: broker_idx not in self.kafka.isr_idx_list(self.topic1), timeout_sec=60,
                       err_msg="Broker %d should not be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic1))))

            self.stop_producer_and_consumer()
            self.validate()

            # Shutdown all other brokers so that the broker with offline log dir is the only online broker
            offline_nodes = []
            for node in self.kafka.nodes:
                if broker_node != node:
                    offline_nodes.append(node)
                    self.logger.debug("Hard shutdown broker %d" % (self.kafka.idx(node)))
                    self.kafka.stop_node(node)

            # Verify the following:
            # 1) The broker with offline directory is the only in-sync broker of the partition of topic2
            # 2) Messages can still be produced and consumed from topic2
            self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic2,
                                               throughput=self.producer_throughput, offline_nodes=offline_nodes)
            self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic2,
                                            group_id="test-consumer-group-2", new_consumer=False,
                                            consumer_timeout_ms=60000, message_validator=is_int)
            self.start_producer_and_consumer()

            assert self.kafka.isr_idx_list(self.topic2) == [broker_idx], \
                "In-sync replicas of topic %s and partition 0 should be %s" % (self.topic2, str([broker_idx]))

            self.stop_producer_and_consumer()
            self.validate()

        except BaseException as e:
            # Collect logs from every service before propagating the failure.
            for s in self.test_context.services:
                self.mark_for_collect(s)
            raise
def test_app_upgrade(self, from_version, to_version, bounce_type):
    """
    Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version>
    """
    # Nothing to do when source and target versions coincide.
    if from_version == to_version:
        return

    self.zk = ZookeeperService(self.test_context, num_nodes=1)
    self.zk.start()
    # Pre-create every topic the smoke test application reads/writes.
    self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics={
        'echo': {'partitions': 5, 'replication-factor': 1},
        'data': {'partitions': 5, 'replication-factor': 1},
        'min': {'partitions': 5, 'replication-factor': 1},
        'min-suppressed': {'partitions': 5, 'replication-factor': 1},
        'min-raw': {'partitions': 5, 'replication-factor': 1},
        'max': {'partitions': 5, 'replication-factor': 1},
        'sum': {'partitions': 5, 'replication-factor': 1},
        'sws-raw': {'partitions': 5, 'replication-factor': 1},
        'sws-suppressed': {'partitions': 5, 'replication-factor': 1},
        'dif': {'partitions': 5, 'replication-factor': 1},
        'cnt': {'partitions': 5, 'replication-factor': 1},
        'avg': {'partitions': 5, 'replication-factor': 1},
        'wcnt': {'partitions': 5, 'replication-factor': 1},
        'tagg': {'partitions': 5, 'replication-factor': 1}
    })
    self.kafka.start()

    self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
    self.driver.disable_auto_terminate()
    self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka,
                                                       processing_guarantee="at_least_once", replication_factor=1)
    self.processor2 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka,
                                                       processing_guarantee="at_least_once", replication_factor=1)
    self.processor3 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka,
                                                       processing_guarantee="at_least_once", replication_factor=1)

    # Start from clean local state on every instance.
    self.purge_state_dir(self.processor1)
    self.purge_state_dir(self.processor2)
    self.purge_state_dir(self.processor3)

    self.driver.start()
    self.start_all_nodes_with(from_version)

    self.processors = [self.processor1, self.processor2, self.processor3]

    if bounce_type == "rolling":
        counter = 1
        random.seed()
        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1
    elif bounce_type == "full":
        self.restart_all_nodes_with(to_version)
    else:
        raise Exception("Unrecognized bounce_type: " + str(bounce_type))

    # shutdown
    self.driver.stop()

    # Ideally, we would actually verify the expected results.
    # See KAFKA-10202

    random.shuffle(self.processors)
    for p in self.processors:
        node = p.node
        with node.account.monitor_log(p.STDOUT_FILE) as monitor:
            p.stop()
            monitor.wait_until("SMOKE-TEST-CLIENT-CLOSED",
                               timeout_sec=60,
                               err_msg="Never saw output 'SMOKE-TEST-CLIENT-CLOSED' on " + str(node.account))
class ClientCompatibilityFeaturesTest(Test):
    """
    Tests clients for the presence or absence of specific features when communicating
    with brokers with various versions. Relies on ClientCompatibilityTest.java for
    much of the functionality.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ClientCompatibilityFeaturesTest, self).__init__(test_context=test_context)

        # ZK is only needed when the test runs in ZooKeeper quorum mode.
        self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None

        # Generate a unique topic name
        topic_name = "client_compat_features_topic_%d%d" % (int(time.time()), randint(0, 2147483647))
        self.topics = {topic_name: {
            "partitions": 1,  # Use only one partition to avoid worrying about ordering
            "replication-factor": 3
        }}

        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk, topics=self.topics)
        # Always use the latest version of org.apache.kafka.tools.ClientCompatibilityTest
        # so store away the path to the DEV version before we set the Kafka version
        self.dev_script_path = self.kafka.path.script("kafka-run-class.sh", self.kafka.nodes[0])

    def invoke_compatibility_program(self, features):
        """Run ClientCompatibilityTest on the first broker node with the given feature flags."""
        # Run the compatibility test on the first Kafka node.
        node = self.kafka.nodes[0]
        cmd = ("%s org.apache.kafka.tools.ClientCompatibilityTest "
               "--bootstrap-server %s "
               "--num-cluster-nodes %d "
               "--topic %s " % (self.dev_script_path,
                                self.kafka.bootstrap_servers(),
                                len(self.kafka.nodes),
                                list(self.topics.keys())[0]))
        # Each feature becomes a "--<name> <value>" flag on the command line.
        for k, v in features.items():
            cmd = cmd + ("--%s %s " % (k, v))
        results_dir = TestContext.results_dir(self.test_context, 0)
        try:
            os.makedirs(results_dir)
        except OSError as e:
            # Tolerate a pre-existing results directory; re-raise anything else.
            if e.errno == errno.EEXIST and os.path.isdir(results_dir):
                pass
            else:
                raise
        ssh_log_file = "%s/%s" % (results_dir, "client_compatibility_test_output.txt")
        try:
            self.logger.info("Running %s" % cmd)
            run_command(node, cmd, ssh_log_file)
        except Exception as e:
            self.logger.info("** Command failed. See %s for log messages." % ssh_log_file)
            raise

    @cluster(num_nodes=7)
    @matrix(broker_version=[str(DEV_BRANCH)], metadata_quorum=quorum.all_non_upgrade)
    @parametrize(broker_version=str(LATEST_0_10_0))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_11_0))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_5))
    @parametrize(broker_version=str(LATEST_2_6))
    @parametrize(broker_version=str(LATEST_2_7))
    def run_compatibility_test(self, broker_version, metadata_quorum=quorum.zk):
        if self.zk:
            self.zk.start()
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()
        features = get_broker_features(broker_version)
        if not self.zk:
            # The self-managed mode doesn't support acls yet, we should remove this once it does
            features["describe-acls-supported"] = False
        self.invoke_compatibility_program(features)
class ConsoleConsumerTest(Test):
    """Sanity checks on console consumer service class."""

    def __init__(self, test_context):
        super(ConsoleConsumerTest, self).__init__(test_context)

        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, zk_chroot="/kafka",
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})
        self.consumer = ConsoleConsumer(self.test_context, num_nodes=1, kafka=self.kafka,
                                        topic=self.topic, new_consumer=False)

    def setUp(self):
        self.zk.start()

    @cluster(num_nodes=3)
    @parametrize(security_protocol='PLAINTEXT', new_consumer=False)
    @matrix(security_protocol=['PLAINTEXT', 'SSL'])
    @cluster(num_nodes=4)
    @matrix(security_protocol=['SASL_SSL'], sasl_mechanism=['PLAIN', 'SCRAM-SHA-256', 'SCRAM-SHA-512'])
    @matrix(security_protocol=['SASL_PLAINTEXT', 'SASL_SSL'])
    def test_lifecycle(self, security_protocol, new_consumer=True, sasl_mechanism='GSSAPI'):
        """Check that console consumer starts/stops properly, and that we are capturing log output."""
        self.kafka.security_protocol = security_protocol
        self.kafka.client_sasl_mechanism = sasl_mechanism
        self.kafka.interbroker_sasl_mechanism = sasl_mechanism
        self.kafka.start()

        self.consumer.security_protocol = security_protocol
        self.consumer.new_consumer = new_consumer

        t0 = time.time()
        self.consumer.start()
        node = self.consumer.nodes[0]

        wait_until(lambda: self.consumer.alive(node), timeout_sec=10, backoff_sec=.2,
                   err_msg="Consumer was too slow to start")
        self.logger.info("consumer started in %s seconds " % str(time.time() - t0))

        # Verify that log output is happening
        wait_until(lambda: file_exists(node, ConsoleConsumer.LOG_FILE), timeout_sec=10,
                   err_msg="Timed out waiting for consumer log file to exist.")
        wait_until(lambda: line_count(node, ConsoleConsumer.LOG_FILE) > 0, timeout_sec=1,
                   backoff_sec=.25, err_msg="Timed out waiting for log entries to start.")

        # Verify no consumed messages
        assert line_count(node, ConsoleConsumer.STDOUT_CAPTURE) == 0

        self.consumer.stop_node(node)

    @cluster(num_nodes=4)
    def test_version(self):
        """Check that console consumer v0.8.2.X successfully starts and consumes messages."""
        self.kafka.start()

        # Produce a known batch first so the old consumer has something to read.
        num_messages = 1000
        self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka,
                                           topic=self.topic, max_messages=num_messages, throughput=1000)
        self.producer.start()
        self.producer.wait()

        # Consume with the 0.8.2 console consumer; the short timeout stops it once the topic drains.
        self.consumer.nodes[0].version = LATEST_0_8_2
        self.consumer.consumer_timeout_ms = 1000
        self.consumer.start()
        self.consumer.wait()

        num_consumed = len(self.consumer.messages_consumed[1])
        num_produced = self.producer.num_acked
        assert num_produced == num_consumed, "num_produced: %d, num_consumed: %d" % (num_produced, num_consumed)
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combination).
    If metadata was changed, upgrade is more difficult.
    Metadata version was bumped in 0.10.1.0
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        # Default topics for the simple/metadata upgrade tests; the broker
        # upgrade test replaces this dict with a replicated variant.
        self.topics = {
            'echo': {'partitions': 5},
            'data': {'partitions': 5},
        }

    def perform_broker_upgrade(self, to_version):
        """Rolling broker upgrade: stop each node, switch its version, restart it."""
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """
        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        # Replicated topic set used by the smoke test application.
        self.topics = {
            'echo': {'partitions': self.partitions, 'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}},
            'data': {'partitions': self.partitions, 'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}},
            'min': {'partitions': self.partitions, 'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'max': {'partitions': self.partitions, 'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'sum': {'partitions': self.partitions, 'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'dif': {'partitions': self.partitions, 'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'cnt': {'partitions': self.partitions, 'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'avg': {'partitions': self.partitions, 'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'wcnt': {'partitions': self.partitions, 'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}},
            'tagg': {'partitions': self.partitions, 'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}}
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        # Fail the test if the driver never reported full delivery or the
        # client never closed cleanly.
        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
                                                 allow_fail=False)

    @matrix(from_version=simple_upgrade_versions_metadata_version_2,
            to_version=simple_upgrade_versions_metadata_version_2)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version>
        """
        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, "", to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    #@parametrize(new_version=str(LATEST_0_10_1)) we cannot run this test until Kafka 0.10.1.2 is released
    @parametrize(new_version=str(LATEST_0_10_2))
    @parametrize(new_version=str(LATEST_0_11_0))
    @parametrize(new_version=str(DEV_VERSION))
    def test_metadata_upgrade(self, new_version):
        """
        Starts 3 KafkaStreams instances with version 0.10.0, and upgrades one-by-one to <new_version>
        """
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(str(LATEST_0_10_0))

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, "0.10.0", new_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_rolling_bounce(p, "", new_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    def start_all_nodes_with(self, version):
        """Start processor1..3 sequentially on <version>, waiting for each to join and process records."""
        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until("Kafka version : " + version,
                                       timeout_sec=60,
                                       err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account))
                monitor.wait_until("processed 100 records from topic",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until("Kafka version : " + version,
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account))
                    first_monitor.wait_until("processed 100 records from topic",
                                             timeout_sec=60,
                                             err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))
                    second_monitor.wait_until("processed 100 records from topic",
                                              timeout_sec=60,
                                              err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until("Kafka version : " + version,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account))
                        first_monitor.wait_until("processed 100 records from topic",
                                                 timeout_sec=60,
                                                 err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))
                        second_monitor.wait_until("processed 100 records from topic",
                                                  timeout_sec=60,
                                                  err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account))
                        third_monitor.wait_until("processed 100 records from topic",
                                                 timeout_sec=60,
                                                 err_msg="Never saw output 'processed 100 records from topic' on" + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        """Wipe the processor's local state and pin it to <version> ("" means TRUNK)."""
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
        """Bounce one processor onto <new_version> while checking the other two keep processing."""
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until("processed 100 records from topic",
                                               timeout_sec=60,
                                               err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account))
                second_other_monitor.wait_until("processed 100 records from topic",
                                                timeout_sec=60,
                                                err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)

        if upgrade_from == "":  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling boundes

        # Archive the logs of the previous run before restarting.
        node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter),
                         allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor:
                    with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until("Kafka version : " + new_version,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account))
                        first_other_monitor.wait_until("processed 100 records from topic",
                                                       timeout_sec=60,
                                                       err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account))
                        # A metadata decode error on either running instance means the mixed-version
                        # group could not agree on the subscription format.
                        found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE,
                                                                          allow_fail=True))
                        if len(found) > 0:
                            raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

                        second_other_monitor.wait_until("processed 100 records from topic",
                                                        timeout_sec=60,
                                                        err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account))
                        found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE,
                                                                           allow_fail=True))
                        if len(found) > 0:
                            raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

                        monitor.wait_until("processed 100 records from topic",
                                           timeout_sec=60,
                                           err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account))
def test_upgrade(self, from_kafka_version, to_message_format_version, compression_types,
                 new_consumer=True, security_protocol="PLAINTEXT"):
    """Test upgrade of Kafka broker cluster from 0.8.2, 0.9.0, 0.10.0, 0.10.1, 0.10.2 to the current version

    from_kafka_version is a Kafka version to upgrade from

    If to_message_format_version is None, it means that we will upgrade to default (latest)
    message format version. It is possible to upgrade to 0.10 brokers but still use message
    format version 0.9

    - Start 3 node broker cluster on version 'from_kafka_version'
    - Start producer and consumer in the background
    - Perform two-phase rolling upgrade
        - First phase: upgrade brokers to 0.10 with inter.broker.protocol.version set to
          from_kafka_version and log.message.format.version set to from_kafka_version
        - Second phase: remove inter.broker.protocol.version config with rolling bounce; if
          to_message_format_version is set to 0.9, set log.message.format.version to
          to_message_format_version, otherwise remove log.message.format.version config
    - Finally, validate that every message acked by the producer was consumed by the consumer
    """
    self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                              version=KafkaVersion(from_kafka_version),
                              topics={self.topic: {"partitions": 3,
                                                   "replication-factor": 3,
                                                   'configs': {"min.insync.replicas": 2}}})
    self.kafka.security_protocol = security_protocol
    self.kafka.interbroker_security_protocol = security_protocol
    self.kafka.start()

    self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka,
                                       self.topic, throughput=self.producer_throughput,
                                       message_validator=is_int,
                                       compression_types=compression_types,
                                       version=KafkaVersion(from_kafka_version))

    # Old brokers (<= 0.10.0) are expected not to have registered a cluster id
    # under /cluster/id yet -- verified here before the upgrade runs.
    if from_kafka_version <= LATEST_0_10_0:
        assert self.zk.query("/cluster/id") is None

    # TODO - reduce the timeout
    self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka,
                                    self.topic, consumer_timeout_ms=30000,
                                    new_consumer=new_consumer,
                                    message_validator=is_int,
                                    version=KafkaVersion(from_kafka_version))

    self.run_produce_consume_validate(
        core_test_action=lambda: self.perform_upgrade(from_kafka_version,
                                                      to_message_format_version))

    # After the upgrade, a cluster id must exist in ZooKeeper and parse as JSON
    # with a 22-character "id" field.
    cluster_id_json = self.zk.query("/cluster/id")
    assert cluster_id_json is not None
    try:
        cluster_id = json.loads(cluster_id_json)
    except ValueError:
        # Fix: the original bare `except:` swallowed the parse error and then the
        # code below crashed with a NameError on the unbound `cluster_id`, masking
        # the real cause. Log the unparseable payload and re-raise instead.
        self.logger.debug(
            "Data in /cluster/id znode could not be parsed. Data = %s" % cluster_id_json)
        raise

    self.logger.debug("Cluster id [%s]", cluster_id)
    assert len(cluster_id["id"]) == 22
def test_upgrade(self, from_kafka_version, to_message_format_version, compression_types,
                 security_protocol="PLAINTEXT"):
    """Test upgrade of Kafka broker cluster from various versions to the current version

    from_kafka_version is a Kafka version to upgrade from

    If to_message_format_version is None, it means that we will upgrade to default (latest)
    message format version. It is possible to upgrade to 0.10 brokers but still use message
    format version 0.9

    - Start 3 node broker cluster on version 'from_kafka_version'
    - Start producer and consumer in the background
    - Perform two-phase rolling upgrade
        - First phase: upgrade brokers to 0.10 with inter.broker.protocol.version set to
          from_kafka_version and log.message.format.version set to from_kafka_version
        - Second phase: remove inter.broker.protocol.version config with rolling bounce; if
          to_message_format_version is set to 0.9, set log.message.format.version to
          to_message_format_version, otherwise remove log.message.format.version config
    - Finally, validate that every message acked by the producer was consumed by the consumer
    """
    # ZooKeeper runs the same version as the brokers being upgraded from.
    self.zk = ZookeeperService(self.test_context, num_nodes=1,
                               version=KafkaVersion(from_kafka_version))
    self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                              version=KafkaVersion(from_kafka_version),
                              topics={self.topic: {"partitions": self.partitions,
                                                   "replication-factor": self.replication_factor,
                                                   'configs': {"min.insync.replicas": 2}}})
    self.kafka.security_protocol = security_protocol
    self.kafka.interbroker_security_protocol = security_protocol

    # Skip (not fail) the test when the old broker version cannot run on the
    # JDK installed on the worker node. Checked before any service is started.
    jdk_version = java_version(self.kafka.nodes[0])
    if jdk_version > 9 and from_kafka_version in new_jdk_not_supported:
        self.logger.info("Test ignored! Kafka " + from_kafka_version + " not support jdk " +
                         str(jdk_version))
        return

    self.zk.start()
    self.kafka.start()

    self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka,
                                       self.topic, throughput=self.producer_throughput,
                                       message_validator=is_int,
                                       compression_types=compression_types,
                                       version=KafkaVersion(from_kafka_version))

    # Old brokers are expected to have no cluster id registered yet.
    # NOTE(review): compares a plain version string against LATEST_0_10_0 --
    # assumes KafkaVersion supports ordering against str; confirm.
    if from_kafka_version <= LATEST_0_10_0:
        assert self.kafka.cluster_id() is None

    # With older message formats before KIP-101, message loss may occur due to truncation
    # after leader change. Tolerate limited data loss for this case to avoid transient test failures.
    self.may_truncate_acked_records = False if from_kafka_version >= V_0_11_0_0 else True

    # The "new" (Java) consumer only exists from 0.9 on.
    new_consumer = from_kafka_version >= V_0_9_0_0
    # TODO - reduce the timeout
    self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka,
                                    self.topic, new_consumer=new_consumer,
                                    consumer_timeout_ms=30000, message_validator=is_int,
                                    version=KafkaVersion(from_kafka_version))

    self.run_produce_consume_validate(
        core_test_action=lambda: self.perform_upgrade(from_kafka_version,
                                                      to_message_format_version))

    # Post-upgrade: the cluster id must exist and be a 22-character string.
    cluster_id = self.kafka.cluster_id()
    assert cluster_id is not None
    assert len(cluster_id) == 22

    # Ensure no unsupported-protocol errors were logged during the upgrade.
    assert self.kafka.check_protocol_errors(self)
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest):
    """Tests a rolling upgrade from PLAINTEXT to a secured cluster """

    def __init__(self, test_context):
        super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        # Service objects are created here but only ZooKeeper is started;
        # each test starts Kafka itself after configuring listeners/protocols.
        self.acls = ACLs(self.test_context)
        self.topic = "test_topic"
        self.group = "group"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  topics={self.topic: {
                                      "partitions": 3,
                                      "replication-factor": 3,
                                      'configs': {"min.insync.replicas": 2}}})
        self.zk.start()

    def create_producer_and_consumer(self):
        """Create (but do not start) a producer/consumer pair against self.kafka's
        currently configured client security protocol."""
        self.producer = VerifiableProducer(self.test_context, self.num_producers,
                                           self.kafka, self.topic,
                                           throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers,
                                        self.kafka, self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int)
        self.consumer.group_id = "group"

    def bounce(self):
        """Rolling-restart the whole cluster, pausing 10s after each broker so
        clients can rebalance between restarts."""
        self.kafka.start_minikdc_if_necessary()
        self.kafka.restart_cluster(
            after_each_broker_restart=lambda: time.sleep(10))

    def roll_in_secured_settings(self, client_protocol, broker_protocol):
        """Two rolling bounces: first enable the secured inter-broker protocol,
        then close PLAINTEXT and enable the authorizer."""
        # Roll cluster to include inter broker security protocol.
        self.kafka.setup_interbroker_listener(broker_protocol)
        self.bounce()

        # Roll cluster to disable PLAINTEXT port
        self.kafka.close_port(SecurityConfig.PLAINTEXT)
        self.set_authorizer_and_bounce(client_protocol, broker_protocol)

    def set_authorizer_and_bounce(self, client_protocol, broker_protocol,
                                  authorizer_class_name=KafkaService.ACL_AUTHORIZER):
        """Install ACLs for both protocols, then bounce to activate the authorizer."""
        self.kafka.authorizer_class_name = authorizer_class_name
        # Force use of direct ZooKeeper access due to SecurityDisabledException: No Authorizer is configured on the broker.
        self.acls.set_acls(client_protocol, self.kafka, self.topic, self.group,
                           force_use_zk_connection=True)
        self.acls.set_acls(broker_protocol, self.kafka, self.topic, self.group,
                           force_use_zk_connection=True)
        self.bounce()  # enables the authorizer

    def open_secured_port(self, client_protocol):
        """Open an additional secured client port via a rolling bounce."""
        self.kafka.security_protocol = client_protocol
        self.kafka.open_port(client_protocol)
        self.kafka.start_minikdc_if_necessary()
        self.bounce()

    def add_sasl_mechanism(self, new_client_sasl_mechanism):
        """Enable an additional client SASL mechanism via a rolling bounce."""
        self.kafka.client_sasl_mechanism = new_client_sasl_mechanism
        self.kafka.start_minikdc_if_necessary()
        self.bounce()

    def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism):
        # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism.
        self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism
        self.bounce()

        # Bounce again with ACLs for new mechanism. Use old SimpleAclAuthorizer here to ensure that is also tested.
        self.set_authorizer_and_bounce(security_protocol, security_protocol,
                                       KafkaService.SIMPLE_AUTHORIZER)

    def add_separate_broker_listener(self, broker_security_protocol,
                                     broker_sasl_mechanism):
        """Enable a dedicated inter-broker listener with its own protocol/mechanism,
        using two rolling bounces (open port first, switch inter-broker second)."""
        # Enable the new internal listener on all brokers first
        self.kafka.open_port(self.kafka.INTERBROKER_LISTENER_NAME)
        self.kafka.port_mappings[
            self.kafka.INTERBROKER_LISTENER_NAME].security_protocol = broker_security_protocol
        # NOTE(review): this mutates the *client* SASL mechanism, not a per-listener
        # setting -- presumably so credentials for the new mechanism are provisioned
        # before the switch; confirm this is intended.
        self.kafka.client_sasl_mechanism = broker_sasl_mechanism
        self.bounce()
        # Update inter-broker listener after all brokers have been updated to enable the new listener
        self.kafka.setup_interbroker_listener(broker_security_protocol, True)
        self.kafka.interbroker_sasl_mechanism = broker_sasl_mechanism
        self.bounce()

    def remove_separate_broker_listener(self, client_security_protocol,
                                        client_sasl_mechanism):
        """Move inter-broker traffic back onto the client listener via a rolling bounce."""
        # separate interbroker listener port will be closed automatically in setup_interbroker_listener
        # if not using separate interbroker listener
        self.kafka.setup_interbroker_listener(client_security_protocol, False)
        self.kafka.interbroker_sasl_mechanism = client_sasl_mechanism
        self.bounce()

    # Two decorator pairs: the SSL case needs 8 nodes, the SASL cases need 9
    # (extra node for the MiniKdc).
    @cluster(num_nodes=8)
    @matrix(client_protocol=[SecurityConfig.SSL])
    @cluster(num_nodes=9)
    @matrix(client_protocol=[SecurityConfig.SASL_PLAINTEXT, SecurityConfig.SASL_SSL])
    def test_rolling_upgrade_phase_one(self, client_protocol):
        """
        Start with a PLAINTEXT cluster, open a SECURED port, via a rolling upgrade,
        ensuring we could produce and consume throughout over PLAINTEXT.
        Finally check we can produce and consume the new secured port.
        """
        self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT)
        self.kafka.security_protocol = SecurityConfig.PLAINTEXT
        self.kafka.start()

        # Create PLAINTEXT producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run
        self.run_produce_consume_validate(self.open_secured_port, client_protocol)

        # Now we can produce and consume via the secured port
        self.kafka.security_protocol = client_protocol
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @cluster(num_nodes=8)
    @matrix(client_protocol=[SecurityConfig.SASL_SSL, SecurityConfig.SSL,
                             SecurityConfig.SASL_PLAINTEXT],
            broker_protocol=[SecurityConfig.SASL_SSL, SecurityConfig.SSL,
                             SecurityConfig.SASL_PLAINTEXT])
    def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol):
        """
        Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one).
        A third secure port is also open if inter-broker and client protocols are different.
        Start a Producer and Consumer via the SECURED client port
        Incrementally upgrade to add inter-broker be the secure broker protocol
        Incrementally upgrade again to add ACLs as well as disabling the PLAINTEXT port
        Ensure the producer and consumer ran throughout
        """
        # Given we have a broker that has both secure and PLAINTEXT ports open
        self.kafka.security_protocol = client_protocol
        self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT,
                                              use_separate_listener=False)
        self.kafka.open_port(broker_protocol)
        self.kafka.start()

        # Create Secured Producer and Consumer
        self.create_producer_and_consumer()

        # Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout
        self.run_produce_consume_validate(self.roll_in_secured_settings,
                                          client_protocol, broker_protocol)

    @cluster(num_nodes=9)
    @matrix(new_client_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN])
    def test_rolling_upgrade_sasl_mechanism_phase_one(self, new_client_sasl_mechanism):
        """
        Start with a SASL/GSSAPI cluster, add new SASL mechanism, via a rolling upgrade,
        ensuring we could produce and consume throughout over SASL/GSSAPI.
        Finally check we can produce and consume using new mechanism.
        """
        self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL,
                                              use_separate_listener=False)
        self.kafka.security_protocol = SecurityConfig.SASL_SSL
        self.kafka.client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.start()

        # Create SASL/GSSAPI producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run
        self.run_produce_consume_validate(self.add_sasl_mechanism,
                                          new_client_sasl_mechanism)

        # Now we can produce and consume using the new SASL mechanism
        self.kafka.client_sasl_mechanism = new_client_sasl_mechanism
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @cluster(num_nodes=8)
    @matrix(new_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN])
    def test_rolling_upgrade_sasl_mechanism_phase_two(self, new_sasl_mechanism):
        """
        Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism
        for clients (i.e. result of phase one).
        Start Producer and Consumer using the second mechanism
        Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI
        Incrementally upgrade again to add ACLs
        Ensure the producer and consumer run throughout
        """
        # Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients
        self.kafka.security_protocol = SecurityConfig.SASL_SSL
        self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL,
                                              use_separate_listener=False)
        self.kafka.client_sasl_mechanism = new_sasl_mechanism
        self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.start()

        # Create Producer and Consumer using second mechanism
        self.create_producer_and_consumer()

        # Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout
        self.run_produce_consume_validate(self.roll_in_sasl_mechanism,
                                          self.kafka.security_protocol,
                                          new_sasl_mechanism)

    @cluster(num_nodes=9)
    def test_enable_separate_interbroker_listener(self):
        """
        Start with a cluster that has a single PLAINTEXT listener.
        Start producing/consuming on PLAINTEXT port.
        While doing that, do a rolling restart to enable separate secured interbroker port
        """
        self.kafka.security_protocol = SecurityConfig.PLAINTEXT
        self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT,
                                              use_separate_listener=False)
        self.kafka.start()

        self.create_producer_and_consumer()

        self.run_produce_consume_validate(self.add_separate_broker_listener,
                                          SecurityConfig.SASL_SSL,
                                          SecurityConfig.SASL_MECHANISM_PLAIN)

    @cluster(num_nodes=9)
    def test_disable_separate_interbroker_listener(self):
        """
        Start with a cluster that has two listeners, one on SSL (clients), another on
        SASL_SSL (broker-to-broker). Start producer and consumer on SSL listener.
        Close dedicated interbroker listener via rolling restart.
        Ensure we can produce and consume via SSL listener throughout.
        """
        client_protocol = SecurityConfig.SSL
        client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI

        self.kafka.security_protocol = client_protocol
        self.kafka.client_sasl_mechanism = client_sasl_mechanism
        self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL,
                                              use_separate_listener=True)
        self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.start()
        # create producer and consumer via client security protocol
        self.create_producer_and_consumer()

        # run produce/consume/validate loop while disabling a separate interbroker listener via rolling restart
        self.run_produce_consume_validate(self.remove_separate_broker_listener,
                                          client_protocol, client_sasl_mechanism)
class ConsumerGroupCommandTest(Test):
    """
    Tests ConsumerGroupCommand
    """
    # Root directory for persistent output
    PERSISTENT_ROOT = "/mnt/consumer_group_command"
    COMMAND_CONFIG_FILE = os.path.join(PERSISTENT_ROOT, "command.properties")

    def __init__(self, test_context):
        super(ConsumerGroupCommandTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.topics = {TOPIC: {'partitions': 1, 'replication-factor': 1}}
        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        """Create and start a single-broker Kafka cluster with the given protocols."""
        self.kafka = KafkaService(
            self.test_context, self.num_brokers,
            self.zk, security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol,
            topics=self.topics)
        self.kafka.start()

    def start_consumer(self, security_protocol):
        """Start a console consumer on TOPIC; the new (Java) consumer is used for SSL."""
        enable_new_consumer = security_protocol == SecurityConfig.SSL
        self.consumer = ConsoleConsumer(self.test_context,
                                        num_nodes=self.num_brokers,
                                        kafka=self.kafka, topic=TOPIC,
                                        consumer_timeout_ms=None,
                                        new_consumer=enable_new_consumer)
        self.consumer.start()

    def setup_and_verify(self, security_protocol, group=None):
        """Bring up broker + consumer, then verify ConsumerGroupCommand output.

        If `group` is given, the group is described and its output is checked;
        otherwise the group listing is checked.
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_consumer(security_protocol)
        consumer_node = self.consumer.nodes[0]
        wait_until(lambda: self.consumer.alive(consumer_node),
                   timeout_sec=20, backoff_sec=.2,
                   err_msg="Consumer was too slow to start")
        kafka_node = self.kafka.nodes[0]
        # Fix: was `is not SecurityConfig.PLAINTEXT` -- identity comparison of
        # strings only works via CPython interning; use value equality like the
        # identical check a few lines below.
        if security_protocol != SecurityConfig.PLAINTEXT:
            # Secured clusters need a client config file pushed to the broker node
            # so the command-line tools can authenticate.
            prop_file = str(self.kafka.security_config.client_config())
            self.logger.debug(prop_file)
            kafka_node.account.ssh("mkdir -p %s" % self.PERSISTENT_ROOT,
                                   allow_fail=False)
            kafka_node.account.create_file(self.COMMAND_CONFIG_FILE, prop_file)

        # Verify ConsumerGroupCommand lists expected consumer groups
        enable_new_consumer = security_protocol != SecurityConfig.PLAINTEXT
        command_config_file = None
        if enable_new_consumer:
            command_config_file = self.COMMAND_CONFIG_FILE

        if group:
            wait_until(
                lambda: re.search(
                    "topic-consumer-group-command",
                    self.kafka.describe_consumer_group(
                        group=group, node=kafka_node,
                        new_consumer=enable_new_consumer,
                        command_config=command_config_file)),
                timeout_sec=10,
                err_msg="Timed out waiting to list expected consumer groups.")
        else:
            wait_until(
                lambda: "test-consumer-group" in self.kafka.list_consumer_groups(
                    node=kafka_node,
                    new_consumer=enable_new_consumer,
                    command_config=command_config_file),
                timeout_sec=10,
                err_msg="Timed out waiting to list expected consumer groups.")

        self.consumer.stop()

    @cluster(num_nodes=3)
    @matrix(security_protocol=['PLAINTEXT', 'SSL'])
    def test_list_consumer_groups(self, security_protocol='PLAINTEXT'):
        """
        Tests if ConsumerGroupCommand is listing correct consumer groups
        :return: None
        """
        self.setup_and_verify(security_protocol)

    @cluster(num_nodes=3)
    @matrix(security_protocol=['PLAINTEXT', 'SSL'])
    def test_describe_consumer_group(self, security_protocol='PLAINTEXT'):
        """
        Tests if ConsumerGroupCommand is describing a consumer group correctly
        :return: None
        """
        self.setup_and_verify(security_protocol, group="test-consumer-group")
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 and subsequently bumped in 2.0.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 }, } processed_msg = "processed [0-9]* records" base_version_number = str(DEV_VERSION).split("-")[0] def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @ignore @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.num_kafka_nodes = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'data': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'min': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'max': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'sum': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'dif': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'cnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { 
"min.insync.replicas": self.isr } }, 'avg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'wcnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'tagg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=self.num_kafka_nodes, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created wait_until(lambda: self.confirm_topics_on_all_brokers( set(self.topics.keys())), timeout_sec=60, err_msg="Broker did not create all topics in 60 seconds ") self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) processor = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, "at_least_once") with self.driver.node.account.monitor_log( self.driver.STDOUT_FILE) as driver_monitor: self.driver.start() with processor.node.account.monitor_log( processor.STDOUT_FILE) as monitor: processor.start() monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(processor.node)) connected_message = "Discovered group coordinator" with processor.node.account.monitor_log( processor.LOG_FILE) as log_monitor: with processor.node.account.monitor_log( processor.STDOUT_FILE) as stdout_monitor: self.perform_broker_upgrade(to_version) log_monitor.wait_until( connected_message, timeout_sec=120, err_msg=("Never saw output '%s' on " % connected_message) + str(processor.node.account)) stdout_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on" % self.processed_msg + str(processor.node.account)) # 
SmokeTestDriver allows up to 6 minutes to consume all # records for the verification step so this timeout is set to # 6 minutes (360 seconds) for consuming of verification records # and a very conservative additional 2 minutes (120 seconds) to process # the records in the verification step driver_monitor.wait_until( 'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED', timeout_sec=480, err_msg="Never saw output '%s' on" % 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' + str(self.driver.node.account)) self.driver.stop() processor.stop() processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) @matrix(from_version=metadata_1_versions, to_version=[str(DEV_VERSION)]) @matrix(from_version=metadata_2_versions, to_version=[str(DEV_VERSION)]) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_stop_start_bounce(p, 
None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) def test_version_probing_upgrade(self): """ Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version" """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with("") # run with TRUNK self.processors = [self.processor1, self.processor2, self.processor3] self.old_processors = [ self.processor1, self.processor2, self.processor3 ] self.upgraded_processors = [] counter = 1 current_generation = 3 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False current_generation = self.do_rolling_bounce( p, counter, current_generation) counter = counter + 1 # shutdown self.driver.stop() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) def get_version_string(self, version): if version.startswith("0") or version.startswith("1") \ or version.startswith("2.0") or version.startswith("2.1"): return "Kafka 
version : " + version elif "SNAPSHOT" in version: return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT" else: return "Kafka version: " + version def start_all_nodes_with(self, version): kafka_version_str = self.get_version_string(version) # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( kafka_version_str, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( kafka_version_str, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " on " + str(node2.account)) first_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(node1.account)) second_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as 
# --- tail of a with-statement opened before this chunk (enclosing test method
# --- starts above the visible region; code below is reproduced unchanged) ---
                log_monitor:
                    self.processor3.start()
                    log_monitor.wait_until(
                        kafka_version_str,
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        version + " on " + str(node3.account))
                first_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(node1.account))
                second_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(node2.account))
                third_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        """Wipe the processor's local state dir and pin it to `version`.

        An empty version string means "run the development branch (TRUNK)".
        """
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT,
                                   allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter):
        """Stop `processor`, roll its log files aside, switch it to `new_version`
        (with the given `upgrade_from` config) and restart it, verifying that the
        two remaining processors keep processing throughout and that no
        subscription-metadata decode error appears on their stderr.
        """
        kafka_version_str = self.get_version_string(new_version)

        # Identify the two processors that stay up during this bounce.
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(second_other_node.account))
        # Confirm the stopped client shut down cleanly before rolling its logs.
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        # Archive the previous run's stdout/stderr/log under a round-specific suffix.
        node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                         processor.STDOUT_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " +
                         processor.STDERR_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE +
                         roll_counter + str(counter),
                         allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor if False else first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    pass
class TestUpgrade(ProduceConsumeValidateTest):
    """Rolling-upgrade system test: brings up a broker cluster on an older
    Kafka version, upgrades it to the development branch in two rolling
    passes, and validates produce/consume integrity across the upgrade.
    """

    def __init__(self, test_context):
        super(TestUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # Producer and consumer
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def perform_upgrade(self, from_kafka_version, to_message_format_version=None):
        """Two-phase rolling upgrade of every broker node.

        Phase 1 installs DEV_BRANCH but pins both inter.broker.protocol.version
        and log.message.format.version to `from_kafka_version`; phase 2 drops
        the protocol pin and either drops or re-pins the message format.
        """
        self.logger.info("First pass bounce - rolling upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = DEV_BRANCH
            node.config[config_property.
                        INTER_BROKER_PROTOCOL_VERSION] = from_kafka_version
            node.config[
                config_property.MESSAGE_FORMAT_VERSION] = from_kafka_version
            self.kafka.start_node(node)

        self.logger.info(
            "Second pass bounce - remove inter.broker.protocol.version config")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION]
            if to_message_format_version is None:
                del node.config[config_property.MESSAGE_FORMAT_VERSION]
            else:
                node.config[config_property.
                            MESSAGE_FORMAT_VERSION] = to_message_format_version
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @parametrize(from_kafka_version=str(LATEST_2_3), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_2_3), to_message_format_version=None, compression_types=["zstd"])
    @parametrize(from_kafka_version=str(LATEST_2_2), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_2_2), to_message_format_version=None, compression_types=["zstd"])
    @parametrize(from_kafka_version=str(LATEST_2_1), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_2_1), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_2_0), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_2_0), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_1_1), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_1_1), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_1_0), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_1_0), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_11_0), to_message_format_version=None, compression_types=["gzip"])
    @parametrize(from_kafka_version=str(LATEST_0_11_0), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=str(LATEST_0_9), compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=str(LATEST_0_10), compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_10_1), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_10_1), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_10_0), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_10_0), to_message_format_version=None, compression_types=["lz4"])
    @cluster(num_nodes=7)
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["none"], security_protocol="SASL_SSL")
    @cluster(num_nodes=6)
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=str(LATEST_0_9), compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=str(LATEST_0_9), compression_types=["lz4"])
    @cluster(num_nodes=7)
    @parametrize(from_kafka_version=str(LATEST_0_8_2), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_0_8_2), to_message_format_version=None, compression_types=["snappy"])
    def test_upgrade(self, from_kafka_version, to_message_format_version,
                     compression_types, security_protocol="PLAINTEXT"):
        """Test upgrade of Kafka broker cluster from various versions to the
        current version

        from_kafka_version is a Kafka version to upgrade from

        If to_message_format_version is None, it means that we will upgrade to
        default (latest) message format version. It is possible to upgrade to
        0.10 brokers but still use message format version 0.9

        - Start 3 node broker cluster on version 'from_kafka_version'
        - Start producer and consumer in the background
        - Perform two-phase rolling upgrade
            - First phase: upgrade brokers to 0.10 with
              inter.broker.protocol.version set to from_kafka_version and
              log.message.format.version set to from_kafka_version
            - Second phase: remove inter.broker.protocol.version config with
              rolling bounce; if to_message_format_version is set to 0.9, set
              log.message.format.version to to_message_format_version,
              otherwise remove log.message.format.version config
        - Finally, validate that every message acked by the producer was
          consumed by the consumer
        """
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_kafka_version),
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()

        self.producer = VerifiableProducer(
            self.test_context,
            self.num_producers,
            self.kafka,
            self.topic,
            throughput=self.producer_throughput,
            message_validator=is_int,
            compression_types=compression_types,
            version=KafkaVersion(from_kafka_version))

        # Brokers at or below 0.10.0 predate cluster-id support (KIP-78).
        if from_kafka_version <= LATEST_0_10_0:
            assert self.kafka.cluster_id() is None

        # With older message formats before KIP-101, message loss may occur due to truncation
        # after leader change. Tolerate limited data loss for this case to avoid transient test failures.
        self.may_truncate_acked_records = False if from_kafka_version >= V_0_11_0_0 else True

        new_consumer = from_kafka_version >= V_0_9_0_0
        # TODO - reduce the timeout
        self.consumer = ConsoleConsumer(
            self.test_context,
            self.num_consumers,
            self.kafka,
            self.topic,
            new_consumer=new_consumer,
            consumer_timeout_ms=30000,
            message_validator=is_int,
            version=KafkaVersion(from_kafka_version))

        self.run_produce_consume_validate(
            core_test_action=lambda: self.perform_upgrade(
                from_kafka_version, to_message_format_version))

        # After the upgrade the cluster id must exist and be a 22-char
        # base64-encoded UUID.
        cluster_id = self.kafka.cluster_id()
        assert cluster_id is not None
        assert len(cluster_id) == 22
def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.num_kafka_nodes = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'data': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'min': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'max': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'sum': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'dif': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'cnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'avg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'wcnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'tagg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=self.num_kafka_nodes, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created wait_until(lambda: self.confirm_topics_on_all_brokers( 
set(self.topics.keys())), timeout_sec=60, err_msg="Broker did not create all topics in 60 seconds ") self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) processor = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, "at_least_once") with self.driver.node.account.monitor_log( self.driver.STDOUT_FILE) as driver_monitor: self.driver.start() with processor.node.account.monitor_log( processor.STDOUT_FILE) as monitor: processor.start() monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(processor.node)) connected_message = "Discovered group coordinator" with processor.node.account.monitor_log( processor.LOG_FILE) as log_monitor: with processor.node.account.monitor_log( processor.STDOUT_FILE) as stdout_monitor: self.perform_broker_upgrade(to_version) log_monitor.wait_until( connected_message, timeout_sec=120, err_msg=("Never saw output '%s' on " % connected_message) + str(processor.node.account)) stdout_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on" % self.processed_msg + str(processor.node.account)) # SmokeTestDriver allows up to 6 minutes to consume all # records for the verification step so this timeout is set to # 6 minutes (360 seconds) for consuming of verification records # and a very conservative additional 2 minutes (120 seconds) to process # the records in the verification step driver_monitor.wait_until( 'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED', timeout_sec=480, err_msg="Never saw output '%s' on" % 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' + str(self.driver.node.account)) self.driver.stop() processor.stop() processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""
    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        # ZooKeeper is only needed when the test runs in ZK (non-KRaft) mode.
        self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})

        self.num_messages = 1000
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=self.num_messages, throughput=self.num_messages // 10)

    def setUp(self):
        if self.zk:
            self.zk.start()

    @cluster(num_nodes=3)
    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(LATEST_0_10_0))
    @parametrize(producer_version=str(LATEST_0_10_1))
    @matrix(producer_version=[str(DEV_BRANCH)], acks=["0", "1", "-1"], enable_idempotence=[False])
    @matrix(producer_version=[str(DEV_BRANCH)], acks=["-1"], enable_idempotence=[True])
    @matrix(producer_version=[str(DEV_BRANCH)], security_protocol=['PLAINTEXT', 'SSL'], metadata_quorum=quorum.all)
    @cluster(num_nodes=4)
    @matrix(producer_version=[str(DEV_BRANCH)], security_protocol=['SASL_SSL'], sasl_mechanism=['PLAIN', 'GSSAPI'],
            metadata_quorum=quorum.all)
    def test_simple_run(self, producer_version, acks=None, enable_idempotence=False, security_protocol = 'PLAINTEXT',
                        sasl_mechanism='PLAIN', metadata_quorum=quorum.zk):
        """
        Test that we can start VerifiableProducer on the current branch snapshot version or against the 0.8.2 jar, and
        verify that we can produce a small number of messages.
        """
        self.kafka.security_protocol = security_protocol
        self.kafka.client_sasl_mechanism = sasl_mechanism
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.interbroker_sasl_mechanism = sasl_mechanism
        # In KRaft mode, mirror the broker security settings on the
        # controller quorum as well.
        if self.kafka.quorum_info.using_kraft:
            controller_quorum = self.kafka.controller_quorum
            controller_quorum.controller_security_protocol = security_protocol
            controller_quorum.controller_sasl_mechanism = sasl_mechanism
            controller_quorum.intercontroller_security_protocol = security_protocol
            controller_quorum.intercontroller_sasl_mechanism = sasl_mechanism
        self.kafka.start()

        node = self.producer.nodes[0]
        self.producer.enable_idempotence = enable_idempotence
        self.producer.acks = acks
        node.version = KafkaVersion(producer_version)
        self.producer.start()
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=15,
             err_msg="Producer failed to start in a reasonable amount of time.")

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with DEV_BRANCH
        # When running VerifiableProducer 0.8.X, both the current branch version and 0.8.X should show up because of the
        # way verifiable producer pulls in some development directories into its classpath
        #
        # If the test fails here because 'ps .. | grep' couldn't find the process it means
        # the login and grep that is_version() performs is slower than
        # the time it takes the producer to produce its messages.
        # Easy fix is to decrease throughput= above, the good fix is to make the producer
        # not terminate until explicitly killed in this case.
        if node.version <= LATEST_0_8_2:
            assert is_version(node, [node.version.vstring, DEV_BRANCH.vstring], logger=self.logger)
        else:
            assert is_version(node, [node.version.vstring], logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)

    @cluster(num_nodes=4)
    @matrix(inter_broker_security_protocol=['PLAINTEXT', 'SSL'], metadata_quorum=[quorum.remote_kraft])
    @matrix(inter_broker_security_protocol=['SASL_SSL'], inter_broker_sasl_mechanism=['PLAIN', 'GSSAPI'],
            metadata_quorum=[quorum.remote_kraft])
    def test_multiple_kraft_security_protocols(
            self, inter_broker_security_protocol, inter_broker_sasl_mechanism='GSSAPI', metadata_quorum=quorum.remote_kraft):
        """
        Test for remote KRaft cases that we can start VerifiableProducer on the current branch snapshot version, and
        verify that we can produce a small number of messages. The inter-controller and broker-to-controller
        security protocols are defined to be different (which differs from the above test, where they were the same).
        """
        self.kafka.security_protocol = self.kafka.interbroker_security_protocol = inter_broker_security_protocol
        self.kafka.client_sasl_mechanism = self.kafka.interbroker_sasl_mechanism = inter_broker_sasl_mechanism
        controller_quorum = self.kafka.controller_quorum
        # Deliberately pick a different SASL mechanism for the controller paths.
        sasl_mechanism = 'PLAIN' if inter_broker_sasl_mechanism == 'GSSAPI' else 'GSSAPI'
        # Choose controller/inter-controller protocols that differ from the
        # inter-broker protocol (and from each other).
        if inter_broker_security_protocol == 'PLAINTEXT':
            controller_security_protocol = 'SSL'
            intercontroller_security_protocol = 'SASL_SSL'
        elif inter_broker_security_protocol == 'SSL':
            controller_security_protocol = 'SASL_SSL'
            intercontroller_security_protocol = 'PLAINTEXT'
        else:  # inter_broker_security_protocol == 'SASL_SSL'
            controller_security_protocol = 'PLAINTEXT'
            intercontroller_security_protocol = 'SSL'
        controller_quorum.controller_security_protocol = controller_security_protocol
        controller_quorum.controller_sasl_mechanism = sasl_mechanism
        controller_quorum.intercontroller_security_protocol = intercontroller_security_protocol
        controller_quorum.intercontroller_sasl_mechanism = sasl_mechanism
        self.kafka.start()

        node = self.producer.nodes[0]
        node.version = KafkaVersion(str(DEV_BRANCH))
        self.producer.start()
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=15,
             err_msg="Producer failed to start in a reasonable amount of time.")

        # See above comment above regarding use of version.vstring (distutils.version.LooseVersion)
        assert is_version(node, [node.version.vstring], logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)

    @cluster(num_nodes=4)
    @parametrize(metadata_quorum=quorum.remote_kraft)
    def test_multiple_kraft_sasl_mechanisms(self, metadata_quorum):
        """
        Test for remote KRaft cases that we can start VerifiableProducer on the current branch snapshot version, and
        verify that we can produce a small number of messages.
        The inter-controller and broker-to-controller security protocols are both SASL_PLAINTEXT but the SASL
        mechanisms are different (we set GSSAPI for the inter-controller mechanism and PLAIN for the
        broker-to-controller mechanism). This test differs from the above tests -- the ones above used the same SASL
        mechanism for both paths.
        """
        self.kafka.security_protocol = self.kafka.interbroker_security_protocol = 'PLAINTEXT'
        controller_quorum = self.kafka.controller_quorum
        controller_quorum.controller_security_protocol = 'SASL_PLAINTEXT'
        controller_quorum.controller_sasl_mechanism = 'PLAIN'
        controller_quorum.intercontroller_security_protocol = 'SASL_PLAINTEXT'
        controller_quorum.intercontroller_sasl_mechanism = 'GSSAPI'
        self.kafka.start()

        node = self.producer.nodes[0]
        node.version = KafkaVersion(str(DEV_BRANCH))
        self.producer.start()
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=15,
             err_msg="Producer failed to start in a reasonable amount of time.")

        # See above comment above regarding use of version.vstring (distutils.version.LooseVersion)
        assert is_version(node, [node.version.vstring], logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)
def start_kafka(self, security_protocol, interbroker_security_protocol): self.kafka = KafkaService( self.test_context, self.num_brokers, self.zk, security_protocol=security_protocol, interbroker_security_protocol=interbroker_security_protocol, topics=self.topics) self.kafka.start()
class GroupModeTransactionsTest(Test):
    """Essentially testing the same functionality as TransactionsTest by transactionally copying data
    from a source topic to a destination topic and killing the copy process as well as the broker
    randomly through the process. The major difference is that we choose to work as a collaborated
    group with same topic subscription instead of individual copiers.

    In the end we verify that the final output topic contains exactly one committed copy of
    each message from the original producer.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(GroupModeTransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 9
        self.num_output_partitions = 9
        self.num_copiers = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        # The transaction timeout should be lower than the progress timeout, but at
        # least as high as the request timeout (which is 30s by default). When the
        # client is hard-bounced, progress may depend on the previous transaction
        # being aborted. When the broker is hard-bounced, we may have to wait as
        # long as the request timeout to get a `Produce` response and we do not
        # want the coordinator timing out the transaction.
        self.transaction_timeout = 40000
        self.progress_timeout_sec = 60
        self.consumer_group = "grouped-transactions-test-consumer-group"

        # ZooKeeper is only needed in non-KRaft mode.
        self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test(
            test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  controller_num_nodes_override=1)

    def setUp(self):
        if self.zk:
            self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        """Produce `num_seed_messages` idempotently to `topic` and return the
        acked messages keyed by partition."""
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=topic,
            message_validator=is_int,
            max_messages=num_seed_messages,
            enable_idempotence=True,
            repeating_keys=self.num_input_partitions)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in %ds." % \
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked_by_partition

    def get_messages_from_topic(self, topic, num_messages):
        """Read back `num_messages` committed messages from `topic`, grouped by partition."""
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        """Bounce every broker once; hard kills wait for the old process (and,
        in ZK mode, its registration) to disappear before restarting."""
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                gracePeriodSecs = 5
                if self.zk:
                    wait_until(
                        lambda: len(self.kafka.pids(
                            node)) == 0 and not self.kafka.is_registered(node),
                        timeout_sec=self.kafka.zk_session_timeout +
                        gracePeriodSecs,
                        err_msg=
                        "Failed to see timely deregistration of hard-killed broker %s"
                        % str(node.account))
                else:
                    brokerSessionTimeoutSecs = 18
                    wait_until(
                        lambda: len(self.kafka.pids(node)) == 0,
                        timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs,
                        err_msg=
                        "Failed to see timely disappearance of process for hard-killed broker %s"
                        % str(node.account))
                    time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs)
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, output_topic,
                                        transactional_id):
        """Start one group-mode transactional copier from input to output topic."""
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=-1,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size,
            transaction_timeout=self.transaction_timeout,
            use_group_metadata=True,
            group_mode=True)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown, timeout_sec=240):
        """Restart each copier three times, requiring >=20% progress between bounces."""
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=self.progress_timeout_sec,
                           err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \
                           % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" % (copier.transactional_id,
                                           str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers):
        """Start `num_copiers` copiers (one consumer group) and return them."""
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(
                self.create_and_start_message_copier(
                    input_topic=input_topic,
                    output_topic=output_topic,
                    transactional_id="copier-" + str(i)))
        return copiers

    @staticmethod
    def valid_value_and_partition(msg):
        """Method used to check whether the given message is a valid tab
        separated value + partition

        return value and partition as a size-two array represented tuple:
        [value, partition]
        """
        try:
            splitted_msg = msg.split('\t')
            value = int(splitted_msg[1])
            partition = int(splitted_msg[0].split(":")[1])
            return [value, partition]
        except ValueError:
            raise Exception(
                "Unexpected message format (expected a tab separated [value, partition] tuple). Message: %s" % (msg))

    def start_consumer(self, topic_to_read, group_id):
        """Start a read_committed console consumer on `topic_to_read` and wait
        until it has consumed at least one message."""
        consumer = ConsoleConsumer(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=topic_to_read,
            group_id=group_id,
            message_validator=self.valid_value_and_partition,
            from_beginning=True,
            print_partition=True,
            isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" % \
                   60)
        return consumer

    @staticmethod
    def split_by_partition(messages_consumed):
        """Regroup [value, partition] pairs into {partition: [values]}."""
        messages_by_partition = {}
        for msg in messages_consumed:
            partition = msg[1]
            if partition not in messages_by_partition:
                messages_by_partition[partition] = []
            messages_by_partition[partition].append(msg[0])
        return messages_by_partition

    def drain_consumer(self, consumer, num_messages):
        """Wait for `num_messages` reads, stop the consumer, and return its
        output grouped by partition."""
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        # 1. If we have 'num_seed_messages' but there are duplicates, then
        # this is checked for later.
        #
        # 2. If we never reach 'num_seed_messages', then this will cause the
        # test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" % \
                   (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return self.split_by_partition(consumer.messages_consumed[1])

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic, num_copiers,
                                      num_messages_to_copy):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(input_topic=input_topic,
                                                output_topic=output_topic,
                                                num_copiers=num_copiers)
        concurrent_consumer = self.start_consumer(
            output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        copier_timeout_sec = 240
        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=copier_timeout_sec,
                       err_msg="%s - Failed to copy all messages in %ds." % \
                       (copier.transactional_id, copier_timeout_sec))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        """Declare the input/output topics (created when the cluster starts)."""
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=10)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self, failure_mode, bounce_target,
                          metadata_quorum=quorum.zk):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True

        self.setup_topics()
        self.kafka.start()

        input_messages_by_partition = self.seed_messages(
            self.input_topic, self.num_seed_messages)
        concurrently_consumed_message_by_partition = self.copy_messages_transactionally(
            failure_mode,
            bounce_target,
            input_topic=self.input_topic,
            output_topic=self.output_topic,
            num_copiers=self.num_copiers,
            num_messages_to_copy=self.num_seed_messages)
        output_messages_by_partition = self.get_messages_from_topic(
            self.output_topic, self.num_seed_messages)

        assert len(input_messages_by_partition) == \
               len(concurrently_consumed_message_by_partition), "The lengths of partition count doesn't match: " \
                                                                "input partitions count %d, " \
                                                                "concurrently consumed partitions count %d" % \
                                                                (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition))

        assert len(input_messages_by_partition) == \
               len(output_messages_by_partition), "The lengths of partition count doesn't match: " \
                                                  "input partitions count %d, " \
                                                  "output partitions count %d" % \
                                                  (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition))

        for p in range(self.num_input_partitions):
            if p not in input_messages_by_partition:
                continue

            # NOTE(review): these two assert messages contain '%d' but no
            # '% p' format args, so on failure the literal '%d' is printed.
            assert p in output_messages_by_partition, "Partition %d not in output messages"
            assert p in concurrently_consumed_message_by_partition, "Partition %d not in concurrently consumed messages"

            output_message_set = set(output_messages_by_partition[p])
            input_message_set = set(input_messages_by_partition[p])

            concurrently_consumed_message_set = set(
                concurrently_consumed_message_by_partition[p])

            # NOTE(review): 'output_messages' and 'concurrently_consumed_messages'
            # are undefined names here (NameError at runtime) — they presumably
            # should be output_messages_by_partition[p] and
            # concurrently_consumed_message_by_partition[p]; confirm and fix.
            num_dups = abs(len(output_messages) - len(output_message_set))
            num_dups_in_concurrent_consumer = abs(
                len(concurrently_consumed_messages) -
                len(concurrently_consumed_message_set))
            assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
            assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" % \
                (len(input_message_set), len(output_message_set))

            assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
            assert input_message_set == concurrently_consumed_message_set, \
                "Input and concurrently consumed output message sets are not equal. Num input messages: %d.
Num concurrently_consumed_messages: %d" % \ (len(input_message_set), len(concurrently_consumed_message_set)) assert input_messages == sorted( input_messages ), "The seed messages themselves were not in order" assert output_messages == input_messages, "Output messages are not in order" assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
class ThrottlingTest(ProduceConsumeValidateTest):
    """Tests throttled partition reassignment. This is essentially similar
    to the reassign_partitions_test, except that we throttle the reassignment
    and verify that it takes a sensible amount of time given the throttle
    and the amount of data being moved.

    Since the correctness is time dependent, this test also simplifies the
    cluster topology. In particular, we fix the number of brokers, the
    replication-factor, the number of partitions, the partition size, and
    the number of partitions being moved so that we can accurately predict
    the time throttled reassignment should take.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ThrottlingTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # Because we are starting the producer/consumer/validate cycle _after_
        # seeding the cluster with big data (to test throttling), we need to
        # Start the consumer from the end of the stream. further, we need to
        # ensure that the consumer is fully started before the producer starts
        # so that we don't miss any messages. This timeout ensures the sufficient
        # condition.
        self.consumer_init_timeout_sec = 10
        self.num_brokers = 6
        self.num_partitions = 3
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": self.num_partitions,
                                          "replication-factor": 2,
                                          "configs": {
                                              "segment.bytes": 64 * 1024 * 1024
                                          }
                                      }
                                  })
        self.producer_throughput = 1000
        self.timeout_sec = 400
        self.num_records = 2000
        self.record_size = 4096 * 100  # 400 KB
        # ~273 MB per partition on average (num_records * record_size /
        # num_partitions); this is the quantity the throttle estimate below is
        # based on.
        self.partition_size = (self.num_records * self.record_size) / self.num_partitions
        self.num_producers = 2
        self.num_consumers = 1
        self.throttle = 4 * 1024 * 1024  # 4 MB/s

    def setUp(self):
        # Only ZooKeeper is started here; Kafka is started inside the test so
        # that security settings can be applied first.
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(ThrottlingTest, self).min_cluster_size() +\
            self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers, throttle):
        """This method reassigns partitions using a throttle. It makes an
        assertion about the minimum amount of time the reassignment should take
        given the value of the throttle, the number of partitions being moved,
        and the size of each partition.
        """
        partition_info = self.kafka.parse_describe_topic(
            self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" +
                          str(partition_info))
        max_num_moves = 0
        # Rotate each partition's replica assignment to the next partition's
        # replicas, tracking the largest number of replicas any broker must
        # newly fetch (this bounds the per-broker data transfer).
        for i in range(0, self.num_partitions):
            old_replicas = set(partition_info["partitions"][i]["replicas"])
            new_part = (i + 1) % self.num_partitions
            new_replicas = set(
                partition_info["partitions"][new_part]["replicas"])
            max_num_moves = max(len(new_replicas - old_replicas),
                                max_num_moves)
            partition_info["partitions"][i]["partition"] = new_part
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        self.kafka.execute_reassign_partitions(partition_info,
                                               throttle=throttle)
        start = time.time()
        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        size_per_broker = max_num_moves * self.partition_size
        self.logger.debug("Max amount of data transfer per broker: %fb",
                          size_per_broker)
        estimated_throttled_time = math.ceil(float(size_per_broker) /
                                             self.throttle)
        # Allow 2x the theoretical minimum before declaring a timeout.
        estimated_time_with_buffer = estimated_throttled_time * 2
        self.logger.debug("Waiting %ds for the reassignment to complete",
                          estimated_time_with_buffer)
        wait_until(lambda: self.kafka.verify_reassign_partitions(partition_info),
                   timeout_sec=estimated_time_with_buffer, backoff_sec=.5)
        stop = time.time()
        time_taken = stop - start
        self.logger.debug("Transfer took %d second. Estimated time : %ds",
                          time_taken, estimated_throttled_time)
        # The throttle must have actually slowed the transfer down: finishing
        # faster than the theoretical minimum means it was not applied.
        assert time_taken >= estimated_throttled_time, \
            ("Expected rebalance to take at least %ds, but it took %ds" % (
                estimated_throttled_time, time_taken))

    @parametrize(bounce_brokers=False)
    @parametrize(bounce_brokers=True)
    def test_throttled_reassignment(self, bounce_brokers):
        """Seed the cluster with bulk data, then run the
        produce/consume/validate cycle around a throttled partition
        reassignment (optionally bouncing brokers during the move)."""
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol

        producer_id = 'bulk_producer'
        bulk_producer = ProducerPerformanceService(
            context=self.test_context, num_nodes=1, kafka=self.kafka,
            topic=self.topic, num_records=self.num_records,
            record_size=self.record_size, throughput=-1, client_id=producer_id,
            jmx_object_names=[
                'kafka.producer:type=producer-metrics,client-id=%s' %
                producer_id
            ],
            jmx_attributes=['outgoing-byte-rate'])

        self.producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.topic,
                                           message_validator=is_int,
                                           throughput=self.producer_throughput)

        # from_beginning=False: skip the bulk-seeded data, validate only the
        # messages produced during the reassignment.
        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        new_consumer=True,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int,
                                        from_beginning=False)

        self.kafka.start()
        bulk_producer.run()
        self.run_produce_consume_validate(
            core_test_action=lambda: self.reassign_partitions(
                bounce_brokers, self.throttle))
class ConnectDistributedTest(Test):
    """
    Simple test of Kafka Connect in distributed mode, producing data from files on one cluster and consuming it on
    another, validating the total output is identical to the input.
    """

    FILE_SOURCE_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSourceConnector'
    FILE_SINK_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSinkConnector'

    INPUT_FILE = "/mnt/connect.input"
    OUTPUT_FILE = "/mnt/connect.output"

    TOPIC = "test"
    OFFSETS_TOPIC = "connect-offsets"
    CONFIG_TOPIC = "connect-configs"
    STATUS_TOPIC = "connect-status"

    # Since tasks can be assigned to any node and we're testing with files, we need to make sure the content is the same
    # across all nodes.
    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUTS = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUTS = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = { "type": "string", "optional": False }

    def __init__(self, test_context):
        super(ConnectDistributedTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.topics = { 'test' : { 'partitions': 1, 'replication-factor': 1 } }

        self.zk = ZookeeperService(test_context, self.num_zk)

        self.key_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.value_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.schemas = True

    def setup_services(self, security_protocol=SecurityConfig.PLAINTEXT):
        """Create the Kafka cluster and a 3-node Connect cluster, then start
        ZooKeeper and Kafka (Connect itself is started by each test)."""
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                                  security_protocol=security_protocol,
                                  interbroker_security_protocol=security_protocol,
                                  topics=self.topics)
        self.cc = ConnectDistributedService(self.test_context, 3, self.kafka,
                                            [self.INPUT_FILE, self.OUTPUT_FILE])
        self.cc.log_level = "DEBUG"

        self.zk.start()
        self.kafka.start()

    def _start_connector(self, config_file):
        """Render a java-properties connector config template and create the
        connector through the Connect REST API."""
        connector_props = self.render(config_file)
        connector_config = dict([line.strip().split('=', 1)
                                 for line in connector_props.split('\n')
                                 if line.strip() and not line.strip().startswith('#')])
        self.cc.create_connector(connector_config)

    def _connector_status(self, connector, node=None):
        # Returns None if the REST call fails (e.g. worker not yet up).
        try:
            return self.cc.get_connector_status(connector, node)
        except ConnectRestError:
            return None

    def _connector_has_state(self, status, state):
        return status is not None and status['connector']['state'] == state

    def _task_has_state(self, task_id, status, state):
        if not status:
            return False

        tasks = status['tasks']
        if not tasks:
            return False

        for task in tasks:
            if task['id'] == task_id:
                return task['state'] == state

        return False

    def _all_tasks_have_state(self, status, task_count, state):
        if status is None:
            return False

        tasks = status['tasks']
        if len(tasks) != task_count:
            return False

        # FIX: replaced `reduce(operator.and_, [...], True)` with the
        # equivalent, idiomatic all().
        return all(task['state'] == state for task in tasks)

    def is_running(self, connector, node=None):
        status = self._connector_status(connector.name, node)
        return self._connector_has_state(status, 'RUNNING') and self._all_tasks_have_state(status, connector.tasks, 'RUNNING')

    def is_paused(self, connector, node=None):
        status = self._connector_status(connector.name, node)
        return self._connector_has_state(status, 'PAUSED') and self._all_tasks_have_state(status, connector.tasks, 'PAUSED')

    def connector_is_running(self, connector, node=None):
        status = self._connector_status(connector.name, node)
        return self._connector_has_state(status, 'RUNNING')

    def connector_is_failed(self, connector, node=None):
        status = self._connector_status(connector.name, node)
        return self._connector_has_state(status, 'FAILED')

    def task_is_failed(self, connector, task_id, node=None):
        status = self._connector_status(connector.name, node)
        return self._task_has_state(task_id, status, 'FAILED')

    def task_is_running(self, connector, task_id, node=None):
        status = self._connector_status(connector.name, node)
        return self._task_has_state(task_id, status, 'RUNNING')

    def test_restart_failed_connector(self):
        """Verify that a connector that fails can be restarted through the
        REST API and transitions back to RUNNING."""
        self.setup_services()
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        self.sink = MockSink(self.cc, self.topics.keys(), mode='connector-failure', delay_sec=5)
        self.sink.start()

        wait_until(lambda: self.connector_is_failed(self.sink), timeout_sec=15,
                   err_msg="Failed to see connector transition to the FAILED state")

        self.cc.restart_connector(self.sink.name)

        wait_until(lambda: self.connector_is_running(self.sink), timeout_sec=10,
                   err_msg="Failed to see connector transition to the RUNNING state")

    @matrix(connector_type=["source", "sink"])
    def test_restart_failed_task(self, connector_type):
        """Verify that a failed task (source or sink) can be restarted through
        the REST API and transitions back to RUNNING."""
        self.setup_services()
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        connector = None
        if connector_type == "sink":
            connector = MockSink(self.cc, self.topics.keys(), mode='task-failure', delay_sec=5)
        else:
            connector = MockSource(self.cc, self.topics.keys(), mode='task-failure', delay_sec=5)

        connector.start()

        task_id = 0
        wait_until(lambda: self.task_is_failed(connector, task_id), timeout_sec=15,
                   err_msg="Failed to see task transition to the FAILED state")

        self.cc.restart_task(connector.name, task_id)

        wait_until(lambda: self.task_is_running(connector, task_id), timeout_sec=10,
                   err_msg="Failed to see task transition to the RUNNING state")

    def test_pause_and_resume_source(self):
        """
        Verify that source connectors stop producing records when paused and
        begin again after being resumed.
        """
        self.setup_services()
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc)
        self.source.start()

        wait_until(lambda: self.is_running(self.source), timeout_sec=30,
                   err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.source.name)

        # wait until all nodes report the paused transition
        for node in self.cc.nodes:
            wait_until(lambda: self.is_paused(self.source, node), timeout_sec=30,
                       err_msg="Failed to see connector transition to the PAUSED state")

        # verify that we do not produce new messages while paused
        num_messages = len(self.source.messages())
        time.sleep(10)
        assert num_messages == len(self.source.messages()), "Paused source connector should not produce any messages"

        self.cc.resume_connector(self.source.name)

        for node in self.cc.nodes:
            wait_until(lambda: self.is_running(self.source, node), timeout_sec=30,
                       err_msg="Failed to see connector transition to the RUNNING state")

        # after resuming, we should see records produced again
        wait_until(lambda: len(self.source.messages()) > num_messages, timeout_sec=30,
                   err_msg="Failed to produce messages after resuming source connector")

    def test_pause_and_resume_sink(self):
        """
        Verify that sink connectors stop consuming records when paused and
        begin again after being resumed.
        """
        self.setup_services()
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        # use the verifiable source to produce a steady stream of messages
        self.source = VerifiableSource(self.cc)
        self.source.start()

        self.sink = VerifiableSink(self.cc)
        self.sink.start()

        wait_until(lambda: self.is_running(self.sink), timeout_sec=30,
                   err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.sink.name)

        # wait until all nodes report the paused transition
        for node in self.cc.nodes:
            wait_until(lambda: self.is_paused(self.sink, node), timeout_sec=30,
                       err_msg="Failed to see connector transition to the PAUSED state")

        # verify that we do not consume new messages while paused
        num_messages = len(self.sink.received_messages())
        time.sleep(10)
        assert num_messages == len(self.sink.received_messages()), "Paused sink connector should not consume any messages"

        self.cc.resume_connector(self.sink.name)

        for node in self.cc.nodes:
            wait_until(lambda: self.is_running(self.sink, node), timeout_sec=30,
                       err_msg="Failed to see connector transition to the RUNNING state")

        # after resuming, we should see records consumed again
        wait_until(lambda: len(self.sink.received_messages()) > num_messages, timeout_sec=30,
                   err_msg="Failed to consume messages after resuming source connector")

    def test_pause_state_persistent(self):
        """
        Verify that paused state is preserved after a cluster restart.
        """
        self.setup_services()
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc)
        self.source.start()

        wait_until(lambda: self.is_running(self.source), timeout_sec=30,
                   err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.source.name)

        self.cc.restart()

        # we should still be paused after restarting
        for node in self.cc.nodes:
            wait_until(lambda: self.is_paused(self.source, node), timeout_sec=30,
                       err_msg="Failed to see connector startup in PAUSED state")

    @matrix(security_protocol=[SecurityConfig.PLAINTEXT, SecurityConfig.SASL_SSL])
    def test_file_source_and_sink(self, security_protocol):
        """
        Tests that a basic file connector works across clean rolling bounces. This validates that the connector is
        correctly created, tasks instantiated, and as nodes restart the work is rebalanced across nodes.
        """
        self.setup_services(security_protocol=security_protocol)
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        self.logger.info("Creating connectors")
        self._start_connector("connect-file-source.properties")
        self._start_connector("connect-file-sink.properties")

        # Generating data on the source node should generate new records and create new output on the sink node. Timeouts
        # here need to be more generous than they are for standalone mode because a) it takes longer to write configs,
        # do rebalancing of the group, etc, and b) without explicit leave group support, rebalancing takes awhile
        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.FIRST_INPUTS) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self._validate_file_output(self.FIRST_INPUT_LIST), timeout_sec=70,
                   err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.cc.restart()

        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.SECOND_INPUTS) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self._validate_file_output(self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST), timeout_sec=70,
                   err_msg="Sink output file never converged to the same state as the input file")

    @matrix(clean=[True, False])
    def test_bounce(self, clean):
        """
        Validates that source and sink tasks that run continuously and produce a predictable sequence of messages
        run correctly and deliver messages exactly once when Kafka Connect workers undergo clean rolling bounces.
        """
        num_tasks = 3

        self.setup_services()
        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc, tasks=num_tasks)
        self.source.start()
        self.sink = VerifiableSink(self.cc, tasks=num_tasks)
        self.sink.start()

        for _ in range(3):
            for node in self.cc.nodes:
                started = time.time()
                self.logger.info("%s bouncing Kafka Connect on %s", clean and "Clean" or "Hard", str(node.account))
                self.cc.stop_node(node, clean_shutdown=clean)
                with node.account.monitor_log(self.cc.LOG_FILE) as monitor:
                    self.cc.start_node(node)
                    monitor.wait_until("Starting connectors and tasks using config offset", timeout_sec=90,
                                       err_msg="Kafka Connect worker didn't successfully join group and start work")
                self.logger.info("Bounced Kafka Connect on %s and rejoined in %f seconds", node.account, time.time() - started)

                # If this is a hard bounce, give additional time for the consumer groups to recover. If we don't give
                # some time here, the next bounce may cause consumers to be shut down before they have any time to process
                # data and we can end up with zero data making it through the test.
                if not clean:
                    time.sleep(15)

        self.source.stop()
        self.sink.stop()
        self.cc.stop()

        # Validate at least once delivery of everything that was reported as written since we should have flushed and
        # cleanly exited. Currently this only tests at least once delivery because the sink task may not have consumed
        # all the messages generated by the source task. This needs to be done per-task since seqnos are not unique across
        # tasks.
        success = True
        errors = []
        allow_dups = not clean
        src_messages = self.source.messages()
        sink_messages = self.sink.messages()
        for task in range(num_tasks):
            # Validate source messages
            src_seqnos = [msg['seqno'] for msg in src_messages if msg['task'] == task]
            # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because clean
            # bouncing should commit on rebalance.
            src_seqno_max = max(src_seqnos)
            self.logger.debug("Max source seqno: %d", src_seqno_max)
            src_seqno_counts = Counter(src_seqnos)
            missing_src_seqnos = sorted(set(range(src_seqno_max)).difference(set(src_seqnos)))
            # FIX: dict.iteritems() is Python-2-only; items() behaves
            # identically here on both Python 2 and 3.
            duplicate_src_seqnos = sorted([seqno for seqno, count in src_seqno_counts.items() if count > 1])

            if missing_src_seqnos:
                self.logger.error("Missing source sequence numbers for task " + str(task))
                errors.append("Found missing source sequence numbers for task %d: %s" % (task, missing_src_seqnos))
                success = False
            if not allow_dups and duplicate_src_seqnos:
                self.logger.error("Duplicate source sequence numbers for task " + str(task))
                errors.append("Found duplicate source sequence numbers for task %d: %s" % (task, duplicate_src_seqnos))
                success = False

            # Validate sink messages
            sink_seqnos = [msg['seqno'] for msg in sink_messages if msg['task'] == task and 'flushed' in msg]
            # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because
            # clean bouncing should commit on rebalance.
            sink_seqno_max = max(sink_seqnos)
            self.logger.debug("Max sink seqno: %d", sink_seqno_max)
            sink_seqno_counts = Counter(sink_seqnos)
            missing_sink_seqnos = sorted(set(range(sink_seqno_max)).difference(set(sink_seqnos)))
            # FIX: iteritems() -> items(), as above.
            duplicate_sink_seqnos = sorted([seqno for seqno, count in sink_seqno_counts.items() if count > 1])

            if missing_sink_seqnos:
                self.logger.error("Missing sink sequence numbers for task " + str(task))
                errors.append("Found missing sink sequence numbers for task %d: %s" % (task, missing_sink_seqnos))
                success = False
            if not allow_dups and duplicate_sink_seqnos:
                self.logger.error("Duplicate sink sequence numbers for task " + str(task))
                errors.append("Found duplicate sink sequence numbers for task %d: %s" % (task, duplicate_sink_seqnos))
                success = False

            # Validate source and sink match
            if sink_seqno_max > src_seqno_max:
                self.logger.error("Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d", task, sink_seqno_max, src_seqno_max)
                errors.append("Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d" % (task, sink_seqno_max, src_seqno_max))
                success = False
            if src_seqno_max < 1000 or sink_seqno_max < 1000:
                errors.append("Not enough messages were processed: source:%d sink:%d" % (src_seqno_max, sink_seqno_max))
                success = False

        if not success:
            self.mark_for_collect(self.cc)
            # Also collect the data in the topic to aid in debugging
            consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.source.topic, consumer_timeout_ms=1000, print_key=True)
            consumer_validator.run()
            self.mark_for_collect(consumer_validator, "consumer_stdout")

        assert success, "Found validation errors:\n" + "\n  ".join(errors)

    def _validate_file_output(self, input):
        input_set = set(input)
        # Output needs to be collected from all nodes because we can't be sure where the tasks will be scheduled.
        # Between the first and second rounds, we might even end up with half the data on each node.
        output_set = set(itertools.chain(*[
            [line.strip() for line in self._file_contents(node, self.OUTPUT_FILE)] for node in self.cc.nodes
        ]))
        return input_set == output_set

    def _file_contents(self, node, file):
        try:
            # Convert to a list here or the CalledProcessError may be returned during a call to the generator instead of
            # immediately
            return list(node.account.ssh_capture("cat " + file))
        except subprocess.CalledProcessError:
            return []
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    # FIX: raw string so the backslashes are passed through literally
    # (same bytes as before, but avoids invalid-escape warnings).
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'
    stopped_message = 'OPTIMIZE_TEST Streams Stopped'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6},
            self.reduce_topic: {'partitions': 6},
            self.join_topic: {'partitions': 6}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        """Run three Streams instances un-optimized (expect 4 repartition
        topics), stop them, restart with OPTIMIZED_CONFIG='all' (expect 1
        repartition topic) and verify processing in both phases."""
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors unoptimized
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.verify_processing(processors, verify_individual_operations=False)

        stop_processors(processors, self.stopped_message)

        # start again with topology optimized
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.verify_processing(processors, verify_individual_operations=True)

        stop_processors(processors, self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor, repartition_topic_count):
        """Start the processor and wait until it reports RUNNING with the
        expected number of repartition topics."""
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(
                'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count,
                timeout_sec=120,
                err_msg="Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                        % repartition_topic_count + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        """Check each processor's stdout for evidence of processing; skip
        processors that were only assigned source-subtopology tasks (they
        produce no AGGREGATED/REDUCED/JOINED output)."""
        for processor in processors:
            if not self.all_source_subtopology_tasks(processor):
                if verify_individual_operations:
                    for operation in self.operation_pattern.split(r'\|'):
                        self.do_verify(processor, operation)
                else:
                    self.do_verify(processor, self.operation_pattern)
            else:
                self.logger.info("Skipping processor %s with all source tasks" % processor.node.account)

    def do_verify(self, processor, pattern):
        """Wait until `pattern` shows up in the processor's stdout."""
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def all_source_subtopology_tasks(self, processor):
        """Return True if the processor's log shows it was assigned only
        source-subtopology tasks (active tasks all 0_*)."""
        retries = 0
        while retries < 5:
            # FIX: this must be a raw string. In the original non-raw literal,
            # "\1" was parsed by Python as the octal escape chr(1), so sed
            # never received the back-reference \1 and the capture could not
            # be extracted.
            found = list(processor.node.account.ssh_capture(
                r"sed -n 's/.*current active tasks: \[\(\(0_[0-9], \)\{3\}0_[0-9]\)\].*/\1/p' %s" % processor.LOG_FILE,
                allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found)
            if len(found) > 0:
                return True
            retries += 1
            time.sleep(1)

        return False

    def set_topics(self, processor):
        """Point the processor service at this test's topic names."""
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
class TestUpgrade(ProduceConsumeValidateTest):
    """Rolling-upgrade tests: a broker cluster starts on an old Kafka version
    and is bounced up to the development branch while a producer and consumer
    run in the background; every acked message must be consumed."""

    def __init__(self, test_context):
        super(TestUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def perform_upgrade(self, from_kafka_version, to_message_format_version=None):
        """Two-phase rolling bounce of every broker up to DEV_BRANCH.

        Phase 1 keeps inter.broker.protocol.version and
        log.message.format.version pinned to from_kafka_version so old and new
        brokers can interoperate; phase 2 removes the protocol pin and either
        removes the message-format pin (to_message_format_version is None) or
        re-pins it to to_message_format_version.
        """
        self.logger.info("First pass bounce - rolling upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = DEV_BRANCH
            node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] = from_kafka_version
            node.config[config_property.MESSAGE_FORMAT_VERSION] = from_kafka_version
            self.kafka.start_node(node)

        self.logger.info("Second pass bounce - remove inter.broker.protocol.version config")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION]
            if to_message_format_version is None:
                del node.config[config_property.MESSAGE_FORMAT_VERSION]
            else:
                node.config[config_property.MESSAGE_FORMAT_VERSION] = to_message_format_version
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @parametrize(from_kafka_version=str(LATEST_0_11_0), to_message_format_version=None, compression_types=["gzip"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_11_0), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=str(LATEST_0_9), compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=str(LATEST_0_10), compression_types=["snappy"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=None, compression_types=["none"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_10_2), to_message_format_version=None, compression_types=["lz4"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_10_1), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_10_1), to_message_format_version=None, compression_types=["snappy"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_10_0), to_message_format_version=None, compression_types=["snappy"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_10_0), to_message_format_version=None, compression_types=["lz4"])
    @cluster(num_nodes=7)
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["none"], security_protocol="SASL_SSL")
    @cluster(num_nodes=6)
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["lz4"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=None, compression_types=["lz4"])
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=str(LATEST_0_9), compression_types=["none"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=str(LATEST_0_9), compression_types=["snappy"])
    @parametrize(from_kafka_version=str(LATEST_0_9), to_message_format_version=str(LATEST_0_9), compression_types=["lz4"])
    @cluster(num_nodes=7)
    @parametrize(from_kafka_version=str(LATEST_0_8_2), to_message_format_version=None, compression_types=["none"], new_consumer=False)
    @parametrize(from_kafka_version=str(LATEST_0_8_2), to_message_format_version=None, compression_types=["snappy"], new_consumer=False)
    def test_upgrade(self, from_kafka_version, to_message_format_version, compression_types,
                     new_consumer=True, security_protocol="PLAINTEXT"):
        """Test upgrade of Kafka broker cluster from 0.8.2, 0.9.0, 0.10.0, 0.10.1, 0.10.2 to the current version

        from_kafka_version is a Kafka version to upgrade from

        If to_message_format_version is None, it means that we will upgrade to default (latest)
        message format version. It is possible to upgrade to 0.10 brokers but still use message
        format version 0.9

        - Start 3 node broker cluster on version 'from_kafka_version'
        - Start producer and consumer in the background
        - Perform two-phase rolling upgrade
            - First phase: upgrade brokers to 0.10 with inter.broker.protocol.version set to
              from_kafka_version and log.message.format.version set to from_kafka_version
            - Second phase: remove inter.broker.protocol.version config with rolling bounce; if
              to_message_format_version is set to 0.9, set log.message.format.version to
              to_message_format_version, otherwise remove log.message.format.version config
        - Finally, validate that every message acked by the producer was consumed by the consumer
        """
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=KafkaVersion(from_kafka_version),
                                  topics={self.topic: {"partitions": 3,
                                                       "replication-factor": 3,
                                                       'configs': {"min.insync.replicas": 2}}})
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()

        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka,
                                           self.topic, throughput=self.producer_throughput,
                                           message_validator=is_int,
                                           compression_types=compression_types,
                                           version=KafkaVersion(from_kafka_version))

        # NOTE(review): str <= KafkaVersion relies on KafkaVersion's
        # LooseVersion-style comparison accepting strings — confirm.
        # Brokers this old are not expected to have registered a cluster id
        # yet (presumably introduced after 0.10.0 — verify against KIP-78).
        if from_kafka_version <= LATEST_0_10_0:
            assert self.zk.query("/cluster/id") is None

        # TODO - reduce the timeout
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka,
                                        self.topic, consumer_timeout_ms=30000,
                                        new_consumer=new_consumer, message_validator=is_int,
                                        version=KafkaVersion(from_kafka_version))

        self.run_produce_consume_validate(
            core_test_action=lambda: self.perform_upgrade(from_kafka_version,
                                                          to_message_format_version))

        # After the upgrade the cluster must have registered a well-formed
        # cluster id (a 22-character string under the "id" key).
        cluster_id_json = self.zk.query("/cluster/id")
        assert cluster_id_json is not None
        try:
            cluster_id = json.loads(cluster_id_json)
        except ValueError:
            # Fix: the original bare `except` swallowed the parse error and the
            # code then crashed with a NameError on the unbound `cluster_id`.
            # Log the offending payload and re-raise so the real failure is visible.
            self.logger.debug("Data in /cluster/id znode could not be parsed. Data = %s" % cluster_id_json)
            raise

        self.logger.debug("Cluster id [%s]", cluster_id)
        assert len(cluster_id["id"]) == 22
class ClientCompatibilityTestNewBroker(ProduceConsumeValidateTest):
    """Runs old/new producer and consumer versions against a current (DEV_BRANCH)
    broker cluster and validates that every acked message is consumed."""

    def __init__(self, test_context):
        super(ClientCompatibilityTestNewBroker, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        # ZooKeeper is only needed when the test runs with a zk-based quorum;
        # with a Raft-based metadata quorum self.zk stays None.
        self.zk = ZookeeperService(self.test_context, num_nodes=1) if quorum.for_test(self.test_context) == quorum.zk else None

        if self.zk:
            self.zk.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1
        self.messages_per_producer = 1000

    @cluster(num_nodes=6)
    @matrix(producer_version=[str(DEV_BRANCH)], consumer_version=[str(DEV_BRANCH)], compression_types=[["snappy"]], timestamp_type=[str("LogAppendTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(DEV_BRANCH)], consumer_version=[str(DEV_BRANCH)], compression_types=[["none"]], timestamp_type=[str("LogAppendTime")], metadata_quorum=quorum.all_non_upgrade)
    @parametrize(producer_version=str(DEV_BRANCH), consumer_version=str(LATEST_0_9), compression_types=["none"], new_consumer=False, timestamp_type=None)
    @matrix(producer_version=[str(DEV_BRANCH)], consumer_version=[str(LATEST_0_9)], compression_types=[["snappy"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_2)], consumer_version=[str(LATEST_2_2)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_3)], consumer_version=[str(LATEST_2_3)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_4)], consumer_version=[str(LATEST_2_4)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_5)], consumer_version=[str(LATEST_2_5)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_6)], consumer_version=[str(LATEST_2_6)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_7)], consumer_version=[str(LATEST_2_7)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_1)], consumer_version=[str(LATEST_2_1)], compression_types=[["zstd"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_2_0)], consumer_version=[str(LATEST_2_0)], compression_types=[["snappy"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_1_1)], consumer_version=[str(LATEST_1_1)], compression_types=[["lz4"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_1_0)], consumer_version=[str(LATEST_1_0)], compression_types=[["none"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_11_0)], consumer_version=[str(LATEST_0_11_0)], compression_types=[["gzip"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_10_2)], consumer_version=[str(LATEST_0_10_2)], compression_types=[["lz4"]], timestamp_type=[str("CreateTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_10_1)], consumer_version=[str(LATEST_0_10_1)], compression_types=[["snappy"]], timestamp_type=[str("LogAppendTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_10_0)], consumer_version=[str(LATEST_0_10_0)], compression_types=[["snappy"]], timestamp_type=[str("LogAppendTime")], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_9)], consumer_version=[str(DEV_BRANCH)], compression_types=[["none"]], timestamp_type=[None], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_9)], consumer_version=[str(DEV_BRANCH)], compression_types=[["snappy"]], timestamp_type=[None], metadata_quorum=quorum.all_non_upgrade)
    @matrix(producer_version=[str(LATEST_0_9)], consumer_version=[str(LATEST_0_9)], compression_types=[["snappy"]], timestamp_type=[str("LogAppendTime")], metadata_quorum=quorum.all_non_upgrade)
    @parametrize(producer_version=str(LATEST_0_8_2), consumer_version=str(LATEST_0_8_2), compression_types=["none"], new_consumer=False, timestamp_type=None)
    def test_compatibility(self, producer_version, consumer_version, compression_types,
                           new_consumer=True, timestamp_type=None, metadata_quorum=quorum.zk):
        """Run a producer at producer_version and a consumer at consumer_version against
        a 3-node DEV_BRANCH broker cluster, then validate that every message acked by
        the producer was consumed.
        """
        # The old (zk-based) consumer cannot work without ZooKeeper, so it is
        # incompatible with a Raft-based metadata quorum.
        if not new_consumer and metadata_quorum != quorum.zk:
            raise Exception("ZooKeeper-based consumers are not supported when using a Raft-based metadata quorum")
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=DEV_BRANCH,
                                  topics={self.topic: {"partitions": 3,
                                                       "replication-factor": 3,
                                                       'configs': {"min.insync.replicas": 2}}},
                                  controller_num_nodes_override=1)
        # Pin the broker-side message timestamp type when the parametrization asks for one.
        for node in self.kafka.nodes:
            if timestamp_type is not None:
                node.config[config_property.MESSAGE_TIMESTAMP_TYPE] = timestamp_type
        self.kafka.start()

        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka,
                                           self.topic, throughput=self.producer_throughput,
                                           message_validator=is_int,
                                           compression_types=compression_types,
                                           version=KafkaVersion(producer_version))

        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka,
                                        self.topic, consumer_timeout_ms=30000,
                                        new_consumer=new_consumer, message_validator=is_int,
                                        version=KafkaVersion(consumer_version))

        # Produce until every producer has written messages_per_producer records,
        # then validate consumption.
        self.run_produce_consume_validate(lambda: wait_until(
            lambda: self.producer.each_produced_at_least(self.messages_per_producer) == True,
            timeout_sec=120, backoff_sec=1,
            err_msg="Producer did not produce all messages in reasonable amount of time"))
class StreamsCooperativeRebalanceUpgradeTest(Test):
    """
    Test of a rolling upgrade from eager rebalance to
    cooperative rebalance
    """
    source_topic = "source"
    sink_topic = "sink"
    task_delimiter = "#"
    report_interval = "1000"
    # Log/stdout patterns the test waits for (regex / literal fragments).
    processing_message = "Processed [0-9]* records so far"
    stopped_message = "COOPERATIVE-REBALANCE-TEST-CLIENT-CLOSED"
    running_state_msg = "STREAMS in a RUNNING State"
    cooperative_turned_off_msg = "Eager rebalancing enabled now for upgrade from %s"
    cooperative_enabled_msg = "Cooperative rebalancing enabled now"
    # Phase prefixes prepended to the expected messages during each bounce.
    first_bounce_phase = "first_bounce_phase-"
    second_bounce_phase = "second_bounce_phase-"

    # !!CAUTION!!: THIS LIST OF VERSIONS IS FIXED, NO VERSIONS MUST BE ADDED
    streams_eager_rebalance_upgrade_versions = [str(LATEST_0_10_0), str(LATEST_0_10_1),
                                                str(LATEST_0_10_2), str(LATEST_0_11_0),
                                                str(LATEST_1_0), str(LATEST_1_1),
                                                str(LATEST_2_0), str(LATEST_2_1),
                                                str(LATEST_2_2), str(LATEST_2_3)]

    def __init__(self, test_context):
        super(StreamsCooperativeRebalanceUpgradeTest, self).__init__(test_context)
        self.topics = {
            self.source_topic: {'partitions': 9},
            self.sink_topic: {'partitions': 9}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context, 1, self.kafka,
                                           self.source_topic, throughput=1000, acks=1)

    @matrix(upgrade_from_version=streams_eager_rebalance_upgrade_versions)
    def test_upgrade_to_cooperative_rebalance(self, upgrade_from_version):
        """Run three Streams clients on upgrade_from_version, then roll them twice
        (first bounce with "upgrade.from" set, second without) and verify processing
        continues and final task assignments are unique across clients.
        """
        self.zookeeper.start()
        self.kafka.start()

        processor1 = CooperativeRebalanceUpgradeService(self.test_context, self.kafka)
        processor2 = CooperativeRebalanceUpgradeService(self.test_context, self.kafka)
        processor3 = CooperativeRebalanceUpgradeService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors without upgrade_from config; normal operations mode
        self.logger.info("Starting all streams clients in normal running mode")
        for processor in processors:
            processor.set_version(upgrade_from_version)
            self.set_props(processor)
            processor.CLEAN_NODE_ENABLED = False
            # can't use state as older version don't have state listener
            # so just verify up and running
            verify_running(processor, self.processing_message)

        # all running rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.processing_message)

        # first rolling bounce with "upgrade.from" config set
        previous_phase = ""
        self.maybe_upgrade_rolling_bounce_and_verify(processors,
                                                     previous_phase,
                                                     self.first_bounce_phase,
                                                     upgrade_from_version)

        # All nodes processing, rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.first_bounce_phase + self.processing_message)

        # second rolling bounce without "upgrade.from" config
        self.maybe_upgrade_rolling_bounce_and_verify(processors,
                                                     self.first_bounce_phase,
                                                     self.second_bounce_phase)

        # All nodes processing, rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.second_bounce_phase + self.processing_message)

        # now verify tasks are unique
        for processor in processors:
            self.get_tasks_for_processor(processor)
            self.logger.info("Active tasks %s" % processor.active_tasks)

        overlapping_tasks = processor1.active_tasks.intersection(processor2.active_tasks)
        assert len(overlapping_tasks) == int(0), \
            "Final task assignments are not unique %s %s" % (processor1.active_tasks, processor2.active_tasks)

        overlapping_tasks = processor1.active_tasks.intersection(processor3.active_tasks)
        assert len(overlapping_tasks) == int(0), \
            "Final task assignments are not unique %s %s" % (processor1.active_tasks, processor3.active_tasks)

        overlapping_tasks = processor2.active_tasks.intersection(processor3.active_tasks)
        assert len(overlapping_tasks) == int(0), \
            "Final task assignments are not unique %s %s" % (processor2.active_tasks, processor3.active_tasks)

        # test done close all down
        stop_processors(processors, self.second_bounce_phase + self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def maybe_upgrade_rolling_bounce_and_verify(self, processors,
                                                previous_phase,
                                                current_phase,
                                                upgrade_from_version=None):
        """Bounce each processor in turn onto the current (dev) version.

        When upgrade_from_version is given, "upgrade.from" is set so the client
        logs the eager-rebalancing message; when it is None the config is
        cleared and the cooperative-rebalancing message is expected instead.
        """
        for processor in processors:
            # stop the processor in prep for setting "update.from" or removing "update.from"
            verify_stopped(processor, previous_phase + self.stopped_message)
            # upgrade to version with cooperative rebalance
            processor.set_version("")
            processor.set_upgrade_phase(current_phase)

            if upgrade_from_version is not None:
                # need to remove minor version numbers for check of valid upgrade from numbers
                upgrade_version = upgrade_from_version[:upgrade_from_version.rfind('.')]
                rebalance_mode_msg = self.cooperative_turned_off_msg % upgrade_version
            else:
                upgrade_version = None
                rebalance_mode_msg = self.cooperative_enabled_msg

            self.set_props(processor, upgrade_version)
            node = processor.node
            # Watch stdout and the log file simultaneously: the rebalance-mode
            # message goes to the log, the phase/processing messages to stdout.
            with node.account.monitor_log(processor.STDOUT_FILE) as stdout_monitor:
                with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                    processor.start()
                    # verify correct rebalance mode either turned off for upgrade or enabled after upgrade
                    log_monitor.wait_until(rebalance_mode_msg,
                                           timeout_sec=60,
                                           err_msg="Never saw '%s' message " % rebalance_mode_msg + str(processor.node.account))

                # verify rebalanced into a running state
                rebalance_msg = current_phase + self.running_state_msg
                stdout_monitor.wait_until(rebalance_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw '%s' message " % rebalance_msg + str(processor.node.account))

                # verify processing
                verify_processing_msg = current_phase + self.processing_message
                stdout_monitor.wait_until(verify_processing_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw '%s' message " % verify_processing_msg + str(processor.node.account))

    def verify_processing(self, processor, pattern):
        """Block until `pattern` shows up in the processor's stdout (60s timeout)."""
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def get_tasks_for_processor(self, processor):
        """Grep the last TASK-ASSIGNMENTS line from the processor's stdout and feed it
        to processor.set_tasks; retries up to 5 times, giving up silently otherwise."""
        retries = 0
        while retries < 5:
            found_tasks = list(processor.node.account.ssh_capture("grep TASK-ASSIGNMENTS %s | tail -n 1" % processor.STDOUT_FILE, allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found_tasks)
            if len(found_tasks) > 0:
                task_string = str(found_tasks[0]).strip()
                self.logger.info("Converted %s from assigned task check" % task_string)
                processor.set_tasks(task_string)
                return
            retries += 1
            time.sleep(1)

        return

    def set_props(self, processor, upgrade_from=None):
        """Push the topic names, report interval and (optional) upgrade.from onto the service."""
        processor.SOURCE_TOPIC = self.source_topic
        processor.SINK_TOPIC = self.sink_topic
        processor.REPORT_INTERVAL = self.report_interval
        processor.UPGRADE_FROM = upgrade_from
class ReplicationTest(ProduceConsumeValidateTest):
    """Durability checks under broker failure.

    Consuming everything in the topic is the subtle part. Console consumer
    reads multiple partitions from a single thread, so "wait for the last
    message" can stop too early (no cross-partition ordering guarantee), and
    "wait until num_consumed == num_acked" can exit early on duplicates
    (not an issue here since producer retries==0, but fragile). Instead we
    rely on consumer.timeout.ms, which fires on the gap between successively
    consumed messages: since the producer runs to completion before the
    consumer starts, that gap is a reliable nothing-left-to-consume signal.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReplicationTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # One fully-replicated topic tolerating a single out-of-sync replica.
        topic_config = {
            "partitions": 3,
            "replication-factor": 3,
            'configs': {"min.insync.replicas": 2}
        }
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk,
                                  topics={self.topic: topic_config})
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        """Override this since we're adding services outside of the constructor"""
        base = super(ReplicationTest, self).min_cluster_size()
        return base + self.num_producers + self.num_consumers

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader"],
            security_protocol=["PLAINTEXT", "SASL_SSL"])
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["controller"],
            security_protocol=["PLAINTEXT", "SASL_SSL"])
    @matrix(failure_mode=["hard_bounce"],
            broker_type=["leader"],
            security_protocol=["SASL_SSL"],
            client_sasl_mechanism=["PLAIN"],
            interbroker_sasl_mechanism=["PLAIN", "GSSAPI"])
    @parametrize(failure_mode="hard_bounce",
                 broker_type="leader",
                 security_protocol="SASL_SSL",
                 client_sasl_mechanism="SCRAM-SHA-256",
                 interbroker_sasl_mechanism="SCRAM-SHA-512")
    def test_replication_with_broker_failure(self, failure_mode, security_protocol, broker_type,
                                             client_sasl_mechanism="GSSAPI",
                                             interbroker_sasl_mechanism="GSSAPI"):
        """Replication tests.

        Verify that replication provides simple durability guarantees: data
        acked by the brokers stays available for consumption in the face of
        various failure scenarios.

        Setup: 1 zk, 3 kafka nodes, 1 topic with partitions=3,
        replication-factor=3, and min.insync.replicas=2

        - Produce messages in the background
        - Consume messages in the background
        - Drive broker failures (shutdown, or bounce repeatedly with kill -15 or kill -9)
        - When done driving failures, stop producing, and finish consuming
        - Validate that every acked message was consumed
        """
        kafka = self.kafka
        kafka.security_protocol = security_protocol
        kafka.interbroker_security_protocol = security_protocol
        kafka.client_sasl_mechanism = client_sasl_mechanism
        kafka.interbroker_sasl_mechanism = interbroker_sasl_mechanism

        # Use the new consumer exactly when the chosen client protocol is not
        # plaintext (same decision the ternary form of this code made).
        use_new_consumer = kafka.security_protocol != "PLAINTEXT"

        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, kafka,
                                        self.topic, new_consumer=use_new_consumer,
                                        consumer_timeout_ms=60000, message_validator=is_int)
        self.producer = VerifiableProducer(self.test_context, self.num_producers, kafka,
                                           self.topic, throughput=self.producer_throughput)
        kafka.start()

        # `failures` maps each failure_mode name to a driver function.
        self.run_produce_consume_validate(
            core_test_action=lambda: failures[failure_mode](self, broker_type))
def test_upgrade_downgrade_brokers(self, from_version, to_version):
    """
    Start a smoke test client then perform rolling upgrades on the broker.

    :param from_version: broker version the cluster starts on
    :param to_version: broker version every node is rolled to
    """
    if from_version == to_version:
        return

    self.replication = 3
    self.partitions = 1
    self.isr = 2
    # All smoke-test topics share one identical configuration; build the
    # mapping with a comprehension instead of repeating the literal ten
    # times. Each iteration creates fresh nested dicts, so topics do not
    # share mutable config objects, and insertion order matches the
    # original literal.
    self.topics = {
        topic: {'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {"min.insync.replicas": self.isr}}
        for topic in ('echo', 'data', 'min', 'max', 'sum',
                      'dif', 'cnt', 'avg', 'wcnt', 'tagg')
    }

    # Setup phase
    self.zk = ZookeeperService(self.test_context, num_nodes=1)
    self.zk.start()

    # number of nodes needs to be >= 3 for the smoke test
    self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                              version=KafkaVersion(from_version), topics=self.topics)
    self.kafka.start()

    # allow some time for topics to be created
    time.sleep(10)

    self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
    self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

    self.driver.start()
    self.processor1.start()
    time.sleep(15)

    self.perform_broker_upgrade(to_version)

    time.sleep(15)
    self.driver.wait()
    self.driver.stop()

    self.processor1.stop()

    # grep with allow_fail=False fails the test if the success markers
    # never appeared in the driver/client stdout files.
    node = self.driver.node
    node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
    self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
                                             allow_fail=False)
class ClientCompatibilityFeaturesTest(Test):
    """
    Tests clients for the presence or absence of specific features when communicating with brokers
    with various versions. Relies on ClientCompatibilityTest.java for much of the functionality.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ClientCompatibilityFeaturesTest, self).__init__(test_context=test_context)

        self.zk = ZookeeperService(test_context, num_nodes=3)

        # Generate a unique topic name so repeated runs never collide.
        topic_name = "client_compat_features_topic_%d%d" % (int(time.time()), randint(0, 2147483647))
        self.topics = {topic_name: {
            "partitions": 1,  # Use only one partition to avoid worrying about ordering
            "replication-factor": 3
        }}

        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk, topics=self.topics)

    def invoke_compatibility_program(self, features):
        """Run ClientCompatibilityTest.java against the cluster and raise on failure.

        :param features: mapping of feature-flag name -> expected value; each
                         entry becomes a ``--<name> <value>`` argument to the tool.
        """
        # Run the compatibility test on the first Kafka node.
        node = self.zk.nodes[0]
        # Fix: dict views are not subscriptable on Python 3, so
        # `self.topics.keys()[0]` raised TypeError; next(iter(...)) works on
        # both Python 2 and 3.
        topic = next(iter(self.topics))
        cmd = ("%s org.apache.kafka.tools.ClientCompatibilityTest "
               "--bootstrap-server %s "
               "--num-cluster-nodes %d "
               "--topic %s " % (self.zk.path.script("kafka-run-class.sh", node),
                                self.kafka.bootstrap_servers(),
                                len(self.kafka.nodes),
                                topic))
        # Fix: dict.iteritems() was removed in Python 3; items() exists on both.
        for k, v in features.items():
            cmd = cmd + ("--%s %s " % (k, v))
        results_dir = TestContext.results_dir(self.test_context, 0)
        os.makedirs(results_dir)
        ssh_log_file = "%s/%s" % (results_dir, "client_compatibility_test_output.txt")
        try:
            self.logger.info("Running %s" % cmd)
            run_command(node, cmd, ssh_log_file)
        except Exception:
            # Point the reader at the captured output before propagating.
            self.logger.info("** Command failed. See %s for log messages." % ssh_log_file)
            raise

    @parametrize(broker_version=str(DEV_BRANCH))
    @parametrize(broker_version=str(LATEST_0_10_0))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_11_0))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_2))
    def run_compatibility_test(self, broker_version):
        """Bring up a cluster at broker_version and check the expected feature set."""
        self.zk.start()
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()
        features = get_broker_features(broker_version)
        self.invoke_compatibility_program(features)
def test_metadata_upgrade(self, new_version):
    """
    Starts 3 KafkaStreams instances with version 0.10.0, and upgrades one-by-one to <new_version>
    """
    self.zk = ZookeeperService(self.test_context, num_nodes=1)
    self.zk.start()

    self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
    self.kafka.start()

    self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
    # Keep the driver running until we stop it explicitly at the end.
    self.driver.disable_auto_terminate()
    self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
    self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
    self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

    self.driver.start()
    self.start_all_nodes_with(str(LATEST_0_10_0))

    self.processors = [self.processor1, self.processor2, self.processor3]

    # counter numbers the bounces across BOTH passes (1..6)
    counter = 1
    random.seed()

    # first rolling bounce
    # shuffle so the bounce order differs between runs
    random.shuffle(self.processors)
    for p in self.processors:
        p.CLEAN_NODE_ENABLED = False
        self.do_rolling_bounce(p, "0.10.0", new_version, counter)
        counter = counter + 1

    # second rolling bounce
    random.shuffle(self.processors)
    for p in self.processors:
        # upgrade-from metadata version cleared ("") on the second pass
        self.do_rolling_bounce(p, "", new_version, counter)
        counter = counter + 1

    # shutdown
    self.driver.stop()
    self.driver.wait()

    random.shuffle(self.processors)
    for p in self.processors:
        node = p.node
        # Watch stdout while stopping so we can confirm a clean close.
        with node.account.monitor_log(p.STDOUT_FILE) as monitor:
            p.stop()
            monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                               timeout_sec=60,
                               err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

    # NOTE(review): driver.stop() was already called above before wait();
    # this second call looks redundant — confirm against the service impl.
    self.driver.stop()
class Benchmark(Test):
    """A benchmark of Kafka producer/consumer performance.

    This replicates the test run here:
    https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines
    """

    def __init__(self, test_context):
        super(Benchmark, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 3
        self.topics = {
            TOPIC_REP_ONE: {'partitions': 6, 'replication-factor': 1},
            TOPIC_REP_THREE: {'partitions': 6, 'replication-factor': 3}
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

        self.msgs_large = 10000000
        self.batch_size = 8 * 1024
        self.buffer_memory = 64 * 1024 * 1024
        self.msg_sizes = [10, 100, 1000, 10000, 100000]
        self.target_data_size = 128 * 1024 * 1024
        self.target_data_size_gb = self.target_data_size / float(1024 * 1024 * 1024)

    def setUp(self):
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol,
                    version, tls_version=None):
        """Bring up the broker cluster with the given security settings and version."""
        self.kafka = KafkaService(
            self.test_context, self.num_brokers,
            self.zk, security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol,
            topics=self.topics, version=version, tls_version=tls_version)
        self.kafka.log_level = "INFO"  # We don't DEBUG logging here
        self.kafka.start()

    @cluster(num_nodes=5)
    @parametrize(acks=1, topic=TOPIC_REP_ONE)
    @parametrize(acks=1, topic=TOPIC_REP_THREE)
    @parametrize(acks=-1, topic=TOPIC_REP_THREE)
    @matrix(acks=[1], topic=[TOPIC_REP_THREE],
            message_size=[10, 100, 1000, 10000, 100000],
            compression_type=["none", "snappy"], security_protocol=['SSL'],
            tls_version=['TLSv1.2', 'TLSv1.3'])
    @matrix(acks=[1], topic=[TOPIC_REP_THREE],
            message_size=[10, 100, 1000, 10000, 100000],
            compression_type=["none", "snappy"], security_protocol=['PLAINTEXT'])
    @cluster(num_nodes=7)
    @parametrize(acks=1, topic=TOPIC_REP_THREE, num_producers=3)
    def test_producer_throughput(self, acks, topic, num_producers=1,
                                 message_size=DEFAULT_RECORD_SIZE,
                                 compression_type="none",
                                 security_protocol='PLAINTEXT', tls_version=None,
                                 client_version=str(DEV_BRANCH),
                                 broker_version=str(DEV_BRANCH)):
        """
        Setup: 1 node zk + 3 node kafka cluster
        Produce ~128MB worth of messages to a topic with 6 partitions. Required acks, topic replication factor,
        security protocol and message size are varied depending on arguments injected into this test.

        Collect and return aggregate throughput statistics after all messages have been acknowledged.
        (This runs ProducerPerformance.java under the hood)
        """
        client_version = KafkaVersion(client_version)
        broker_version = KafkaVersion(broker_version)
        self.validate_versions(client_version, broker_version)
        self.start_kafka(security_protocol, security_protocol, broker_version, tls_version)
        # Always generate the same total amount of data
        nrecords = int(self.target_data_size / message_size)
        self.producer = ProducerPerformanceService(
            self.test_context, num_producers, self.kafka, topic=topic,
            num_records=nrecords, record_size=message_size, throughput=-1,
            version=client_version,
            settings={
                'acks': acks,
                'compression.type': compression_type,
                'batch.size': self.batch_size,
                'buffer.memory': self.buffer_memory})
        self.producer.run()
        return compute_aggregate_throughput(self.producer)

    @cluster(num_nodes=5)
    @matrix(security_protocol=['SSL'], interbroker_security_protocol=['PLAINTEXT'],
            tls_version=['TLSv1.2', 'TLSv1.3'], compression_type=["none", "snappy"])
    @matrix(security_protocol=['PLAINTEXT'], compression_type=["none", "snappy"])
    def test_long_term_producer_throughput(self, compression_type="none",
                                           security_protocol='PLAINTEXT',
                                           tls_version=None,
                                           interbroker_security_protocol=None,
                                           client_version=str(DEV_BRANCH),
                                           broker_version=str(DEV_BRANCH)):
        """
        Setup: 1 node zk + 3 node kafka cluster
        Produce 10e6 100 byte messages to a topic with 6 partitions, replication-factor 3, and acks=1.

        Collect and return aggregate throughput statistics after all messages have been acknowledged.
        (This runs ProducerPerformance.java under the hood)
        """
        client_version = KafkaVersion(client_version)
        broker_version = KafkaVersion(broker_version)
        self.validate_versions(client_version, broker_version)
        if interbroker_security_protocol is None:
            interbroker_security_protocol = security_protocol
        self.start_kafka(security_protocol, interbroker_security_protocol,
                         broker_version, tls_version)
        self.producer = ProducerPerformanceService(
            self.test_context, 1, self.kafka,
            topic=TOPIC_REP_THREE, num_records=self.msgs_large,
            record_size=DEFAULT_RECORD_SIZE, throughput=-1, version=client_version,
            settings={
                'acks': 1,
                'compression.type': compression_type,
                'batch.size': self.batch_size,
                'buffer.memory': self.buffer_memory
            },
            intermediate_stats=True)
        self.producer.run()

        summary = ["Throughput over long run, data > memory:"]
        data = {}
        # FIXME we should be generating a graph too
        # Try to break it into 5 blocks, but fall back to a smaller number if
        # there aren't even 5 elements
        block_size = max(len(self.producer.stats[0]) // 5, 1)
        nblocks = len(self.producer.stats[0]) // block_size

        for i in range(nblocks):
            subset = self.producer.stats[0][i * block_size:min(
                (i + 1) * block_size, len(self.producer.stats[0]))]
            if len(subset) == 0:
                summary.append(" Time block %d: (empty)" % i)
                data[i] = None
            else:
                records_per_sec = sum(
                    [stat['records_per_sec'] for stat in subset]) / float(len(subset))
                mb_per_sec = sum([stat['mbps'] for stat in subset]) / float(len(subset))

                summary.append(" Time block %d: %f rec/sec (%f MB/s)" %
                               (i, records_per_sec, mb_per_sec))
                data[i] = throughput(records_per_sec, mb_per_sec)

        self.logger.info("\n".join(summary))
        return data

    @cluster(num_nodes=5)
    @matrix(security_protocol=['SSL'], interbroker_security_protocol=['PLAINTEXT'],
            tls_version=['TLSv1.2', 'TLSv1.3'], compression_type=["none", "snappy"])
    @matrix(security_protocol=['PLAINTEXT'], compression_type=["none", "snappy"])
    @cluster(num_nodes=6)
    @matrix(security_protocol=['SASL_PLAINTEXT', 'SASL_SSL'],
            compression_type=["none", "snappy"])
    def test_end_to_end_latency(self, compression_type="none",
                                security_protocol="PLAINTEXT", tls_version=None,
                                interbroker_security_protocol=None,
                                client_version=str(DEV_BRANCH),
                                broker_version=str(DEV_BRANCH)):
        """
        Setup: 1 node zk + 3 node kafka cluster
        Produce (acks = 1) and consume 10e3 messages to a topic with 6 partitions and replication-factor 3,
        measuring the latency between production and consumption of each message.

        Return aggregate latency statistics.
        (Under the hood, this simply runs EndToEndLatency.scala)
        """
        client_version = KafkaVersion(client_version)
        broker_version = KafkaVersion(broker_version)
        self.validate_versions(client_version, broker_version)
        if interbroker_security_protocol is None:
            interbroker_security_protocol = security_protocol
        self.start_kafka(security_protocol, interbroker_security_protocol,
                         broker_version, tls_version)
        self.logger.info("BENCHMARK: End to end latency")
        self.perf = EndToEndLatencyService(
            self.test_context, 1, self.kafka,
            topic=TOPIC_REP_THREE, num_records=10000,
            compression_type=compression_type, version=client_version)
        self.perf.run()
        return latency(self.perf.results[0]['latency_50th_ms'],
                       self.perf.results[0]['latency_99th_ms'],
                       self.perf.results[0]['latency_999th_ms'])

    @cluster(num_nodes=6)
    @matrix(security_protocol=['SSL'], interbroker_security_protocol=['PLAINTEXT'],
            tls_version=['TLSv1.2', 'TLSv1.3'], compression_type=["none", "snappy"])
    @matrix(security_protocol=['PLAINTEXT'], compression_type=["none", "snappy"])
    def test_producer_and_consumer(self, compression_type="none",
                                   security_protocol="PLAINTEXT", tls_version=None,
                                   interbroker_security_protocol=None,
                                   client_version=str(DEV_BRANCH),
                                   broker_version=str(DEV_BRANCH)):
        """
        Setup: 1 node zk + 3 node kafka cluster
        Concurrently produce and consume 10e6 messages with a single producer and a single consumer,

        Return aggregate throughput statistics for both producer and consumer.
        (Under the hood, this runs ProducerPerformance.java, and ConsumerPerformance.scala)
        """
        client_version = KafkaVersion(client_version)
        broker_version = KafkaVersion(broker_version)
        self.validate_versions(client_version, broker_version)
        if interbroker_security_protocol is None:
            interbroker_security_protocol = security_protocol
        self.start_kafka(security_protocol, interbroker_security_protocol,
                         broker_version, tls_version)
        num_records = 10 * 1000 * 1000  # 10e6

        self.producer = ProducerPerformanceService(
            self.test_context, 1, self.kafka,
            topic=TOPIC_REP_THREE, num_records=num_records,
            record_size=DEFAULT_RECORD_SIZE, throughput=-1, version=client_version,
            settings={
                'acks': 1,
                'compression.type': compression_type,
                'batch.size': self.batch_size,
                'buffer.memory': self.buffer_memory})
        self.consumer = ConsumerPerformanceService(
            self.test_context, 1, self.kafka,
            topic=TOPIC_REP_THREE, messages=num_records)
        Service.run_parallel(self.producer, self.consumer)

        data = {
            "producer": compute_aggregate_throughput(self.producer),
            "consumer": compute_aggregate_throughput(self.consumer)
        }
        summary = ["Producer + consumer:", str(data)]
        self.logger.info("\n".join(summary))
        return data

    @cluster(num_nodes=6)
    @matrix(security_protocol=['SSL'], interbroker_security_protocol=['PLAINTEXT'],
            tls_version=['TLSv1.2', 'TLSv1.3'], compression_type=["none", "snappy"])
    @matrix(security_protocol=['PLAINTEXT'], compression_type=["none", "snappy"])
    def test_consumer_throughput(self, compression_type="none",
                                 security_protocol="PLAINTEXT", tls_version=None,
                                 interbroker_security_protocol=None, num_consumers=1,
                                 client_version=str(DEV_BRANCH),
                                 broker_version=str(DEV_BRANCH)):
        """
        Consume 10e6 100-byte messages with 1 or more consumers from a topic with 6 partitions
        and report throughput.
        """
        client_version = KafkaVersion(client_version)
        broker_version = KafkaVersion(broker_version)
        self.validate_versions(client_version, broker_version)
        if interbroker_security_protocol is None:
            interbroker_security_protocol = security_protocol
        self.start_kafka(security_protocol, interbroker_security_protocol,
                         broker_version, tls_version)
        num_records = 10 * 1000 * 1000  # 10e6

        # seed kafka w/messages
        self.producer = ProducerPerformanceService(
            self.test_context, 1, self.kafka,
            topic=TOPIC_REP_THREE, num_records=num_records,
            record_size=DEFAULT_RECORD_SIZE, throughput=-1, version=client_version,
            settings={
                'acks': 1,
                'compression.type': compression_type,
                'batch.size': self.batch_size,
                'buffer.memory': self.buffer_memory})
        self.producer.run()

        # consume
        self.consumer = ConsumerPerformanceService(
            self.test_context, num_consumers, self.kafka,
            topic=TOPIC_REP_THREE, messages=num_records)
        self.consumer.group = "test-consumer-group"
        self.consumer.run()
        return compute_aggregate_throughput(self.consumer)

    def validate_versions(self, client_version, broker_version):
        """Assert the client is not newer than the broker.

        BUG FIX: the original message expression was
        `"..." (client_version, broker_version)` — the `%` operator was
        missing, so a failing assertion *called* the string and died with
        `TypeError: 'str' object is not callable` instead of reporting the
        versions. The `%` restores the intended formatted message.
        """
        assert client_version <= broker_version, \
            "Client version %s should be <= than broker version %s" % (client_version, broker_version)
class StreamsBrokerCompatibility(Test):
    """
    These tests validates that
    - Streams works for older brokers 0.11 (or newer)
    - Streams w/ EOS-alpha works for older brokers 0.11 (or newer)
    - Streams w/ EOS-v2 works for older brokers 2.5 (or newer)
    - Streams fails fast for older brokers 0.10.0, 0.10.2, and 0.10.1
    - Streams w/ EOS-v2 fails fast for older brokers 2.4 or older
    """

    input = "brokerCompatibilitySourceTopic"
    output = "brokerCompatibilitySinkTopic"

    def __init__(self, test_context):
        super(StreamsBrokerCompatibility, self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # Single-broker cluster; transaction-log replication settings are
        # lowered to 1 so EOS works with only one broker.
        self.kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.zk,
            topics={
                self.input: {'partitions': 1, 'replication-factor': 1},
                self.output: {'partitions': 1, 'replication-factor': 1}
            },
            server_prop_overrides=[
                ["transaction.state.log.replication.factor", "1"],
                ["transaction.state.log.min.isr", "1"]
            ])
        self.consumer = VerifiableConsumer(
            test_context,
            1,
            self.kafka,
            self.output,
            "stream-broker-compatibility-verify-consumer")

    def setUp(self):
        self.zk.start()

    def _assert_compatible(self, broker_version, processing_guarantee):
        """Shared body of the happy-path tests: start the given broker version,
        run the compatibility app with <processing_guarantee>, and verify that
        at least one record reaches the sink topic."""
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka,
                                                      processing_guarantee)
        processor.start()

        self.consumer.start()

        processor.wait()

        wait_until(
            lambda: self.consumer.total_consumed() > 0,
            timeout_sec=30,
            err_msg="Did expect to read a message but got none within 30 seconds.")

        self.consumer.stop()
        self.kafka.stop()

    @cluster(num_nodes=4)
    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_compatible_brokers_eos_disabled(self, broker_version):
        self._assert_compatible(broker_version, "at_least_once")

    @cluster(num_nodes=4)
    @parametrize(broker_version=str(LATEST_2_6))
    @parametrize(broker_version=str(LATEST_2_5))
    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_compatible_brokers_eos_alpha_enabled(self, broker_version):
        self._assert_compatible(broker_version, "exactly_once")

    @cluster(num_nodes=4)
    @parametrize(broker_version=str(LATEST_2_8))
    @parametrize(broker_version=str(LATEST_2_7))
    @parametrize(broker_version=str(LATEST_2_6))
    @parametrize(broker_version=str(LATEST_2_5))
    def test_compatible_brokers_eos_v2_enabled(self, broker_version):
        self._assert_compatible(broker_version, "exactly_once_v2")

    @cluster(num_nodes=4)
    @parametrize(broker_version=str(LATEST_0_10_2))
    @parametrize(broker_version=str(LATEST_0_10_1))
    @parametrize(broker_version=str(LATEST_0_10_0))
    def test_fail_fast_on_incompatible_brokers(self, broker_version):
        """Streams must die with UnsupportedVersionException on pre-0.11 brokers."""
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka,
                                                      "at_least_once")

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            processor.start()
            # err_msg fixed: the original string was missing its closing quote
            monitor.wait_until(
                'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException',
                timeout_sec=60,
                err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException' on " + str(processor.node.account))

        self.kafka.stop()

    @cluster(num_nodes=4)
    @parametrize(broker_version=str(LATEST_2_4))
    @parametrize(broker_version=str(LATEST_2_3))
    @parametrize(broker_version=str(LATEST_2_2))
    @parametrize(broker_version=str(LATEST_2_1))
    @parametrize(broker_version=str(LATEST_2_0))
    @parametrize(broker_version=str(LATEST_1_1))
    @parametrize(broker_version=str(LATEST_1_0))
    @parametrize(broker_version=str(LATEST_0_11_0))
    def test_fail_fast_on_incompatible_brokers_if_eos_v2_enabled(self, broker_version):
        """EOS-v2 requires broker >= 2.5; older brokers must cause a fast shutdown."""
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        processor = StreamsBrokerCompatibilityService(self.test_context, self.kafka,
                                                      "exactly_once_v2")

        with processor.node.account.monitor_log(processor.STDERR_FILE) as monitor:
            with processor.node.account.monitor_log(processor.LOG_FILE) as log:
                processor.start()
                # "exaclty" typo fixed in the err_msg below; the wait_until
                # regex patterns themselves are unchanged.
                log.wait_until(
                    'Shutting down because the Kafka cluster seems to be on a too old version. Setting processing\.guarantee="exactly_once_v2"/"exactly_once_beta" requires broker version 2\.5 or higher\.',
                    timeout_sec=60,
                    err_msg="Never saw 'Shutting down because the Kafka cluster seems to be on a too old version. Setting `processing.guarantee=\"exactly_once_v2\"/\"exactly_once_beta\"` requires broker version 2.5 or higher.' log message " + str(processor.node.account))
                monitor.wait_until(
                    'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException',
                    timeout_sec=60,
                    err_msg="Never saw 'FATAL: An unexpected exception org.apache.kafka.common.errors.UnsupportedVersionException' error message " + str(processor.node.account))

        self.kafka.stop()
class ClientCompatibilityTest(Test):
    """Checks that mismatched 0.8.X/0.9.X clients and brokers fail in the
    expected, documented way (BufferUnderflowException) rather than working by
    accident or failing arbitrarily."""

    def __init__(self, test_context):
        super(ClientCompatibilityTest, self).__init__(test_context=test_context)

    def setUp(self):
        # 3-node cluster pinned to 0.8.2; each test points a different client
        # generation at it.
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=LATEST_0_8_2,
                                  topics={self.topic: {
                                      "partitions": 3,
                                      "replication-factor": 3,
                                      'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def test_producer_back_compatibility(self):
        """Run 0.9.X java producer against 0.8.X brokers.

        This test documents the fact that java producer v0.9.0.0 and later
        won't run against 0.8.X brokers: the broker responds to a V1 produce
        request with a V0 produce response; the client then tries to parse this
        V0 produce response as a V1 produce response, resulting in a
        BufferUnderflowException.
        """
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            max_messages=100, throughput=self.producer_throughput,
            version=TRUNK)

        node = self.producer.nodes[0]
        # BUG FIX: the original raised the "should not run successfully"
        # Exception *inside* the try-block guarded by a bare `except: pass`,
        # which swallowed it — a wrongly-successful produce could never fail
        # the test. Track success with a flag and raise outside the handler.
        produced_successfully = False
        try:
            self.producer.start()
            self.producer.wait()
            produced_successfully = True
        except Exception:
            # Expected: the 0.9.X producer cannot parse the 0.8.X response
            pass
        finally:
            self.producer.kill_node(node, clean_shutdown=False)

        if produced_successfully:
            raise Exception(
                "0.9.X java producer should not run successfully against 0.8.X broker"
            )

        self.logger.info("Grepping producer log for expected error type")
        node.account.ssh("egrep -m 1 %s %s" % (
            "\"org\.apache\.kafka\.common\.protocol\.types\.SchemaException.*throttle_time_ms.*: java\.nio\.BufferUnderflowException\"",
            self.producer.LOG_FILE), allow_fail=False)

    def test_consumer_back_compatibility(self):
        """Run the scala 0.8.X consumer against an 0.9.X cluster.
        Expect 0.8.X scala consumer to fail with buffer underflow.

        This error is the same as when an 0.9.X producer is run against an
        0.8.X broker: the broker responds to a V1 fetch request with a V0
        fetch response; the client then tries to parse this V0 fetch response
        as a V1 fetch response, resulting in a BufferUnderflowException
        """
        num_messages = 10
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            max_messages=num_messages, throughput=self.producer_throughput,
            version=LATEST_0_8_2)

        # New (0.9.X) consumer should see every message; the old consumer
        # run against... NOTE(review): group ids suggest 09X is the new
        # consumer and 08X the old one — assertions below rely on that.
        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            group_id="consumer-09X", consumer_timeout_ms=10000,
            message_validator=is_int, version=TRUNK)
        self.old_consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            group_id="consumer-08X", consumer_timeout_ms=10000,
            message_validator=is_int, version=LATEST_0_8_2)

        self.producer.run()
        self.consumer.run()
        self.old_consumer.run()

        consumed = len(self.consumer.messages_consumed[1])
        old_consumed = len(self.old_consumer.messages_consumed[1])
        assert old_consumed == num_messages, \
            "Expected 0.8.X scala consumer to consume %d, but only got %d" % (num_messages, old_consumed)
        assert consumed == 0, \
            "Expected 0.9.X scala consumer to fail to consume any messages, but got %d" % consumed

        self.logger.info("Grepping consumer log for expected error type")
        node = self.consumer.nodes[0]
        node.account.ssh("egrep -m 1 %s %s" % (
            "\"java\.nio\.BufferUnderflowException\"",
            self.consumer.LOG_FILE), allow_fail=False)
class SecurityTest(ProduceConsumeValidateTest):
    """
    These tests validate security features.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(SecurityTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {
                                      "partitions": 2,
                                      "replication-factor": 1}
                                  })
        self.num_partitions = 2
        self.timeout_sec = 10000
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def producer_consumer_have_expected_error(self, error):
        # Grep producer and consumer logs on every node for the given error
        # token. ssh raises RemoteCommandError on a non-zero exit (i.e. grep
        # found nothing), so any single miss means the error has not appeared
        # everywhere yet — report False and let the caller retry.
        try:
            for node in self.producer.nodes:
                node.account.ssh("grep %s %s" % (error, self.producer.LOG_FILE))
            for node in self.consumer.nodes:
                node.account.ssh("grep %s %s" % (error, self.consumer.LOG_FILE))
        except RemoteCommandError:
            return False

        return True

    @cluster(num_nodes=7)
    @parametrize(security_protocol='PLAINTEXT', interbroker_security_protocol='SSL')
    @parametrize(security_protocol='SSL', interbroker_security_protocol='PLAINTEXT')
    def test_client_ssl_endpoint_validation_failure(self, security_protocol, interbroker_security_protocol):
        """
        Test that invalid hostname in certificate results in connection failures.
        When security_protocol=SSL, client SSL handshakes are expected to fail due to hostname verification failure.
        When security_protocol=PLAINTEXT and interbroker_security_protocol=SSL, controller connections fail
        with hostname verification failure. Hence clients are expected to fail with LEADER_NOT_AVAILABLE.
        """
        # Install SSL stores with an intentionally invalid hostname BEFORE the
        # brokers start, so all certs minted for this test are bad.
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = interbroker_security_protocol
        SecurityConfig.ssl_stores = TestSslStores(self.test_context.local_scratch_dir,
                                                  valid_hostname=False)

        self.kafka.start()
        self.create_producer_and_consumer()
        self.producer.log_level = "TRACE"  # TRACE so the expected error is sure to hit the log

        self.producer.start()
        self.consumer.start()
        try:
            # Fail quickly if messages are successfully acked
            wait_until(lambda: self.producer.num_acked > 0, timeout_sec=5)
            raise RuntimeError("Messages published successfully but should not have!"
                               " Endpoint validation did not fail with invalid hostname")
        except TimeoutError:
            # expected
            # NOTE(review): presumably ducktape's TimeoutError (raised by
            # wait_until), not the builtin — confirm against this file's imports
            pass

        error = 'SSLHandshakeException' if security_protocol == 'SSL' else 'LEADER_NOT_AVAILABLE'
        wait_until(lambda: self.producer_consumer_have_expected_error(error), timeout_sec=5)
        self.producer.stop()
        self.consumer.stop()
        self.producer.log_level = "INFO"

        # Switch to valid-hostname certs and bounce the brokers: produce and
        # consume should now succeed end-to-end.
        SecurityConfig.ssl_stores.valid_hostname = True
        for node in self.kafka.nodes:
            self.kafka.restart_node(node, clean_shutdown=True)

        self.create_producer_and_consumer()
        self.run_produce_consume_validate()

    def create_producer_and_consumer(self):
        # Fresh client instances; called twice per test (bad certs, then good)
        self.producer = VerifiableProducer(self.test_context, self.num_producers,
                                           self.kafka, self.topic,
                                           throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers,
                                        self.kafka, self.topic,
                                        consumer_timeout_ms=10000,
                                        message_validator=is_int)
class TestUpgrade(ProduceConsumeValidateTest):
    """Rolling-upgrade test: brokers go from 0.8.2 to 0.9.0 under a live
    produce/consume workload, using the two-phase
    inter.broker.protocol.version bounce."""

    def __init__(self, test_context):
        super(TestUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        # CONSISTENCY FIX: per-topic settings belong under a 'configs' key
        # (as ClientCompatibilityTest in this file does); the original placed
        # "min.insync.replicas" at the top level, where it is not a
        # recognized topic-dict key and was silently ignored.
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=LATEST_0_8_2,
                                  topics={self.topic: {
                                      "partitions": 3,
                                      "replication-factor": 3,
                                      'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput, version=LATEST_0_8_2)

        # TODO - reduce the timeout
        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=30000, message_validator=is_int,
            version=LATEST_0_8_2)

    def perform_upgrade(self):
        """Two-phase rolling bounce: first move each broker to the new
        binaries pinned at the 0.8.2.X wire protocol, then bounce again to
        lift the protocol pin."""
        self.logger.info("First pass bounce - rolling upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = TRUNK
            node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] = "0.8.2.X"
            self.kafka.start_node(node)

        self.logger.info("Second pass bounce - remove inter.broker.protocol.version config")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION]
            self.kafka.start_node(node)

    def test_upgrade(self):
        """Test upgrade of Kafka broker cluster from 0.8.2 to 0.9.0

        - Start 3 node broker cluster on version 0.8.2
        - Start producer and consumer in the background
        - Perform two-phase rolling upgrade
            - First phase: upgrade brokers to 0.9.0 with inter.broker.protocol.version set to 0.8.2.X
            - Second phase: remove inter.broker.protocol.version config with rolling bounce
        - Finally, validate that every message acked by the producer was consumed by the consumer
        """
        self.run_produce_consume_validate(core_test_action=self.perform_upgrade)