# Assumed imports: the original file's preamble (including the
# broker_upgrade_versions and metadata_*_versions lists and the
# extract_generation_from_logs helper) is not shown in this section;
# the imports below are the ones this class visibly relies on.
import random
import time

from ducktape.mark import ignore, matrix
from ducktape.mark.resource import cluster
from ducktape.tests.test import Test
from ducktape.utils.util import wait_until

from kafkatest.services.kafka import KafkaService
from kafkatest.services.streams import StreamsSmokeTestDriverService, \
    StreamsSmokeTestJobRunnerService, StreamsUpgradeTestJobRunnerService
from kafkatest.services.zookeeper import ZookeeperService
from kafkatest.version import DEV_VERSION, KafkaVersion


class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If metadata was changed, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0 and subsequently bumped in 2.0.0.
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo': {'partitions': 5},
            'data': {'partitions': 5},
        }
        self.leader = None
        self.leader_counter = {}

    processed_msg = "processed [0-9]* records"
    base_version_number = str(DEV_VERSION).split("-")[0]

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """
        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {'partitions': self.partitions,
                     'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}},
            'data': {'partitions': self.partitions,
                     'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}},
            'min': {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'max': {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'sum': {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'dif': {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'cnt': {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'avg': {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}},
            'wcnt': {'partitions': self.partitions,
                     'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}},
            'tagg': {'partitions': self.partitions,
                     'replication-factor': self.replication,
                     'configs': {"min.insync.replicas": self.isr}}
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds")

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        processor = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        with self.driver.node.account.monitor_log(self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(self.processed_msg,
                                   timeout_sec=60,
                                   err_msg="Never saw output '%s' on " % self.processed_msg + str(processor.node))

            connected_message = "Discovered group coordinator"
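            # The nested monitors below watch the same Streams client from two
            # angles while the brokers are bounced: the LOG_FILE monitor checks
            # that the embedded consumer re-discovers a group coordinator after
            # each broker restart, and the STDOUT_FILE monitor checks that
            # records keep being processed throughout the upgrade.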
            with processor.node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with processor.node.account.monitor_log(processor.STDOUT_FILE) as stdout_monitor:
                    self.perform_broker_upgrade(to_version)

                    log_monitor.wait_until(connected_message,
                                           timeout_sec=120,
                                           err_msg=("Never saw output '%s' on " % connected_message) + str(processor.node.account))

                    stdout_monitor.wait_until(self.processed_msg,
                                              timeout_sec=60,
                                              err_msg="Never saw output '%s' on " % self.processed_msg + str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all records for
            # the verification step, so this timeout is set to 6 minutes
            # (360 seconds) for consuming the verification records, plus a very
            # conservative additional 2 minutes (120 seconds) to process the
            # records in the verification step.
            driver_monitor.wait_until('ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                                      timeout_sec=480,
                                      err_msg="Never saw output '%s' on " % 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' + str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
                                           allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <from_version>, and upgrades one-by-one to <to_version>.
        """
        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on " + str(node.account))
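    # The metadata upgrade below uses the documented two-bounce procedure: the
    # first rolling bounce runs the new jars with the upgrade-from config set
    # (via set_upgrade_from) to the old version, so the group keeps using the
    # old subscription format; the second bounce passes upgrade_from=None,
    # letting the group switch to the new metadata version.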
    @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version>.
        """
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, from_version[:-2], to_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on " + str(node.account))

    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version".
        """
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("")  # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = [self.processor1, self.processor2, self.processor3]
        self.upgraded_processors = []
        for p in self.processors:
            self.leader_counter[p] = 2

        self.update_leader()
        for p in self.processors:
            self.leader_counter[p] = 0
        self.leader_counter[self.leader] = 3

        counter = 1
        current_generation = 3

        random.seed()
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(p, counter, current_generation)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on " + str(node.account))

    def update_leader(self):
        self.leader = None
        retries = 10
        while retries > 0:
            for p in self.processors:
                found = list(p.node.account.ssh_capture("grep \"Finished assignment for group\" %s" % p.LOG_FILE,
                                                        allow_fail=True))
                if len(found) >= self.leader_counter[p] + 1:
                    if self.leader is not None:
                        raise Exception("Could not uniquely identify leader")
                    self.leader = p
                    self.leader_counter[p] = self.leader_counter[p] + 1

            if self.leader is None:
                retries = retries - 1
                time.sleep(5)
            else:
                break

        if self.leader is None:
            raise Exception("Could not identify leader")

    def get_version_string(self, version):
        if version.startswith("0") or version.startswith("1") \
                or version.startswith("2.0") or version.startswith("2.1"):
            return "Kafka version : " + version
        elif "SNAPSHOT" in version:
            return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT"
        else:
            return "Kafka version: " + version
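    # get_version_string reflects a log-format change across releases: builds
    # up to 2.1 log "Kafka version : x.y.z" (space before the colon), newer
    # builds log "Kafka version: x.y.z", and SNAPSHOT builds are matched with a
    # regex because their exact version suffix varies.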
    def start_all_nodes_with(self, version):
        kafka_version_str = self.get_version_string(version)

        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until(kafka_version_str,
                                       timeout_sec=60,
                                       err_msg="Could not detect Kafka Streams version " + version + " on " + str(node1.account))
                monitor.wait_until(self.processed_msg,
                                   timeout_sec=60,
                                   err_msg="Never saw output '%s' on " % self.processed_msg + str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until(kafka_version_str,
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + version + " on " + str(node2.account))
                    first_monitor.wait_until(self.processed_msg,
                                             timeout_sec=60,
                                             err_msg="Never saw output '%s' on " % self.processed_msg + str(node1.account))
                    second_monitor.wait_until(self.processed_msg,
                                              timeout_sec=60,
                                              err_msg="Never saw output '%s' on " % self.processed_msg + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until(kafka_version_str,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + version + " on " + str(node3.account))
                        first_monitor.wait_until(self.processed_msg,
                                                 timeout_sec=60,
                                                 err_msg="Never saw output '%s' on " % self.processed_msg + str(node1.account))
                        second_monitor.wait_until(self.processed_msg,
                                                  timeout_sec=60,
                                                  err_msg="Never saw output '%s' on " % self.processed_msg + str(node2.account))
                        third_monitor.wait_until(self.processed_msg,
                                                 timeout_sec=60,
                                                 err_msg="Never saw output '%s' on " % self.processed_msg + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)
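    # do_stop_start_bounce performs one step of a rolling bounce: stop a single
    # processor, verify the remaining two rebalance and keep processing, then
    # restart it on new_version with the given upgrade_from setting (None means
    # the config is disabled, i.e. the second round of a two-round upgrade).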
on " % self.processed_msg + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( kafka_version_str, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " on " + str(node.account)) first_other_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(node.account)) def do_rolling_bounce(self, processor, counter, current_generation): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node with first_other_node.account.monitor_log( first_other_processor.LOG_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.LOG_FILE) as second_other_monitor: # stop processor processor.stop() node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." 
    def do_rolling_bounce(self, processor, counter, current_generation):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
                                         allow_fail=False)

                node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." + str(counter),
                                 allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    # checking for the dev version, which should be the only SNAPSHOT
                    log_monitor.wait_until("Kafka version.*" + self.base_version_number + ".*SNAPSHOT",
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account))
                    log_monitor.offset = 5
                    log_monitor.wait_until("partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                                           timeout_sec=60,
                                           err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1

                    if processor == self.leader:
                        leader_monitor = log_monitor
                    elif first_other_processor == self.leader:
                        leader_monitor = first_other_monitor
                    elif second_other_processor == self.leader:
                        leader_monitor = second_other_monitor
                    else:
                        raise Exception("Could not identify leader.")

                    monitors = {}
                    monitors[processor] = log_monitor
                    monitors[first_other_processor] = first_other_monitor
                    monitors[second_other_processor] = second_other_monitor

                    leader_monitor.wait_until("Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                                              timeout_sec=60,
                                              err_msg="Could not detect 'version probing' attempt at leader " + str(self.leader.node.account))

                    if len(self.old_processors) > 0:
                        log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                                               timeout_sec=60,
                                               err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account))
                    else:
                        log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                                               timeout_sec=60,
                                               err_msg="Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account))
                        first_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                                                       timeout_sec=60,
                                                       err_msg="Never saw output 'Upgrade metadata to version 5' on " + str(first_other_node.account))
                        second_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                                                        timeout_sec=60,
                                                        err_msg="Never saw output 'Upgrade metadata to version 5' on " + str(second_other_node.account))

                    log_monitor.wait_until("Version probing detected. Triggering new rebalance.",
                                           timeout_sec=60,
                                           err_msg="Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account))

                    # version probing should trigger a second rebalance;
                    # now we check that, after consecutive rebalances, all members report a synchronized generation
                    generation_synchronized = False
                    retries = 0
                    while retries < 10:
                        processor_found = extract_generation_from_logs(processor)
                        first_other_processor_found = extract_generation_from_logs(first_other_processor)
                        second_other_processor_found = extract_generation_from_logs(second_other_processor)

                        if len(processor_found) > 0 and len(first_other_processor_found) > 0 and len(second_other_processor_found) > 0:
                            self.logger.info("processor: " + str(processor_found))
                            self.logger.info("first other processor: " + str(first_other_processor_found))
                            self.logger.info("second other processor: " + str(second_other_processor_found))

                            processor_generation = self.extract_highest_generation(processor_found)
                            first_other_processor_generation = self.extract_highest_generation(first_other_processor_found)
                            second_other_processor_generation = self.extract_highest_generation(second_other_processor_found)

                            if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation:
                                current_generation = processor_generation
                                generation_synchronized = True
                                break

                        time.sleep(5)
                        retries = retries + 1

                    if not generation_synchronized:
                        raise Exception("Never saw all three processors have the synchronized generation number")

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1

                    if self.leader in self.old_processors or len(self.old_processors) > 0:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation
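    # The helpers below post-process grep output. Generations are read from the
    # consumer's "Successfully joined group with generation" log lines via
    # extract_generation_from_logs, presumably a module-level helper defined
    # earlier in the original file (it is called without self above).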
Triggering new rebalance.", timeout_sec=60, err_msg= "Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account)) # version probing should trigger second rebalance # now we check that after consecutive rebalances we have synchronized generation generation_synchronized = False retries = 0 while retries < 10: processor_found = extract_generation_from_logs( processor) first_other_processor_found = extract_generation_from_logs( first_other_processor) second_other_processor_found = extract_generation_from_logs( second_other_processor) if len(processor_found) > 0 and len( first_other_processor_found) > 0 and len( second_other_processor_found) > 0: self.logger.info("processor: " + str(processor_found)) self.logger.info("first other processor: " + str(first_other_processor_found)) self.logger.info("second other processor: " + str(second_other_processor_found)) processor_generation = self.extract_highest_generation( processor_found) first_other_processor_generation = self.extract_highest_generation( first_other_processor_found) second_other_processor_generation = self.extract_highest_generation( second_other_processor_found) if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation: current_generation = processor_generation generation_synchronized = True break time.sleep(5) retries = retries + 1 if generation_synchronized == False: raise Exception( "Never saw all three processors have the synchronized generation number" ) if processor == self.leader: self.update_leader() else: self.leader_counter[ self.leader] = self.leader_counter[self.leader] + 1 if self.leader in self.old_processors or len( self.old_processors) > 0: self.verify_metadata_no_upgraded_yet() return current_generation def extract_highest_generation(self, found_generations): return int(found_generations[-1]) def verify_metadata_no_upgraded_yet(self): for p in self.processors: found = list( p.node.account.ssh_capture( "grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" " + p.LOG_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'group member upgraded to metadata 4 too early'" ) def confirm_topics_on_all_brokers(self, expected_topic_set): for node in self.kafka.nodes: match_count = 0 # need to iterate over topic_list_generator as kafka.list_topics() # returns a python generator so values are fetched lazily # so we can't just compare directly we must iterate over what's returned topic_list_generator = self.kafka.list_topics(node=node) for topic in topic_list_generator: if topic in expected_topic_set: match_count += 1 if len(expected_topic_set) != match_count: return False return True
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 }, } self.leader = None self.leader_counter = {} def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @ignore @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'data': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'min': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'max': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'sum': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'dif': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'cnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'avg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'wcnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'tagg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh( "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture( "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = 
ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def test_version_probing_upgrade(self): """ Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version" """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = 
StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with("") # run with TRUNK self.processors = [self.processor1, self.processor2, self.processor3] self.old_processors = [ self.processor1, self.processor2, self.processor3 ] self.upgraded_processors = [] for p in self.processors: self.leader_counter[p] = 2 self.update_leader() for p in self.processors: self.leader_counter[p] = 0 self.leader_counter[self.leader] = 3 counter = 1 current_generation = 3 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False current_generation = self.do_rolling_bounce( p, counter, current_generation) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def update_leader(self): self.leader = None retries = 10 while retries > 0: for p in self.processors: found = list( p.node.account.ssh_capture( "grep \"Finished assignment for group\" %s" % p.LOG_FILE, allow_fail=True)) if len(found) == self.leader_counter[p] + 1: if self.leader is not None: raise Exception("Could not uniquely identify leader") self.leader = p self.leader_counter[p] = self.leader_counter[p] + 1 if self.leader is None: retries = retries - 1 time.sleep(5) else: break if self.leader is None: raise Exception("Could not identify leader") def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with 
node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop() first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( "Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams 
version " + new_version + " " + str(node.account)) first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node.account)) def do_rolling_bounce(self, processor, counter, current_generation): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node with first_other_node.account.monitor_log( first_other_processor.LOG_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.LOG_FILE) as second_other_monitor: # stop processor processor.stop() node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." + str(counter), allow_fail=False) self.leader_counter[processor] = 0 with node.account.monitor_log( processor.LOG_FILE) as log_monitor: processor.set_upgrade_to("future_version") processor.start() self.old_processors.remove(processor) self.upgraded_processors.append(processor) log_monitor.wait_until( "Kafka version : " + str(DEV_VERSION), timeout_sec=60, err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account)) log_monitor.offset = 5 log_monitor.wait_until( "partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]", timeout_sec=60, err_msg= "Could not detect FutureStreamsPartitionAssignor in " + str(node.account)) if processor == self.leader: self.update_leader() else: self.leader_counter[ self.leader] = self.leader_counter[self.leader] + 1 if processor == self.leader: leader_monitor = log_monitor elif first_other_processor == self.leader: leader_monitor = first_other_monitor elif second_other_processor == self.leader: leader_monitor = second_other_monitor else: raise Exception("Could not identify leader.") monitors = {} monitors[processor] = log_monitor monitors[first_other_processor] = first_other_monitor monitors[second_other_processor] = second_other_monitor leader_monitor.wait_until( "Received a future (version probing) subscription (version: 5). 
Sending empty assignment back (with supported version 4).", timeout_sec=60, err_msg= "Could not detect 'version probing' attempt at leader " + str(self.leader.node.account)) if len(self.old_processors) > 0: log_monitor.wait_until( "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.", timeout_sec=60, err_msg= "Could not detect 'successful version probing' at upgrading node " + str(node.account)) else: log_monitor.wait_until( "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.", timeout_sec=60, err_msg= "Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account)) first_other_monitor.wait_until( "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg= "Never saw output 'Upgrade metadata to version 4' on" + str(first_other_node.account)) second_other_monitor.wait_until( "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg= "Never saw output 'Upgrade metadata to version 4' on" + str(second_other_node.account)) log_monitor.wait_until( "Version probing detected. Triggering new rebalance.", timeout_sec=60, err_msg= "Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account)) # version probing should trigger second rebalance # now we check that after consecutive rebalances we have synchronized generation generation_synchronized = False retries = 0 while retries < 10: processor_found = self.extract_generation_from_logs( processor) first_other_processor_found = self.extract_generation_from_logs( first_other_processor) second_other_processor_found = self.extract_generation_from_logs( second_other_processor) if len(processor_found) > 0 and len( first_other_processor_found) > 0 and len( second_other_processor_found) > 0: self.logger.info("processor: " + str(processor_found)) self.logger.info("first other processor: " + str(first_other_processor_found)) self.logger.info("second other processor: " + str(second_other_processor_found)) processor_generation = self.extract_highest_generation( processor_found) first_other_processor_generation = self.extract_highest_generation( first_other_processor_found) second_other_processor_generation = self.extract_highest_generation( second_other_processor_found) if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation: current_generation = processor_generation generation_synchronized = True break time.sleep(5) retries = retries + 1 if generation_synchronized == False: raise Exception( "Never saw all three processors have the synchronized generation number" ) if processor == self.leader: self.update_leader() else: self.leader_counter[ self.leader] = self.leader_counter[self.leader] + 1 if self.leader in self.old_processors or len( self.old_processors) > 0: self.verify_metadata_no_upgraded_yet() return current_generation def extract_generation_from_logs(self, processor): return list( processor.node.account.ssh_capture( "grep \"Successfully joined group with generation\" %s| awk \'{for(i=1;i<=NF;i++) {if ($i == \"generation\") beginning=i+1; if($i== 
\"(org.apache.kafka.clients.consumer.internals.AbstractCoordinator)\") ending=i }; for (j=beginning;j<ending;j++) printf $j; printf \"\\n\"}\'" % processor.LOG_FILE, allow_fail=True)) def extract_highest_generation(self, found_generations): return int(found_generations[-1]) def verify_metadata_no_upgraded_yet(self): for p in self.processors: found = list( p.node.account.ssh_capture( "grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" " + p.LOG_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'group member upgraded to metadata 4 too early'" )
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 }, } self.leader = None def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'data': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'min': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'max': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'sum': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'dif': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'cnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'avg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'wcnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'tagg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture( "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() 
self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, 
err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_rolling_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop() first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: 
roll_counter = ".0-" # first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( "Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node.account))
class StreamsUpgradeTest(KafkaTest): """ Test upgrading Kafka Streams (all version combination) If metadata was changed, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context, num_zk=1, num_brokers=1, topics={ 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 } }) self.driver = StreamsSmokeTestDriverService(test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( test_context, self.kafka) @parametrize(old_version=str(LATEST_0_10_1), new_version=str(LATEST_0_10_2)) @parametrize(old_version=str(LATEST_0_10_1), new_version=str(DEV_VERSION)) @parametrize(old_version=str(LATEST_0_10_2), new_version=str(DEV_VERSION)) def test_simple_upgrade(self, old_version, new_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ self.driver.start() self.start_all_nodes_with(old_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, "", new_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() #@parametrize(new_version=str(LATEST_0_10_1)) we cannot run this test until Kafka 0.10.1.2 is released #@parametrize(new_version=str(LATEST_0_10_2)) we cannot run this test until Kafka 0.10.2.2 is released @parametrize(new_version=str(DEV_VERSION)) def test_metadata_upgrade(self, new_version): """ Starts 3 KafkaStreams instances with version 0.10.0, and upgrades one-by-one to <new_version> """ self.driver.start() self.start_all_nodes_with(str(LATEST_0_10_0)) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, "0.10.0", new_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_rolling_bounce(p, "", new_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( "processed 100 records from
topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) processor.set_version(version) def do_rolling_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop() first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from == "": # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling boundes 
node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( "Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node.account))
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changed, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo' : { 'partitions': 5 }, 'data' : { 'partitions': 5 }, } self.leader = None def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @ignore @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}}, 'data' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'min' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'max' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @ignore @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @ignore @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until("processed 100 records from topic", timeout_sec=60,
err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_rolling_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.stop() first_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # 
first round of rolling bounces node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.start() log_monitor.wait_until("Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account))
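# set_upgrade_from(...) above ultimately controls the Streams "upgrade.from"
# client config (a real StreamsConfig property). A minimal sketch of how the
# rendered client config differs between the two bounce rounds; the dict
# below is illustrative, not the service's actual config template:
def _sketch_streams_props(upgrade_from):
    props = {"application.id": "StreamsUpgradeTest",   # assumed app id
             "bootstrap.servers": "localhost:9092"}    # assumed broker list
    if upgrade_from is not None:
        # First round: keep sending subscriptions the old members understand.
        props["upgrade.from"] = upgrade_from  # e.g. "0.10.0"
    # Second round: key absent -> the latest metadata version is used.
    return props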
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changed, upgrade is more difficult Metadata version was bumped in 0.10.1.0 and subsequently bumped in 2.0.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 }, } processed_msg = "processed [0-9]* records" base_version_number = str(DEV_VERSION).split("-")[0] def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @cluster(num_nodes=6) @matrix(from_version=smoke_test_versions, to_version=dev_version, bounce_type=["full"]) def test_app_upgrade(self, from_version, to_version, bounce_type): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics={ 'echo': { 'partitions': 5, 'replication-factor': 1 }, 'data': { 'partitions': 5, 'replication-factor': 1 }, 'min': { 'partitions': 5, 'replication-factor': 1 }, 'min-suppressed': { 'partitions': 5, 'replication-factor': 1 }, 'min-raw': { 'partitions': 5, 'replication-factor': 1 }, 'max': { 'partitions': 5, 'replication-factor': 1 }, 'sum': { 'partitions': 5, 'replication-factor': 1 }, 'sws-raw': { 'partitions': 5, 'replication-factor': 1 }, 'sws-suppressed': { 'partitions': 5, 'replication-factor': 1 }, 'dif': { 'partitions': 5, 'replication-factor': 1 }, 'cnt': { 'partitions': 5, 'replication-factor': 1 }, 'avg': { 'partitions': 5, 'replication-factor': 1 }, 'wcnt': { 'partitions': 5, 'replication-factor': 1 }, 'tagg': { 'partitions': 5, 'replication-factor': 1 } }) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka, processing_guarantee="at_least_once", replication_factor=1) self.processor2 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka, processing_guarantee="at_least_once", replication_factor=1) self.processor3 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka, processing_guarantee="at_least_once", replication_factor=1) self.purge_state_dir(self.processor1) self.purge_state_dir(self.processor2) self.purge_state_dir(self.processor3) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] if bounce_type == "rolling": counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 elif bounce_type == "full": self.restart_all_nodes_with(to_version) else: raise Exception("Unrecognized bounce_type: " + str(bounce_type)) # shutdown self.driver.stop() # Ideally, we would actually verify the expected results.
# See KAFKA-10202 random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "SMOKE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'SMOKE-TEST-CLIENT-CLOSED' on " + str(node.account)) def start_all_nodes_with(self, version): self.set_version(self.processor1, version) self.set_version(self.processor2, version) self.set_version(self.processor3, version) self.processor1.start() self.processor2.start() self.processor3.start() # double-check the version kafka_version_str = self.get_version_string(version) self.wait_for_verification(self.processor1, kafka_version_str, self.processor1.LOG_FILE) self.wait_for_verification(self.processor2, kafka_version_str, self.processor2.LOG_FILE) self.wait_for_verification(self.processor3, kafka_version_str, self.processor3.LOG_FILE) # wait for the members to join self.wait_for_verification(self.processor1, "SMOKE-TEST-CLIENT-STARTED", self.processor1.STDOUT_FILE) self.wait_for_verification(self.processor2, "SMOKE-TEST-CLIENT-STARTED", self.processor2.STDOUT_FILE) self.wait_for_verification(self.processor3, "SMOKE-TEST-CLIENT-STARTED", self.processor3.STDOUT_FILE) # make sure they've processed something self.wait_for_verification(self.processor1, self.processed_msg, self.processor1.STDOUT_FILE) self.wait_for_verification(self.processor2, self.processed_msg, self.processor2.STDOUT_FILE) self.wait_for_verification(self.processor3, self.processed_msg, self.processor3.STDOUT_FILE) def restart_all_nodes_with(self, version): self.processor1.stop_node(self.processor1.node) self.processor2.stop_node(self.processor2.node) self.processor3.stop_node(self.processor3.node) # make sure the members have stopped self.wait_for_verification(self.processor1, "SMOKE-TEST-CLIENT-CLOSED", self.processor1.STDOUT_FILE) self.wait_for_verification(self.processor2, "SMOKE-TEST-CLIENT-CLOSED", self.processor2.STDOUT_FILE) self.wait_for_verification(self.processor3, "SMOKE-TEST-CLIENT-CLOSED", self.processor3.STDOUT_FILE) self.roll_logs(self.processor1, ".1-1") self.roll_logs(self.processor2, ".1-1") self.roll_logs(self.processor3, ".1-1") self.set_version(self.processor1, version) self.set_version(self.processor2, version) self.set_version(self.processor3, version) self.processor1.start_node(self.processor1.node) self.processor2.start_node(self.processor2.node) self.processor3.start_node(self.processor3.node) # double-check the version kafka_version_str = self.get_version_string(version) self.wait_for_verification(self.processor1, kafka_version_str, self.processor1.LOG_FILE) self.wait_for_verification(self.processor2, kafka_version_str, self.processor2.LOG_FILE) self.wait_for_verification(self.processor3, kafka_version_str, self.processor3.LOG_FILE) # wait for the members to join self.wait_for_verification(self.processor1, "SMOKE-TEST-CLIENT-STARTED", self.processor1.STDOUT_FILE) self.wait_for_verification(self.processor2, "SMOKE-TEST-CLIENT-STARTED", self.processor2.STDOUT_FILE) self.wait_for_verification(self.processor3, "SMOKE-TEST-CLIENT-STARTED", self.processor3.STDOUT_FILE) # make sure they've processed something self.wait_for_verification(self.processor1, self.processed_msg, self.processor1.STDOUT_FILE) self.wait_for_verification(self.processor2, self.processed_msg, self.processor2.STDOUT_FILE) self.wait_for_verification(self.processor3, self.processed_msg, self.processor3.STDOUT_FILE) def get_version_string(self, version): if version.startswith("0") or 
version.startswith("1") \ or version.startswith("2.0") or version.startswith("2.1"): return "Kafka version : " + version elif "SNAPSHOT" in version: return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT" else: return "Kafka version: " + version def wait_for_verification(self, processor, message, file, num_lines=1): wait_until(lambda: self.verify_from_file(processor, message, file ) >= num_lines, timeout_sec=60, err_msg="Did expect to read '%s' from %s" % (message, processor.node.account)) def verify_from_file(self, processor, message, file): result = processor.node.account.ssh_output("grep -E '%s' %s | wc -l" % (message, file), allow_fail=False) try: return int(result) except ValueError: self.logger.warn("Command failed with ValueError: " + result) return 0 def set_version(self, processor, version): if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def purge_state_dir(self, processor): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter): kafka_version_str = self.get_version_string(new_version) first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop_node(processor.node) first_other_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(first_other_node.account)) second_other_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(second_other_node.account)) node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling bounces self.roll_logs(processor, roll_counter + str(counter)) self.set_version(processor, new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start_node(processor.node) log_monitor.wait_until( kafka_version_str, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " on " + str(node.account)) first_other_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams 
failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( self.processed_msg, timeout_sec=60, err_msg="Never saw output '%s' on " % self.processed_msg + str(node.account)) def roll_logs(self, processor, roll_suffix): processor.node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_suffix, allow_fail=False) processor.node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_suffix, allow_fail=False) processor.node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_suffix, allow_fail=False) processor.node.account.ssh("mv " + processor.CONFIG_FILE + " " + processor.CONFIG_FILE + roll_suffix, allow_fail=False)
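# verify_from_file() above counts matches in a remote log by running
# "grep -E ... | wc -l" over ssh. A minimal local sketch of the same check,
# assuming direct file access instead of ssh_output (illustrative only):
def _sketch_count_matches(pattern, path):
    import re
    # Mirrors `grep -E <pattern> <path> | wc -l`: one count per matching line.
    with open(path) as f:
        return sum(1 for line in f if re.search(pattern, line))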
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changed, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo' : { 'partitions': 5 }, 'data' : { 'partitions': 5 }, } self.leader = None self.leader_counter = {} def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @ignore @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}}, 'data' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'min' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'max' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1)
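# A single ZooKeeper node and a single broker are sufficient for the client
# upgrade tests below: only the Streams instances are bounced, never the
# cluster itself.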
self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def test_version_probing_upgrade(self): """ Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version" """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) 
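# Version probing, exercised in the rest of this test: a member that
# advertises a newer subscription version than the group leader understands
# receives an empty assignment carrying the leader's supported version,
# downgrades, and rejoins. Each probe therefore costs one extra rebalance,
# which is why the assertions below track the group generation so closely.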
self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with("") # run with TRUNK self.processors = [self.processor1, self.processor2, self.processor3] self.old_processors = [self.processor1, self.processor2, self.processor3] self.upgraded_processors = [] for p in self.processors: self.leader_counter[p] = 2 self.update_leader() for p in self.processors: self.leader_counter[p] = 0 self.leader_counter[self.leader] = 3 counter = 1 current_generation = 3 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False current_generation = self.do_rolling_bounce(p, counter, current_generation) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def update_leader(self): self.leader = None retries = 10 while retries > 0: for p in self.processors: found = list(p.node.account.ssh_capture("grep \"Finished assignment for group\" %s" % p.LOG_FILE, allow_fail=True)) if len(found) == self.leader_counter[p] + 1: if self.leader is not None: raise Exception("Could not uniquely identify leader") self.leader = p self.leader_counter[p] = self.leader_counter[p] + 1 if self.leader is None: retries = retries - 1 time.sleep(5) else: break if self.leader is None: raise Exception("Could not identify leader") def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor: with 
node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.stop() first_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling bounces node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.start() log_monitor.wait_until("Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until("processed 100 records
from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account)) def do_rolling_bounce(self, processor, counter, current_generation): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor: # stop processor processor.stop() node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." 
+ str(counter), allow_fail=False) self.leader_counter[processor] = 0 with node.account.monitor_log(processor.LOG_FILE) as log_monitor: processor.set_upgrade_to("future_version") processor.start() self.old_processors.remove(processor) self.upgraded_processors.append(processor) current_generation = current_generation + 1 log_monitor.wait_until("Kafka version : " + str(DEV_VERSION), timeout_sec=60, err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account)) log_monitor.offset = 5 log_monitor.wait_until("partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]", timeout_sec=60, err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account)) log_monitor.wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(node.account)) first_other_monitor.wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(first_other_node.account)) second_other_monitor.wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(second_other_node.account)) if processor == self.leader: self.update_leader() else: self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1 if processor == self.leader: leader_monitor = log_monitor elif first_other_processor == self.leader: leader_monitor = first_other_monitor elif second_other_processor == self.leader: leader_monitor = second_other_monitor else: raise Exception("Could not identify leader.") monitors = {} monitors[processor] = log_monitor monitors[first_other_processor] = first_other_monitor monitors[second_other_processor] = second_other_monitor leader_monitor.wait_until("Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).", timeout_sec=60, err_msg="Could not detect 'version probing' attempt at leader " + str(self.leader.node.account)) if len(self.old_processors) > 0: log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.", timeout_sec=60, err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account)) else: log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.", timeout_sec=60, err_msg="Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account)) first_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg="Never saw output 'Upgrading subscription metadata version to 5' on " + str(first_other_node.account)) second_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5.
Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg="Never saw output 'Upgrading subscription metadata version to 5' on " + str(second_other_node.account)) log_monitor.wait_until("Version probing detected. Triggering new rebalance.", timeout_sec=60, err_msg="Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account)) # version probing should trigger second rebalance current_generation = current_generation + 1 for p in self.processors: monitors[p].wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(p.node.account)) if processor == self.leader: self.update_leader() else: self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1 if self.leader in self.old_processors or len(self.old_processors) > 0: self.verify_metadata_no_upgraded_yet() return current_generation def verify_metadata_no_upgraded_yet(self): for p in self.processors: found = list(p.node.account.ssh_capture("grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" " + p.LOG_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'group member upgraded its subscription metadata too early'")
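# A minimal sketch of the per-member decision behind the version-probing
# assertions above, assuming a leader that caps the group at its own
# supported version (illustrative pseudologic, not the real assignor):
def _sketch_probe_outcome(sent_version, leader_supported_version):
    if sent_version > leader_supported_version:
        # Leader could not decode the subscription: downgrade and rejoin.
        return "downgrade_and_rejoin", leader_supported_version
    if sent_version < leader_supported_version:
        # Leader understands more: upgrade metadata on the next rebalance.
        return "upgrade_on_next_rebalance", leader_supported_version
    return "stable", sent_version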