예제 #1
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations)
    If metadata was changed, upgrade is more difficult
    Metadata version was bumped in 0.10.1.0 and
    subsequently bumped in 2.0.0
    """
    def __init__(self, test_context):
        """Initialise the base test and set up the default topics and leader state."""
        super(StreamsUpgradeTest, self).__init__(test_context)
        # default topic layout handed to KafkaService by the tests below;
        # test_upgrade_downgrade_brokers replaces it with its own layout
        self.topics = {
            topic: {'partitions': 5} for topic in ('echo', 'data')
        }
        # per-processor count that update_leader() compares against the number
        # of "Finished assignment for group" lines found in a processor's log
        self.leader_counter = {}
        # processor currently identified as group leader (set by update_leader())
        self.leader = None

    # pattern handed to the log monitors to detect that a client is processing records
    processed_msg = "processed [0-9]* records"
    # DEV_VERSION rendered as a string and cut at the first "-";
    # used to build the "Kafka version.*<base>.*SNAPSHOT" pattern
    base_version_number = str(DEV_VERSION).split("-")[0]

    def perform_broker_upgrade(self, to_version):
        """
        Bounce every node of self.kafka, one after another, onto to_version.

        Each node is stopped, gets its version replaced by KafkaVersion(to_version)
        and is started again before the next node is touched.
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        for broker in self.kafka.nodes:
            self.kafka.stop_node(broker)
            broker.version = KafkaVersion(to_version)
            self.kafka.start_node(broker)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions,
            to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.

        The brokers are started with from_version and bounced one at a time
        onto to_version while the smoke-test processor keeps running.
        Combinations where both versions are equal return immediately.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2

        # All topics share one layout. The settings are rebuilt for every topic
        # so that no two entries share the same mutable dict objects.
        self.topics = {}
        for topic in ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                      'avg', 'wcnt', 'tagg'):
            self.topics[topic] = {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)

        processor = StreamsSmokeTestJobRunnerService(self.test_context,
                                                     self.kafka)

        with self.driver.node.account.monitor_log(
                self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            # the processor has to be processing records before the brokers are bounced
            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node))

            connected_message = "Discovered group coordinator"
            with processor.node.account.monitor_log(processor.LOG_FILE) as log_monitor, \
                    processor.node.account.monitor_log(processor.STDOUT_FILE) as stdout_monitor:
                self.perform_broker_upgrade(to_version)

                # after the bounce the client must find the coordinator again ...
                log_monitor.wait_until(
                    connected_message,
                    timeout_sec=120,
                    err_msg=("Never saw output '%s' on " % connected_message) +
                    str(processor.node.account))

                # ... and resume processing
                stdout_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all
            # records for the verification step so this timeout is set to
            # 6 minutes (360 seconds) for consuming of verification records
            # and a very conservative additional 2 minutes (120 seconds) to process
            # the records in the verification step
            #
            # raw string: "\|" has to reach the monitor unchanged and is not a
            # valid Python escape sequence
            verification_pattern = r'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED'
            driver_monitor.wait_until(
                verification_pattern,
                timeout_sec=480,
                err_msg="Never saw output '%s' on " % verification_pattern +
                str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        # NOTE(review): the result of ssh_capture is not consumed here, unlike the
        # list(ssh_capture(...)) calls elsewhere in this file -- confirm that a
        # failing grep is still reported when the output is never iterated
        processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                           processor.STDOUT_FILE,
                                           allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <from_version>, and upgrades one-by-one to <to_version>

        Combinations where both versions are equal return immediately.
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # upgrade one-by-one via rolling bounce, in random order;
        # the 1-based position is passed on as the bounce counter
        random.shuffle(self.processors)
        for counter, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            # upgrade_from=None: plain stop/start onto to_version
            self.do_stop_start_bounce(p, None, to_version, counter)

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on "
                    + str(node.account))

    @matrix(from_version=metadata_1_versions,
            to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions,
            to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions,
            to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version>

        Two rounds of rolling bounces are performed: the first passes an
        upgrade_from value derived from from_version, the second passes None.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # Drops the last two characters of from_version, e.g. "0.10.0.1" -> "0.10.0".
        # NOTE(review): assumes the last version component is a single digit -- confirm
        upgrade_from = from_version[:-2]

        # first rolling bounce (counter runs 1..n)
        random.shuffle(self.processors)
        for counter, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, upgrade_from, to_version, counter)

        # second rolling bounce (counter continues after the first round)
        random.shuffle(self.processors)
        for counter, p in enumerate(self.processors, len(self.processors) + 1):
            self.do_stop_start_bounce(p, None, to_version, counter)

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on "
                    + str(node.account))

    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version"

        The group leader is tracked through self.leader / self.leader_counter and
        the generation returned by each rolling bounce is fed into the next one.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("")  # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        # separate list object: do_rolling_bounce removes entries from it
        self.old_processors = list(self.processors)
        self.upgraded_processors = []

        # update_leader() picks the processor whose log holds at least
        # leader_counter + 1 "Finished assignment for group" lines, i.e. three here
        # (presumably one per rebalance caused by the three staggered starts -- confirm)
        for p in self.processors:
            self.leader_counter[p] = 2

        self.update_leader()
        for p in self.processors:
            self.leader_counter[p] = 0
        self.leader_counter[self.leader] = 3

        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        # bounce in random order; the 1-based position is passed on as the counter
        for counter, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(
                p, counter, current_generation)

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on "
                    + str(node.account))

    def update_leader(self):
        """
        Determine the current group leader and store it in self.leader.

        A processor counts as leader when its log file contains at least
        leader_counter[processor] + 1 "Finished assignment for group" lines;
        its counter is then incremented. Up to ten scans over self.processors
        are made, with a five second pause after every unsuccessful scan.

        Raises:
            Exception: if more than one processor qualifies during a scan, or if
                no leader was found after the last scan.
        """
        self.leader = None
        grep_template = "grep \"Finished assignment for group\" %s"

        for _ in range(10):
            for candidate in self.processors:
                matches = list(
                    candidate.node.account.ssh_capture(
                        grep_template % candidate.LOG_FILE,
                        allow_fail=True))
                if len(matches) < self.leader_counter[candidate] + 1:
                    continue  # not enough assignments logged by this processor

                if self.leader is not None:
                    raise Exception("Could not uniquely identify leader")
                self.leader = candidate
                self.leader_counter[candidate] = self.leader_counter[candidate] + 1

            if self.leader is not None:
                return
            time.sleep(5)

        raise Exception("Could not identify leader")

    def get_version_string(self, version):
        if version.startswith("0") or version.startswith("1") \
          or version.startswith("2.0") or version.startswith("2.1"):
            return "Kafka version : " + version
        elif "SNAPSHOT" in version:
            return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT"
        else:
            return "Kafka version: " + version

    def start_all_nodes_with(self, version):
        """
        Start processor1, processor2 and processor3 one after another with <version>.

        Every processor is first reset through prepare_for(). After each start the
        new processor's log has to show the expected version string, and every
        processor started so far has to report processed records on stdout.
        Each wait uses a 60 second timeout.
        """
        kafka_version_str = self.get_version_string(version)

        def await_version(log_monitor, account):
            # the freshly started client must log the expected Kafka version
            log_monitor.wait_until(
                kafka_version_str,
                timeout_sec=60,
                err_msg="Could not detect Kafka Streams version " + version +
                " on " + str(account))

        def await_processing(stdout_monitor, account):
            # the client must report processed records after the monitor was created
            stdout_monitor.wait_until(
                self.processed_msg,
                timeout_sec=60,
                err_msg="Never saw output '%s' on " % self.processed_msg +
                str(account))

        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
            self.processor1.start()
            await_version(log_monitor, node1.account)
            await_processing(first_monitor, node1.account)

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
            self.processor2.start()
            await_version(log_monitor, node2.account)
            await_processing(first_monitor, node1.account)
            await_processing(second_monitor, node2.account)

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor, \
                node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
            self.processor3.start()
            await_version(log_monitor, node3.account)
            await_processing(first_monitor, node1.account)
            await_processing(second_monitor, node2.account)
            await_processing(third_monitor, node3.account)

    @staticmethod
    def prepare_for(processor, version):
        """
        Wipe the processor's PERSISTENT_ROOT on its node and select the version to run.

        A version equal to str(DEV_VERSION) is translated to the empty string.
        """
        wipe_cmd = "rm -rf " + processor.PERSISTENT_ROOT
        processor.node.account.ssh(wipe_cmd, allow_fail=False)
        # "" selects TRUNK
        processor.set_version("" if version == str(DEV_VERSION) else version)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
        """
        Stop one processor, rotate its output files and restart it with new_version.

        Args:
            processor: the processor to bounce; the remaining entries of
                self.processors are observed while it is down and after it is back.
            upgrade_from: value handed to processor.set_upgrade_from(). None marks
                the second round of rolling bounces and selects the ".1-" file
                suffix, anything else selects ".0-".
            new_version: version to restart with; str(DEV_VERSION) is translated
                to the empty string (TRUNK).
            counter: number appended to the rotated file names.

        Raises:
            Exception: if the STDERR file of one of the other processors contains
                the "unable to decode subscription data: version=2" error.
        """
        kafka_version_str = self.get_version_string(new_version)

        # the processors that keep running: the first one found and the last one found
        remaining = [p for p in self.processors if p != processor]
        first_other_processor = remaining[0] if remaining else None
        second_other_processor = remaining[-1] if len(remaining) > 1 else None

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        def await_processing(stdout_monitor, account):
            stdout_monitor.wait_until(
                self.processed_msg,
                timeout_sec=60,
                err_msg="Never saw output '%s' on " % self.processed_msg +
                str(account))

        def fail_on_metadata_error(other_processor, other_node):
            grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
            hits = list(
                other_node.account.ssh_capture(
                    grep_metadata_error + other_processor.STDERR_FILE,
                    allow_fail=True))
            if len(hits) > 0:
                raise Exception(
                    "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                )

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.stop()
            await_processing(first_other_monitor, first_other_node.account)
            await_processing(second_other_monitor, second_other_node.account)
        # NOTE(review): the result of ssh_capture is not consumed here, unlike the
        # list(ssh_capture(...)) calls elsewhere in this file -- confirm that a
        # failing grep is still reported when the output is never iterated
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        # ".1-": upgrade disabled -- second round of rolling bounces
        # ".0-": first round of rolling bounces
        rotation_suffix = (".1-" if upgrade_from is None else ".0-") + str(counter)
        for path in (processor.STDOUT_FILE, processor.STDERR_FILE,
                     processor.LOG_FILE):
            node.account.ssh("mv " + path + " " + path + rotation_suffix,
                             allow_fail=False)

        # "" selects TRUNK
        processor.set_version("" if new_version == str(DEV_VERSION) else new_version)
        processor.set_upgrade_from(upgrade_from)

        with node.account.monitor_log(processor.STDOUT_FILE) as monitor, \
                node.account.monitor_log(processor.LOG_FILE) as log_monitor, \
                first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.start()

            log_monitor.wait_until(
                kafka_version_str,
                timeout_sec=60,
                err_msg="Could not detect Kafka Streams version " +
                new_version + " on " + str(node.account))

            # both other processors must keep processing without the decode error
            await_processing(first_other_monitor, first_other_node.account)
            fail_on_metadata_error(first_other_processor, first_other_node)

            await_processing(second_other_monitor, second_other_node.account)
            fail_on_metadata_error(second_other_processor, second_other_node)

            # finally the bounced processor itself has to process records again
            await_processing(monitor, node.account)

    def do_rolling_bounce(self, processor, counter, current_generation):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        with first_other_node.account.monitor_log(
                first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                         processor.STDOUT_FILE,
                                         allow_fail=False)

                node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                                 processor.STDOUT_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.STDERR_FILE + " " +
                                 processor.STDERR_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.LOG_FILE + " " +
                                 processor.LOG_FILE + "." + str(counter),
                                 allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(
                        processor.LOG_FILE) as log_monitor:
                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    # checking for the dev version which should be the only SNAPSHOT
                    log_monitor.wait_until(
                        "Kafka version.*" + self.base_version_number +
                        ".*SNAPSHOT",
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        str(DEV_VERSION) + " in " + str(node.account))
                    log_monitor.offset = 5
                    log_monitor.wait_until(
                        "partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect FutureStreamsPartitionAssignor in " +
                        str(node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if processor == self.leader:
                        leader_monitor = log_monitor
                    elif first_other_processor == self.leader:
                        leader_monitor = first_other_monitor
                    elif second_other_processor == self.leader:
                        leader_monitor = second_other_monitor
                    else:
                        raise Exception("Could not identify leader.")

                    monitors = {}
                    monitors[processor] = log_monitor
                    monitors[first_other_processor] = first_other_monitor
                    monitors[second_other_processor] = second_other_monitor

                    leader_monitor.wait_until(
                        "Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'version probing' attempt at leader "
                        + str(self.leader.node.account))

                    if len(self.old_processors) > 0:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing' at upgrading node "
                            + str(node.account))
                    else:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing with upgraded leader' at upgrading node "
                            + str(node.account))
                        first_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(first_other_node.account))
                        second_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(second_other_node.account))

                    log_monitor.wait_until(
                        "Version probing detected. Triggering new rebalance.",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'Triggering new rebalance' at upgrading node "
                        + str(node.account))

                    # version probing should trigger second rebalance
                    # now we check that after consecutive rebalances we have synchronized generation
                    generation_synchronized = False
                    retries = 0

                    while retries < 10:
                        processor_found = extract_generation_from_logs(
                            processor)
                        first_other_processor_found = extract_generation_from_logs(
                            first_other_processor)
                        second_other_processor_found = extract_generation_from_logs(
                            second_other_processor)

                        if len(processor_found) > 0 and len(
                                first_other_processor_found) > 0 and len(
                                    second_other_processor_found) > 0:
                            self.logger.info("processor: " +
                                             str(processor_found))
                            self.logger.info("first other processor: " +
                                             str(first_other_processor_found))
                            self.logger.info("second other processor: " +
                                             str(second_other_processor_found))

                            processor_generation = self.extract_highest_generation(
                                processor_found)
                            first_other_processor_generation = self.extract_highest_generation(
                                first_other_processor_found)
                            second_other_processor_generation = self.extract_highest_generation(
                                second_other_processor_found)

                            if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation:
                                current_generation = processor_generation
                                generation_synchronized = True
                                break

                        time.sleep(5)
                        retries = retries + 1

                    if generation_synchronized == False:
                        raise Exception(
                            "Never saw all three processors have the synchronized generation number"
                        )

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if self.leader in self.old_processors or len(
                            self.old_processors) > 0:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation

    def extract_highest_generation(self, found_generations):
        """
        Return the final entry of ``found_generations`` converted to ``int``.

        No maximum is computed: the last element is used as-is, so the result
        is only the "highest" generation if the entries arrive in ascending
        order (presumably log order -- TODO confirm against the caller).
        """
        latest_entry = found_generations[-1]
        return int(latest_entry)

    def verify_metadata_no_upgraded_yet(self):
        """
        Fail if any processor log already contains the "upgrading subscription
        metadata version" message.

        Each processor's LOG_FILE is grepped over ssh (grep failures are
        tolerated); any captured output line raises an Exception.
        """
        grep_prefix = "grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" "
        for streams_processor in self.processors:
            account = streams_processor.node.account
            # materialize the captured output so it can be checked for emptiness
            matches = list(
                account.ssh_capture(grep_prefix + streams_processor.LOG_FILE,
                                    allow_fail=True))
            if matches:
                raise Exception(
                    "Kafka Streams failed with 'group member upgraded to metadata 4 too early'"
                )

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        """
        Check that every broker node lists all of the expected topics.

        :param expected_topic_set: collection of topic names that must be
            present on each node
        :return: True if each node in ``self.kafka.nodes`` reports every
            expected topic (extra topics are ignored), False otherwise

        Containment is tested with sets rather than by counting matches, so a
        topic that shows up more than once in a node's listing (or in the
        expected collection) no longer produces a false negative.
        """
        expected_topics = set(expected_topic_set)
        for node in self.kafka.nodes:
            # kafka.list_topics() returns a python generator whose values are
            # fetched lazily, so it has to be consumed before comparing
            listed_topics = set(self.kafka.list_topics(node=node))
            if not expected_topics <= listed_topics:
                return False

        return True
예제 #2
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combination)
    If metadata was changes, upgrade is more difficult
    Metadata version was bumped in 0.10.1.0
    """
    def __init__(self, test_context):
        """
        Initialize the test with its default topics and empty leader state.

        :param test_context: forwarded unchanged to the parent constructor
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        # default layout: 'echo' and 'data', five partitions each
        self.topics = dict(
            (topic_name, {'partitions': 5}) for topic_name in ('echo', 'data'))
        # per-processor counters and the current leader, maintained by update_leader()
        self.leader_counter = {}
        self.leader = None

    def perform_broker_upgrade(self, to_version):
        """
        Restart every broker node, one after another, on ``to_version``.

        Each node is stopped, gets ``KafkaVersion(to_version)`` assigned as its
        ``version``, and is started again before the next node is touched.

        :param to_version: version value handed to ``KafkaVersion``
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        kafka = self.kafka
        for broker in kafka.nodes:
            kafka.stop_node(broker)
            broker.version = KafkaVersion(to_version)
            kafka.start_node(broker)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions,
            to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Run the smoke-test driver and one processor while the brokers are
        bounced from ``from_version`` to ``to_version``.

        Identical versions are a no-op. After the rolling bounce the driver's
        stdout is grepped for a delivery marker.
        """
        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2

        # all topics share one layout; each gets its own (unshared) dict objects
        topic_names = ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                       'avg', 'wcnt', 'tagg')
        topic_specs = {}
        for topic_name in topic_names:
            topic_specs[topic_name] = {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
        self.topics = topic_specs

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # the smoke test needs at least three broker nodes
        self.kafka = KafkaService(
            self.test_context,
            num_nodes=3,
            zk=self.zk,
            version=KafkaVersion(from_version),
            topics=self.topics)
        self.kafka.start()

        # give the cluster some time to create the topics
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        driver_account = self.driver.node.account
        driver_account.ssh(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        # NOTE(review): the value returned by ssh_capture is discarded here,
        # whereas other call sites in this file wrap it in list(...). If the
        # capture is lazy, a failing grep may go unnoticed -- confirm.
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Start three Kafka Streams instances on ``from_version`` and bounce them
        one at a time (random order) onto ``to_version``.

        Identical versions are a no-op. Every bounce passes ``None`` as
        ``upgrade_from``.
        """
        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(
            self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # upgrade one-by-one via rolling bounce; bounce numbers start at 1
        random.shuffle(self.processors)
        for bounce_number, streams_processor in enumerate(self.processors, 1):
            streams_processor.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(streams_processor, None, to_version,
                                      bounce_number)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for streams_processor in self.processors:
            account = streams_processor.node.account
            with account.monitor_log(
                    streams_processor.STDOUT_FILE) as stdout_monitor:
                streams_processor.stop()
                stdout_monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(account))

        self.driver.stop()

    @matrix(from_version=metadata_1_versions,
            to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions,
            to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions,
            to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Start three Kafka Streams instances on ``from_version`` and move them to
        ``to_version`` with two rounds of rolling bounces.

        The first round passes ``from_version[:-2]`` as ``upgrade_from``; the
        second round passes ``None``. Bounce numbers run on across both rounds.
        """
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(
            self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # first rolling bounce
        # NOTE(review): [:-2] presumably strips a single-digit bugfix suffix
        # (e.g. "x.y.z" -> "x.y") -- confirm for multi-digit suffixes
        random.shuffle(self.processors)
        for bounce_number, streams_processor in enumerate(self.processors, 1):
            streams_processor.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(streams_processor, from_version[:-2],
                                      to_version, bounce_number)

        # second rolling bounce; numbering continues after the first round
        random.shuffle(self.processors)
        second_round_start = len(self.processors) + 1
        for bounce_number, streams_processor in enumerate(
                self.processors, second_round_start):
            self.do_stop_start_bounce(streams_processor, None, to_version,
                                      bounce_number)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for streams_processor in self.processors:
            account = streams_processor.node.account
            with account.monitor_log(
                    streams_processor.STDOUT_FILE) as stdout_monitor:
                streams_processor.stop()
                stdout_monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(account))

        self.driver.stop()

    def test_version_probing_upgrade(self):
        """
        Start three Kafka Streams instances and bounce each one once through
        ``do_rolling_bounce`` (the "future version" upgrade path).

        The generation returned by one bounce is fed into the next.
        """
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(
            self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("")  # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        # independent list: do_rolling_bounce removes entries from it
        self.old_processors = list(self.processors)
        self.upgraded_processors = []

        # update_leader() selects the processor whose log holds exactly
        # leader_counter + 1 "Finished assignment" lines, i.e. 3 here
        self.leader_counter.update(dict.fromkeys(self.processors, 2))
        self.update_leader()

        # reset the bookkeeping, keeping the detected leader at 3
        self.leader_counter.update(dict.fromkeys(self.processors, 0))
        self.leader_counter[self.leader] = 3

        generation = 3

        random.seed()
        random.shuffle(self.processors)

        for bounce_number, streams_processor in enumerate(self.processors, 1):
            streams_processor.CLEAN_NODE_ENABLED = False
            generation = self.do_rolling_bounce(streams_processor,
                                                bounce_number, generation)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for streams_processor in self.processors:
            account = streams_processor.node.account
            with account.monitor_log(
                    streams_processor.STDOUT_FILE) as stdout_monitor:
                streams_processor.stop()
                stdout_monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(account))

        self.driver.stop()

    def update_leader(self):
        """
        Work out which processor is the current leader and store it in
        ``self.leader``.

        A processor qualifies when its LOG_FILE contains exactly one more
        "Finished assignment for group" line than its ``leader_counter`` entry;
        that entry is then incremented. Up to 10 scans are made, sleeping 5
        seconds after each scan that finds nobody.

        :raises Exception: if two processors qualify in the same scan, or if
            no processor qualifies in any scan
        """
        self.leader = None
        for _attempt in range(10):
            for candidate in self.processors:
                assignment_lines = list(
                    candidate.node.account.ssh_capture(
                        "grep \"Finished assignment for group\" %s" %
                        candidate.LOG_FILE,
                        allow_fail=True))
                if len(assignment_lines) != self.leader_counter[candidate] + 1:
                    continue
                if self.leader is not None:
                    raise Exception("Could not uniquely identify leader")
                self.leader = candidate
                self.leader_counter[candidate] += 1

            if self.leader is not None:
                return
            time.sleep(5)

        raise Exception("Could not identify leader")

    def start_all_nodes_with(self, version):
        """
        Start processor1, processor2 and processor3 in that order on ``version``.

        After each start the new instance's log must show the
        "Kafka version : <version>" line, and every instance started so far
        must print the "processed 100 records" marker on stdout. Each wait
        times out after 60 seconds.

        :param version: version string passed to ``prepare_for`` and expected
            in the log
        """
        processed_marker = "processed 100 records from topic"
        missing_marker_prefix = "Never saw output 'processed 100 records from topic' on"
        missing_version_prefix = "Could not detect Kafka Streams version "

        # start first with <version>
        self.prepare_for(self.processor1, version)
        account1 = self.processor1.node.account
        with account1.monitor_log(self.processor1.STDOUT_FILE) as stdout1, \
                account1.monitor_log(self.processor1.LOG_FILE) as new_log:
            self.processor1.start()
            new_log.wait_until(
                "Kafka version : " + version,
                timeout_sec=60,
                err_msg=missing_version_prefix + version + " " +
                str(account1))
            stdout1.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account1))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        account2 = self.processor2.node.account
        with account1.monitor_log(self.processor1.STDOUT_FILE) as stdout1, \
                account2.monitor_log(self.processor2.STDOUT_FILE) as stdout2, \
                account2.monitor_log(self.processor2.LOG_FILE) as new_log:
            self.processor2.start()
            new_log.wait_until(
                "Kafka version : " + version,
                timeout_sec=60,
                err_msg=missing_version_prefix + version + " " +
                str(account2))
            stdout1.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account1))
            stdout2.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account2))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        account3 = self.processor3.node.account
        with account1.monitor_log(self.processor1.STDOUT_FILE) as stdout1, \
                account2.monitor_log(self.processor2.STDOUT_FILE) as stdout2, \
                account3.monitor_log(self.processor3.STDOUT_FILE) as stdout3, \
                account3.monitor_log(self.processor3.LOG_FILE) as new_log:
            self.processor3.start()
            new_log.wait_until(
                "Kafka version : " + version,
                timeout_sec=60,
                err_msg=missing_version_prefix + version + " " +
                str(account3))
            stdout1.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account1))
            stdout2.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account2))
            stdout3.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account3))

    @staticmethod
    def prepare_for(processor, version):
        """
        Wipe the processor's PERSISTENT_ROOT directory and select its version.

        :param processor: service whose node is cleaned and whose version is set
        :param version: version string; a value equal to ``str(DEV_VERSION)``
            is mapped to the empty string (TRUNK)
        """
        processor.node.account.ssh(
            "rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        # the empty string stands for TRUNK
        effective_version = "" if version == str(DEV_VERSION) else version
        processor.set_version(effective_version)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
        """
        Stop ``processor``, archive its output files and restart it on
        ``new_version``.

        While the processor is down, and again after it is back, the two other
        processors must print the "processed 100 records" marker. After the
        restart their STDERR files are also grepped for the "unable to decode
        subscription data: version=2" error, which fails the test.

        :param processor: the instance to bounce
        :param upgrade_from: value passed to ``set_upgrade_from``; ``None``
            marks the second round of rolling bounces
        :param new_version: version to restart with; ``str(DEV_VERSION)`` is
            mapped to the empty string (TRUNK)
        :param counter: bounce number, used in the archived file names
        :raises Exception: if a peer logged the subscription decode error
        """
        peers = [p for p in self.processors if p != processor]
        # same picks as a linear scan: the first peer, and the last remaining one
        first_other_processor = peers[0] if peers else None
        second_other_processor = peers[-1] if len(peers) > 1 else None

        account = processor.node.account
        first_other_account = first_other_processor.node.account
        second_other_account = second_other_processor.node.account

        processed_marker = "processed 100 records from topic"
        missing_marker_prefix = "Never saw output 'processed 100 records from topic' on"

        # stop processor and wait for rebalance of others
        with first_other_account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_account.monitor_log(
                    second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.stop()
            first_other_monitor.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(first_other_account))
            second_other_monitor.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(second_other_account))
        # NOTE(review): the value returned by ssh_capture is discarded here,
        # whereas the grep calls below wrap it in list(...). If the capture is
        # lazy, a failing grep may go unnoticed -- confirm.
        account.ssh_capture(
            "grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
            allow_fail=False)

        # ".1-": upgrade disabled, i.e. second round of rolling bounces;
        # ".0-": first round
        roll_counter = ".1-" if upgrade_from is None else ".0-"
        archive_suffix = roll_counter + str(counter)

        # move the previous run's output out of the way
        for path in (processor.STDOUT_FILE, processor.STDERR_FILE,
                     processor.LOG_FILE):
            account.ssh("mv " + path + " " + path + archive_suffix,
                        allow_fail=False)

        # the empty string stands for TRUNK
        processor.set_version(
            "" if new_version == str(DEV_VERSION) else new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with account.monitor_log(processor.STDOUT_FILE) as monitor, \
                account.monitor_log(processor.LOG_FILE) as log_monitor, \
                first_other_account.monitor_log(
                    first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_account.monitor_log(
                    second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.start()

            log_monitor.wait_until(
                "Kafka version : " + new_version,
                timeout_sec=60,
                err_msg="Could not detect Kafka Streams version " +
                new_version + " " + str(account))

            first_other_monitor.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(first_other_account))
            decode_errors = list(
                first_other_account.ssh_capture(
                    grep_metadata_error + first_other_processor.STDERR_FILE,
                    allow_fail=True))
            if decode_errors:
                raise Exception(
                    "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                )

            second_other_monitor.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(second_other_account))
            decode_errors = list(
                second_other_account.ssh_capture(
                    grep_metadata_error + second_other_processor.STDERR_FILE,
                    allow_fail=True))
            if decode_errors:
                raise Exception(
                    "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                )

            monitor.wait_until(
                processed_marker,
                timeout_sec=60,
                err_msg=missing_marker_prefix + str(account))

    def do_rolling_bounce(self, processor, counter, current_generation):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        with first_other_node.account.monitor_log(
                first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                         processor.STDOUT_FILE,
                                         allow_fail=False)

                node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                                 processor.STDOUT_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.STDERR_FILE + " " +
                                 processor.STDERR_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.LOG_FILE + " " +
                                 processor.LOG_FILE + "." + str(counter),
                                 allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(
                        processor.LOG_FILE) as log_monitor:
                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    log_monitor.wait_until(
                        "Kafka version : " + str(DEV_VERSION),
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        str(DEV_VERSION) + " in " + str(node.account))
                    log_monitor.offset = 5
                    log_monitor.wait_until(
                        "partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect FutureStreamsPartitionAssignor in " +
                        str(node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if processor == self.leader:
                        leader_monitor = log_monitor
                    elif first_other_processor == self.leader:
                        leader_monitor = first_other_monitor
                    elif second_other_processor == self.leader:
                        leader_monitor = second_other_monitor
                    else:
                        raise Exception("Could not identify leader.")

                    monitors = {}
                    monitors[processor] = log_monitor
                    monitors[first_other_processor] = first_other_monitor
                    monitors[second_other_processor] = second_other_monitor

                    leader_monitor.wait_until(
                        "Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'version probing' attempt at leader "
                        + str(self.leader.node.account))

                    if len(self.old_processors) > 0:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing' at upgrading node "
                            + str(node.account))
                    else:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing with upgraded leader' at upgrading node "
                            + str(node.account))
                        first_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(first_other_node.account))
                        second_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(second_other_node.account))

                    log_monitor.wait_until(
                        "Version probing detected. Triggering new rebalance.",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'Triggering new rebalance' at upgrading node "
                        + str(node.account))

                    # version probing should trigger second rebalance
                    # now we check that after consecutive rebalances we have synchronized generation
                    generation_synchronized = False
                    retries = 0

                    while retries < 10:
                        processor_found = self.extract_generation_from_logs(
                            processor)
                        first_other_processor_found = self.extract_generation_from_logs(
                            first_other_processor)
                        second_other_processor_found = self.extract_generation_from_logs(
                            second_other_processor)

                        if len(processor_found) > 0 and len(
                                first_other_processor_found) > 0 and len(
                                    second_other_processor_found) > 0:
                            self.logger.info("processor: " +
                                             str(processor_found))
                            self.logger.info("first other processor: " +
                                             str(first_other_processor_found))
                            self.logger.info("second other processor: " +
                                             str(second_other_processor_found))

                            processor_generation = self.extract_highest_generation(
                                processor_found)
                            first_other_processor_generation = self.extract_highest_generation(
                                first_other_processor_found)
                            second_other_processor_generation = self.extract_highest_generation(
                                second_other_processor_found)

                            if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation:
                                current_generation = processor_generation
                                generation_synchronized = True
                                break

                        time.sleep(5)
                        retries = retries + 1

                    if generation_synchronized == False:
                        raise Exception(
                            "Never saw all three processors have the synchronized generation number"
                        )

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if self.leader in self.old_processors or len(
                            self.old_processors) > 0:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation

    def extract_generation_from_logs(self, processor):
        """
        Return the group generations found in a processor's log file.

        The remote command greps the processor's LOG_FILE for
        "Successfully joined group with generation" lines and, via awk,
        prints the tokens located between the word "generation" and the
        "(...AbstractCoordinator)" logger token, one output line per match.
        The captured output lines are returned as a list of strings.
        """
        # kept as one literal so the shell/awk quoting stays exactly as it was
        grep_and_awk = "grep \"Successfully joined group with generation\" %s| awk \'{for(i=1;i<=NF;i++) {if ($i == \"generation\") beginning=i+1; if($i== \"(org.apache.kafka.clients.consumer.internals.AbstractCoordinator)\") ending=i }; for (j=beginning;j<ending;j++) printf $j; printf \"\\n\"}\'"
        account = processor.node.account
        captured = account.ssh_capture(grep_and_awk % processor.LOG_FILE,
                                       allow_fail=True)
        return list(captured)

    def extract_highest_generation(self, found_generations):
        """
        Convert the final entry of ``found_generations`` to an int.

        Only the last element is looked at; presumably the entries arrive in
        log order, so the last one is the most recent generation.
        """
        most_recent = found_generations[-1]
        return int(most_recent)

    def verify_metadata_no_upgraded_yet(self):
        """
        Fail if any processor already announced the metadata upgrade to version 5.

        Every processor's LOG_FILE is grepped for the message a member logs
        when it decides to move its subscription metadata from version 4 to
        version 5 for the next rebalance.

        :raises Exception: if that message is present in any processor's log
        """
        upgrade_announcement = "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance."
        for p in self.processors:
            # allow_fail=True: an empty grep result is the expected, healthy case
            found = list(
                p.node.account.ssh_capture(
                    "grep \"" + upgrade_announcement + "\" " + p.LOG_FILE,
                    allow_fail=True))
            if found:
                # the grep above looks for the switch to version 5, so report 5
                raise Exception(
                    "Kafka Streams failed with 'group member upgraded to metadata 5 too early'"
                )
예제 #3
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combination)
    If metadata was changes, upgrade is more difficult
    Metadata version was bumped in 0.10.1.0
    """
    def __init__(self, test_context):
        """
        Initialise the base test and the default topic layout.

        ``test_context`` is handed unchanged to the parent constructor.
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        # default layout: 'echo' and 'data', five partitions each
        self.topics = dict(
            (topic, {'partitions': 5}) for topic in ('echo', 'data'))
        self.leader = None

    def perform_broker_upgrade(self, to_version):
        """
        Bounce every broker, one at a time, onto ``to_version``.

        Each node is stopped, re-tagged with ``KafkaVersion(to_version)`` and
        started again before the next node is touched.
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        kafka = self.kafka
        for broker in kafka.nodes:
            kafka.stop_node(broker)
            broker.version = KafkaVersion(to_version)
            kafka.start_node(broker)

    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions,
            to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.

        Nothing is done when ``from_version`` equals ``to_version``.
        Otherwise one ZooKeeper node and three brokers are started on
        ``from_version``, the smoke-test driver and one processor are
        started, the brokers are bounced onto ``to_version``, and finally the
        driver's stdout must contain ALL-RECORDS-DELIVERED and the
        processor's stdout must contain SMOKE-TEST-CLIENT-CLOSED.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2

        # every smoke-test topic uses the same layout; each topic still gets
        # its own (non-shared) settings dicts
        smoke_test_topics = ('echo', 'data', 'min', 'max', 'sum', 'dif',
                             'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = dict(
            (name, {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }) for name in smoke_test_topics)

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                         self.driver.STDOUT_FILE,
                         allow_fail=False)
        # ssh (not ssh_capture): ssh_capture only reports the command's exit
        # status once its output is iterated, so an unconsumed ssh_capture
        # would never fail the test when the marker is missing
        self.processor1.node.account.ssh(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Start three Kafka Streams instances on <from_version> and move them,
        one at a time and in random order, to <to_version>.

        Identical versions are skipped. After the single round of rolling
        bounces the driver is stopped and every instance must print
        UPGRADE-TEST-CLIENT-CLOSED when it is stopped.
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(
            self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # upgrade one-by-one via rolling bounce; bounces are numbered from 1
        random.shuffle(self.processors)
        for bounce_number, instance in enumerate(self.processors, 1):
            instance.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(instance, None, to_version, bounce_number)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for instance in self.processors:
            instance_node = instance.node
            stdout_watch = instance_node.account.monitor_log(
                instance.STDOUT_FILE)
            with stdout_watch as monitor:
                instance.stop()
                closed_err = ("Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                              + str(instance_node.account))
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg=closed_err)

        self.driver.stop()

    #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Start three Kafka Streams instances on <from_version> and upgrade
        them to <to_version> with two rounds of rolling bounces.

        The first round passes an upgrade_from value derived from
        <from_version>; the second round passes None. Afterwards the driver
        is stopped and every instance must print UPGRADE-TEST-CLIENT-CLOSED
        when it is stopped.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(
            self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        # running number shared by both rounds
        bounce_number = 0
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for instance in self.processors:
            bounce_number += 1
            instance.CLEAN_NODE_ENABLED = False
            # [:-2] cuts the last two characters of the version string,
            # e.g. "0.10.0.1" -> "0.10.0" (a single-digit patch suffix)
            self.do_rolling_bounce(instance, from_version[:-2], to_version,
                                   bounce_number)

        # second rolling bounce
        random.shuffle(self.processors)
        for instance in self.processors:
            bounce_number += 1
            self.do_rolling_bounce(instance, None, to_version, bounce_number)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for instance in self.processors:
            instance_node = instance.node
            stdout_watch = instance_node.account.monitor_log(
                instance.STDOUT_FILE)
            with stdout_watch as monitor:
                instance.stop()
                closed_err = ("Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                              + str(instance_node.account))
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg=closed_err)

        self.driver.stop()

    def start_all_nodes_with(self, version):
        """
        Bring up processor1, processor2 and processor3 on ``version``.

        The processors are started one after another. After each start the
        new processor's log must show "Kafka version : <version>", and every
        processor started so far (including the new one) must report
        "processed 100 records from topic" on stdout. Each wait uses a
        60 second timeout.
        """

        def await_version(log_watch, account):
            # the freshly started instance has to announce the expected version
            log_watch.wait_until(
                "Kafka version : " + version,
                timeout_sec=60,
                err_msg="Could not detect Kafka Streams version " + version +
                " " + str(account))

        def await_progress(stdout_watch, account):
            # the instance has to make progress (again) after the rebalance
            stdout_watch.wait_until(
                "processed 100 records from topic",
                timeout_sec=60,
                err_msg="Never saw output 'processed 100 records from topic' on"
                + str(account))

        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor, \
                node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
            self.processor1.start()
            await_version(log_monitor, node1.account)
            await_progress(monitor, node1.account)

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
            self.processor2.start()
            await_version(log_monitor, node2.account)
            await_progress(first_monitor, node1.account)
            await_progress(second_monitor, node2.account)

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor, \
                node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
            self.processor3.start()
            await_version(log_monitor, node3.account)
            await_progress(first_monitor, node1.account)
            await_progress(second_monitor, node2.account)
            await_progress(third_monitor, node3.account)

    @staticmethod
    def prepare_for(processor, version):
        """
        Remove the processor's PERSISTENT_ROOT on its node and select the
        version it should run.
        """
        processor.node.account.ssh(
            "rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        # the dev version is requested with an empty version string (TRUNK)
        selected = "" if version == str(DEV_VERSION) else version
        processor.set_version(selected)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
        """
        Restart one processor on ``new_version`` while the other two keep running.

        Steps:
          1. stop ``processor`` and wait until both other processors report
             "processed 100 records from topic" again;
          2. require UPGRADE-TEST-CLIENT-CLOSED in the stopped processor's stdout;
          3. move its stdout, stderr and log file aside, suffixed with
             ".0-<counter>" when ``upgrade_from`` is given and ".1-<counter>"
             when it is None;
          4. set the new version and ``upgrade_from`` and start it again;
          5. wait for the version line in its log, for both other processors
             to make progress without the "unable to decode subscription
             data: version=2" error in their stderr, and finally for the
             bounced processor itself to make progress.

        :param processor: the member of ``self.processors`` to bounce
        :param upgrade_from: value passed to ``processor.set_upgrade_from``;
                             None marks the second round of bounces
        :param new_version: version to restart on; ``str(DEV_VERSION)`` is
                            mapped to an empty version string (TRUNK)
        :param counter: number appended to the rotated file names
        :raises Exception: if another processor's stderr contains the
                           subscription decode error
        """
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        progress_marker = "processed 100 records from topic"

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.stop()
            first_other_monitor.wait_until(
                progress_marker,
                timeout_sec=60,
                err_msg=
                "Never saw output 'processed 100 records from topic' on" +
                str(first_other_node.account))
            second_other_monitor.wait_until(
                progress_marker,
                timeout_sec=60,
                err_msg=
                "Never saw output 'processed 100 records from topic' on" +
                str(second_other_node.account))

        # ssh (not ssh_capture): ssh_capture only reports the command's exit
        # status once its output is iterated, so an unconsumed ssh_capture
        # would never fail the test when the marker is missing
        node.account.ssh("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                         processor.STDOUT_FILE,
                         allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        # keep the output of the previous run by moving each file aside
        for path in (processor.STDOUT_FILE, processor.STDERR_FILE,
                     processor.LOG_FILE):
            node.account.ssh("mv " + path + " " + path + roll_counter +
                             str(counter),
                             allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor, \
                node.account.monitor_log(processor.LOG_FILE) as log_monitor, \
                first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.start()

            log_monitor.wait_until(
                "Kafka version : " + new_version,
                timeout_sec=60,
                err_msg="Could not detect Kafka Streams version " +
                new_version + " " + str(node.account))

            # each remaining member must resume processing and must not have
            # failed to decode the bounced member's subscription
            for other, other_monitor in (
                    (first_other_processor, first_other_monitor),
                    (second_other_processor, second_other_monitor)):
                other_monitor.wait_until(
                    progress_marker,
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on"
                    + str(other.node.account))
                found = list(
                    other.node.account.ssh_capture(
                        grep_metadata_error + other.STDERR_FILE,
                        allow_fail=True))
                if found:
                    raise Exception(
                        "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                    )

            monitor.wait_until(
                progress_marker,
                timeout_sec=60,
                err_msg=
                "Never saw output 'processed 100 records from topic' on"
                + str(node.account))
class StreamsUpgradeTest(KafkaTest):
    """
    Test upgrading Kafka Streams (all version combination)
    If metadata was changes, upgrade is more difficult
    Metadata version was bumped in 0.10.1.0
    """
    def __init__(self, test_context):
        """
        Configure one ZooKeeper node, one broker, the 'echo' and 'data'
        topics, the smoke-test driver and three upgrade-test processors.
        """
        upgrade_topics = {
            'echo': {'partitions': 5},
            'data': {'partitions': 5},
        }
        super(StreamsUpgradeTest, self).__init__(
            test_context, num_zk=1, num_brokers=1, topics=upgrade_topics)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.driver.disable_auto_terminate()

        # three separate runner services, all attached to the same cluster
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            test_context, self.kafka)

    @parametrize(old_version=str(LATEST_0_10_1),
                 new_version=str(LATEST_0_10_2))
    @parametrize(old_version=str(LATEST_0_10_1), new_version=str(DEV_VERSION))
    @parametrize(old_version=str(LATEST_0_10_2), new_version=str(DEV_VERSION))
    def test_simple_upgrade(self, old_version, new_version):
        """
        Start three Kafka Streams instances on <old_version> and upgrade
        them, one at a time and in random order, to <new_version>.

        Afterwards the driver is stopped and every instance must print
        UPGRADE-TEST-CLIENT-CLOSED when it is stopped.
        """

        self.driver.start()
        self.start_all_nodes_with(old_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # single round of bounces, numbered from 1; upgrade_from stays empty
        random.shuffle(self.processors)
        for bounce_number, instance in enumerate(self.processors, 1):
            instance.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(instance, "", new_version, bounce_number)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for instance in self.processors:
            instance_node = instance.node
            stdout_watch = instance_node.account.monitor_log(
                instance.STDOUT_FILE)
            with stdout_watch as monitor:
                instance.stop()
                closed_err = ("Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                              + str(instance_node.account))
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg=closed_err)

        self.driver.stop()

    #@parametrize(new_version=str(LATEST_0_10_1)) we cannot run this test until Kafka 0.10.1.2 is released
    #@parametrize(new_version=str(LATEST_0_10_2)) we cannot run this test until Kafka 0.10.2.2 is released
    @parametrize(new_version=str(DEV_VERSION))
    def test_metadata_upgrade(self, new_version):
        """
        Start three Kafka Streams instances on version 0.10.0 and upgrade
        them to <new_version> with two rounds of rolling bounces.

        The first round passes "0.10.0" as upgrade_from, the second round an
        empty string. Afterwards the driver is stopped and every instance
        must print UPGRADE-TEST-CLIENT-CLOSED when it is stopped.
        """

        self.driver.start()
        self.start_all_nodes_with(str(LATEST_0_10_0))

        self.processors = [self.processor1, self.processor2, self.processor3]

        # running number shared by both rounds
        bounce_number = 0
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for instance in self.processors:
            bounce_number += 1
            instance.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(instance, "0.10.0", new_version,
                                   bounce_number)

        # second rolling bounce
        random.shuffle(self.processors)
        for instance in self.processors:
            bounce_number += 1
            self.do_rolling_bounce(instance, "", new_version, bounce_number)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for instance in self.processors:
            instance_node = instance.node
            stdout_watch = instance_node.account.monitor_log(
                instance.STDOUT_FILE)
            with stdout_watch as monitor:
                instance.stop()
                closed_err = ("Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                              + str(instance_node.account))
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg=closed_err)

        self.driver.stop()

    def start_all_nodes_with(self, version):
        """
        Start the three Streams processors one after another on <version>.

        Every processor is first reset through prepare_for(). After each
        start, wait (up to 60 seconds per check) until the new processor's log
        reports "Kafka version : <version>" and until it and every processor
        started before it print "processed 100 records from topic" on stdout.

        :param version: version string every processor is started with
        """
        processed = "processed 100 records from topic"

        def never_saw(account):
            # the trailing space keeps the account from being glued onto "on"
            return "Never saw output '" + processed + "' on " + str(account)

        def version_not_detected(account):
            return "Could not detect Kafka Streams version " + version + " " + str(account)

        # NOTE(review): the monitors are opened before start(); presumably
        # monitor_log only matches output appended after it was opened --
        # confirm against the ducktape documentation.

        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
            self.processor1.start()
            log_monitor.wait_until("Kafka version : " + version,
                                   timeout_sec=60,
                                   err_msg=version_not_detected(node1.account))
            first_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
            self.processor2.start()
            log_monitor.wait_until("Kafka version : " + version,
                                   timeout_sec=60,
                                   err_msg=version_not_detected(node2.account))
            first_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node1.account))
            second_monitor.wait_until(processed,
                                      timeout_sec=60,
                                      err_msg=never_saw(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor, \
                node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
            self.processor3.start()
            log_monitor.wait_until("Kafka version : " + version,
                                   timeout_sec=60,
                                   err_msg=version_not_detected(node3.account))
            first_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node1.account))
            second_monitor.wait_until(processed,
                                      timeout_sec=60,
                                      err_msg=never_saw(node2.account))
            third_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        """
        Reset a processor before it is started on <version>.

        Removes the processor's PERSISTENT_ROOT on its node (the command is
        not allowed to fail) and then sets the version the processor runs.

        :param processor: service whose node is cleaned and whose version is set
        :param version: version string handed to processor.set_version()
        """
        wipe_command = " ".join(["rm -rf", processor.PERSISTENT_ROOT])
        account = processor.node.account
        account.ssh(wipe_command, allow_fail=False)
        processor.set_version(version)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
        """
        Bounce one Streams processor onto <new_version> while the others keep running.

        Steps:
          1. stop <processor> and wait until both other processors report
             "processed 100 records from topic" again,
          2. verify the stopped processor printed UPGRADE-TEST-CLIENT-CLOSED,
          3. rename its stdout/stderr/log files so the next run starts with
             fresh files,
          4. set the new version and upgrade_from value, restart it, and wait
             until all three processors make progress, failing if either of
             the other processors logged the "unable to decode subscription
             data: version=2" error.

        :param processor: the processor to bounce; must be in self.processors
        :param upgrade_from: value passed to processor.set_upgrade_from(); the
                             empty string marks the second round of bounces
        :param new_version: version the processor is restarted with
        :param counter: number appended to the renamed output files
        :raises Exception: if another processor hit the metadata decoding error
        """
        processed = "processed 100 records from topic"

        def never_saw(account):
            # the trailing space keeps the account from being glued onto "on"
            return "Never saw output '" + processed + "' on " + str(account)

        # first non-matching processor and last non-matching processor
        others = [p for p in self.processors if p != processor]
        first_other_processor = others[0] if others else None
        second_other_processor = others[-1] if len(others) > 1 else None

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.stop()
            first_other_monitor.wait_until(processed,
                                           timeout_sec=60,
                                           err_msg=never_saw(first_other_node.account))
            second_other_monitor.wait_until(processed,
                                            timeout_sec=60,
                                            err_msg=never_saw(second_other_node.account))

        # ssh() is used instead of ssh_capture(): the result of ssh_capture()
        # was never consumed here, and ducktape only checks the exit status of
        # a captured command while its output is being read, so a missing
        # marker went unnoticed.
        node.account.ssh("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
                         allow_fail=False)

        if upgrade_from == "":  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        # move the old output aside, in the order stdout, stderr, log
        suffix = roll_counter + str(counter)
        for path in (processor.STDOUT_FILE, processor.STDERR_FILE, processor.LOG_FILE):
            node.account.ssh("mv " + path + " " + path + suffix, allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "

        def fail_on_metadata_error(other_node, other_processor):
            # allow_fail=True: grep exits non-zero when nothing matched, which is the good case
            found = list(other_node.account.ssh_capture(grep_metadata_error + other_processor.STDERR_FILE,
                                                        allow_fail=True))
            if found:
                raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

        with node.account.monitor_log(processor.STDOUT_FILE) as monitor, \
                node.account.monitor_log(processor.LOG_FILE) as log_monitor, \
                first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.start()

            log_monitor.wait_until("Kafka version : " + new_version,
                                   timeout_sec=60,
                                   err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account))

            first_other_monitor.wait_until(processed,
                                           timeout_sec=60,
                                           err_msg=never_saw(first_other_node.account))
            fail_on_metadata_error(first_other_node, first_other_processor)

            second_other_monitor.wait_until(processed,
                                            timeout_sec=60,
                                            err_msg=never_saw(second_other_node.account))
            fail_on_metadata_error(second_other_node, second_other_processor)

            monitor.wait_until(processed,
                               timeout_sec=60,
                               err_msg=never_saw(node.account))
예제 #5
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combination)
    If metadata was changes, upgrade is more difficult
    Metadata version was bumped in 0.10.1.0
    """

    def __init__(self, test_context):
        """
        Initialise the test with its default topics and no known leader.

        :param test_context: context object forwarded to the base class
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.leader = None
        # both default topics use five partitions
        self.topics = dict((name, {'partitions': 5}) for name in ('echo', 'data'))

    def perform_broker_upgrade(self, to_version):
        """
        Roll every broker of self.kafka onto <to_version>, one node at a time.

        Each node is stopped, given the new version and started again before
        the next node is touched.

        :param to_version: version string wrapped in KafkaVersion for each node
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        cluster_service = self.kafka
        for broker in cluster_service.nodes:
            cluster_service.stop_node(broker)
            broker.version = KafkaVersion(to_version)
            cluster_service.start_node(broker)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.

        Brings up one ZooKeeper node and three brokers on <from_version>,
        starts the smoke test driver plus one processor, rolls the brokers to
        <to_version> and finally checks that the driver reported
        ALL-RECORDS-DELIVERED and the processor SMOKE-TEST-CLIENT-CLOSED.
        Combinations with identical versions are skipped.

        :param from_version: broker version the cluster starts with
        :param to_version: broker version the cluster is rolled to
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        # every smoke test topic shares the same settings; each gets its own dicts
        self.topics = {
            topic: {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}}
            for topic in ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        # ssh() instead of ssh_capture(): the captured output was never read,
        # and ducktape only checks a captured command's exit status while its
        # output is consumed, so a missing marker could not fail the test.
        self.processor1.node.account.ssh("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)

    @ignore
    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <from_version>, and upgrades one-by-one to <to_version>

        Combinations with identical versions are skipped. Every processor is
        bounced once, in random order, with upgrade_from disabled (None).
        Afterwards the driver is stopped and each processor must print
        UPGRADE-TEST-CLIENT-CLOSED within 60 seconds of being stopped.

        :param from_version: version all three instances start with
        :param to_version: version the instances are bounced to
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # upgrade one-by-one via rolling bounce; the counter (1, 2, 3) only
        # labels the output files that do_rolling_bounce moves aside
        random.shuffle(self.processors)
        for counter, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, None, to_version, counter)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                # trailing space after "on" keeps the account readable in the message
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on " + str(node.account))

        self.driver.stop()

    #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @ignore
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version>

        Two rounds of rolling bounces are performed, each in random order:
        the first passes <from_version> minus its last two characters as
        upgrade_from, the second passes None (upgrade disabled). Afterwards
        the driver is stopped and each processor must print
        UPGRADE-TEST-CLIENT-CLOSED within 60 seconds of being stopped.

        :param from_version: version all three instances start with
        :param to_version: version the instances are bounced to
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        # the counter keeps increasing across both rounds
        counter = 1
        random.seed()

        # first rolling bounce
        # NOTE(review): from_version[:-2] presumably strips a ".<digit>" patch
        # suffix (e.g. "0.10.0.1" -> "0.10.0") -- confirm for multi-digit patches.
        upgrade_from = from_version[:-2]
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, upgrade_from, to_version, counter)
            counter += 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_rolling_bounce(p, None, to_version, counter)
            counter += 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        closed_marker = "UPGRADE-TEST-CLIENT-CLOSED"
        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                # trailing space after "on" keeps the account readable in the message
                monitor.wait_until(closed_marker,
                                   timeout_sec=60,
                                   err_msg="Never saw output '" + closed_marker + "' on " + str(node.account))

        self.driver.stop()

    def start_all_nodes_with(self, version):
        """
        Start the three Streams processors one after another on <version>.

        Every processor is first reset through prepare_for(). After each
        start, wait (up to 60 seconds per check) until the new processor's log
        reports "Kafka version : <version>" and until it and every processor
        started before it print "processed 100 records from topic" on stdout.

        :param version: version string every processor is started with
        """
        processed = "processed 100 records from topic"

        def never_saw(account):
            # the trailing space keeps the account from being glued onto "on"
            return "Never saw output '" + processed + "' on " + str(account)

        def version_not_detected(account):
            return "Could not detect Kafka Streams version " + version + " " + str(account)

        # NOTE(review): the monitors are opened before start(); presumably
        # monitor_log only matches output appended after it was opened --
        # confirm against the ducktape documentation.

        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
            self.processor1.start()
            log_monitor.wait_until("Kafka version : " + version,
                                   timeout_sec=60,
                                   err_msg=version_not_detected(node1.account))
            first_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
            self.processor2.start()
            log_monitor.wait_until("Kafka version : " + version,
                                   timeout_sec=60,
                                   err_msg=version_not_detected(node2.account))
            first_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node1.account))
            second_monitor.wait_until(processed,
                                      timeout_sec=60,
                                      err_msg=never_saw(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor, \
                node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor, \
                node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor, \
                node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
            self.processor3.start()
            log_monitor.wait_until("Kafka version : " + version,
                                   timeout_sec=60,
                                   err_msg=version_not_detected(node3.account))
            first_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node1.account))
            second_monitor.wait_until(processed,
                                      timeout_sec=60,
                                      err_msg=never_saw(node2.account))
            third_monitor.wait_until(processed,
                                     timeout_sec=60,
                                     err_msg=never_saw(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        """
        Reset a processor before it is started on <version>.

        Removes the processor's PERSISTENT_ROOT on its node (the command is
        not allowed to fail) and then sets the version to run; the dev
        version is mapped to the empty string.

        :param processor: service whose node is cleaned and whose version is set
        :param version: requested version string
        """
        account = processor.node.account
        account.ssh(" ".join(["rm -rf", processor.PERSISTENT_ROOT]), allow_fail=False)
        # the empty string selects TRUNK
        effective_version = "" if version == str(DEV_VERSION) else version
        processor.set_version(effective_version)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
        """
        Bounce one Streams processor onto <new_version> while the others keep running.

        Steps:
          1. stop <processor> and wait until both other processors report
             "processed 100 records from topic" again,
          2. verify the stopped processor printed UPGRADE-TEST-CLIENT-CLOSED,
          3. rename its stdout/stderr/log files so the next run starts with
             fresh files,
          4. set the new version and upgrade_from value, restart it, and wait
             until all three processors make progress, failing if either of
             the other processors logged the "unable to decode subscription
             data: version=2" error.

        :param processor: the processor to bounce; must be in self.processors
        :param upgrade_from: value passed to processor.set_upgrade_from();
                             None marks the second round of bounces
        :param new_version: version the processor is restarted with
        :param counter: number appended to the renamed output files
        :raises Exception: if another processor hit the metadata decoding error
        """
        processed = "processed 100 records from topic"

        def never_saw(account):
            # the trailing space keeps the account from being glued onto "on"
            return "Never saw output '" + processed + "' on " + str(account)

        # first non-matching processor and last non-matching processor
        others = [p for p in self.processors if p != processor]
        first_other_processor = others[0] if others else None
        second_other_processor = others[-1] if len(others) > 1 else None

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.stop()
            first_other_monitor.wait_until(processed,
                                           timeout_sec=60,
                                           err_msg=never_saw(first_other_node.account))
            second_other_monitor.wait_until(processed,
                                            timeout_sec=60,
                                            err_msg=never_saw(second_other_node.account))

        # ssh() is used instead of ssh_capture(): the result of ssh_capture()
        # was never consumed here, and ducktape only checks the exit status of
        # a captured command while its output is being read, so a missing
        # marker went unnoticed.
        node.account.ssh("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        # move the old output aside, in the order stdout, stderr, log
        suffix = roll_counter + str(counter)
        for path in (processor.STDOUT_FILE, processor.STDERR_FILE, processor.LOG_FILE):
            node.account.ssh("mv " + path + " " + path + suffix, allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "

        def fail_on_metadata_error(other_node, other_processor):
            # allow_fail=True: grep exits non-zero when nothing matched, which is the good case
            found = list(other_node.account.ssh_capture(grep_metadata_error + other_processor.STDERR_FILE,
                                                        allow_fail=True))
            if found:
                raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

        with node.account.monitor_log(processor.STDOUT_FILE) as monitor, \
                node.account.monitor_log(processor.LOG_FILE) as log_monitor, \
                first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor, \
                second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
            processor.start()

            log_monitor.wait_until("Kafka version : " + new_version,
                                   timeout_sec=60,
                                   err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account))

            first_other_monitor.wait_until(processed,
                                           timeout_sec=60,
                                           err_msg=never_saw(first_other_node.account))
            fail_on_metadata_error(first_other_node, first_other_processor)

            second_other_monitor.wait_until(processed,
                                            timeout_sec=60,
                                            err_msg=never_saw(second_other_node.account))
            fail_on_metadata_error(second_other_node, second_other_processor)

            monitor.wait_until(processed,
                               timeout_sec=60,
                               err_msg=never_saw(node.account))
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combination)
    If metadata was changes, upgrade is more difficult
    Metadata version was bumped in 0.10.1.0 and
    subsequently bumped in 2.0.0
    """
    def __init__(self, test_context):
        """
        Initialise the test with its two default topics.

        :param test_context: context object forwarded to the base class
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        default_topics = {}
        for topic_name in ('echo', 'data'):
            # each topic gets its own settings dict with five partitions
            default_topics[topic_name] = {'partitions': 5}
        self.topics = default_topics

    # Pattern handed to wait_for_verification() to detect that a processor
    # has processed records.
    processed_msg = "processed [0-9]* records"
    # Text of DEV_VERSION up to (not including) its first "-".
    base_version_number = str(DEV_VERSION).partition("-")[0]

    def perform_broker_upgrade(self, to_version):
        """
        Roll every broker of self.kafka onto <to_version>, one node at a time.

        A node is stopped, switched to the new version and started again
        before the next node is handled.

        :param to_version: version string wrapped in KafkaVersion for each node
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        kafka_service = self.kafka
        for kafka_node in kafka_service.nodes:
            kafka_service.stop_node(kafka_node)
            kafka_node.version = KafkaVersion(to_version)
            kafka_service.start_node(kafka_node)

    @cluster(num_nodes=6)
    @matrix(from_version=smoke_test_versions,
            to_version=dev_version,
            bounce_type=["full"])
    def test_app_upgrade(self, from_version, to_version, bounce_type):
        """
        Starts 3 KafkaStreams smoke test instances with <from_version> and moves them to <to_version>.

        With bounce_type "rolling" the instances are bounced one-by-one in
        random order; with "full" all of them are restarted through
        restart_all_nodes_with(). Any other bounce_type raises an Exception.
        Combinations with identical versions are skipped. At the end each
        instance must print SMOKE-TEST-CLIENT-CLOSED within 60 seconds of
        being stopped.
        """
        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # every smoke test topic uses the same layout
        topic_names = ('echo', 'data', 'min', 'min-suppressed', 'min-raw',
                       'max', 'sum', 'sws-raw', 'sws-suppressed', 'dif',
                       'cnt', 'avg', 'wcnt', 'tagg')
        smoke_topics = {
            topic_name: {'partitions': 5, 'replication-factor': 1}
            for topic_name in topic_names
        }
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=smoke_topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()

        def new_processor():
            # all three job runners are configured identically
            return StreamsSmokeTestJobRunnerService(
                self.test_context,
                self.kafka,
                processing_guarantee="at_least_once",
                replication_factor=1)

        self.processor1 = new_processor()
        self.processor2 = new_processor()
        self.processor3 = new_processor()

        for fresh_processor in (self.processor1, self.processor2, self.processor3):
            self.purge_state_dir(fresh_processor)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        if bounce_type == "full":
            self.restart_all_nodes_with(to_version)
        elif bounce_type == "rolling":
            random.seed()
            # upgrade one-by-one via rolling bounce
            random.shuffle(self.processors)
            for counter, p in enumerate(self.processors, 1):
                p.CLEAN_NODE_ENABLED = False
                self.do_stop_start_bounce(p, None, to_version, counter)
        else:
            raise Exception("Unrecognized bounce_type: " + str(bounce_type))

        # shutdown
        self.driver.stop()

        # Ideally, we would actually verify the expected results.
        # See KAFKA-10202

        closed_marker = "SMOKE-TEST-CLIENT-CLOSED"
        random.shuffle(self.processors)
        for p in self.processors:
            account = p.node.account
            with account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    closed_marker,
                    timeout_sec=60,
                    err_msg="Never saw output '" + closed_marker + "' on " + str(account))

    def start_all_nodes_with(self, version):
        """
        Start processor1, processor2 and processor3 on ``version``.

        Each phase is applied to all three processors (in that order) before
        the next phase begins: set the version, start the service, look for
        the version string in the log file, then look for the client-started
        marker and finally a processed-records line on stdout.

        :param version: version string handed to set_version() and
                        get_version_string()
        """
        group = (self.processor1, self.processor2, self.processor3)

        for proc in group:
            self.set_version(proc, version)

        for proc in group:
            proc.start()

        # double-check the version
        expected_version = self.get_version_string(version)
        for proc in group:
            self.wait_for_verification(proc, expected_version, proc.LOG_FILE)

        # first wait for the members to join, then make sure they've
        # processed something
        for marker in ("SMOKE-TEST-CLIENT-STARTED", self.processed_msg):
            for proc in group:
                self.wait_for_verification(proc, marker, proc.STDOUT_FILE)

    def restart_all_nodes_with(self, version):
        """
        Stop all three processors, roll their logs and restart them on
        ``version``.

        Each phase is applied to processor1, processor2 and processor3 (in
        that order) before the next phase begins. After the restart the same
        checks as on the initial start are made: version string in the log
        file, client-started marker and a processed-records line on stdout.

        :param version: version string handed to set_version() and
                        get_version_string()
        """
        group = (self.processor1, self.processor2, self.processor3)

        for proc in group:
            proc.stop_node(proc.node)

        # make sure the members have stopped
        for proc in group:
            self.wait_for_verification(proc, "SMOKE-TEST-CLIENT-CLOSED",
                                       proc.STDOUT_FILE)

        # move the old output aside so the checks below only see new output
        for proc in group:
            self.roll_logs(proc, ".1-1")

        for proc in group:
            self.set_version(proc, version)

        for proc in group:
            proc.start_node(proc.node)

        # double-check the version
        expected_version = self.get_version_string(version)
        for proc in group:
            self.wait_for_verification(proc, expected_version, proc.LOG_FILE)

        # first wait for the members to join, then make sure they've
        # processed something
        for marker in ("SMOKE-TEST-CLIENT-STARTED", self.processed_msg):
            for proc in group:
                self.wait_for_verification(proc, marker, proc.STDOUT_FILE)

    def get_version_string(self, version):
        """
        Build the pattern used to find the "Kafka version" line in a log.

        Versions starting with "0", "1", "2.0" or "2.1" get the
        "Kafka version : <version>" form (space before the colon). Any other
        version containing "SNAPSHOT" gets a pattern built from
        ``self.base_version_number``; everything else gets
        "Kafka version: <version>".

        :param version: version string to look for
        :return: the text/pattern the callers search the log for
        """
        legacy_prefixes = ("0", "1", "2.0", "2.1")
        if version.startswith(legacy_prefixes):
            return "Kafka version : " + version
        if "SNAPSHOT" in version:
            return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT"
        return "Kafka version: " + version

    def wait_for_verification(self, processor, message, file, num_lines=1):
        """
        Block until ``file`` on the processor's node has at least
        ``num_lines`` lines matching ``message``.

        The match count comes from verify_from_file(); the wait is handed to
        wait_until() with a 60 second timeout.

        :param processor: service whose node is inspected
        :param message: pattern to look for
        :param file: path of the file on the node
        :param num_lines: minimum number of matching lines required
        """
        def enough_matches():
            return self.verify_from_file(processor, message, file) >= num_lines

        failure_text = "Did expect to read '%s' from %s" % (
            message, processor.node.account)
        wait_until(enough_matches, timeout_sec=60, err_msg=failure_text)

    def verify_from_file(self, processor, message, file):
        """
        Count the lines of ``file`` on the processor's node that match
        ``message``.

        Runs ``grep -E '<message>' <file> | wc -l`` on the node and parses
        the output as an integer.

        :param processor: service whose node is inspected
        :param message: extended regular expression passed to ``grep -E``
        :param file: path of the file on the node
        :return: number of matching lines, or 0 if the command output could
                 not be parsed as an integer
        """
        count_cmd = "grep -E '%s' %s | wc -l" % (message, file)
        result = processor.node.account.ssh_output(count_cmd, allow_fail=False)
        try:
            return int(result)
        except ValueError:
            # Logger.warn is a deprecated alias of Logger.warning. Passing the
            # output as a lazy %-argument (instead of concatenating it) keeps
            # this path from raising TypeError should ssh_output return bytes.
            # NOTE(review): assumes self.logger is a standard logging.Logger.
            self.logger.warning("Command failed with ValueError: %s", result)
            return 0

    def set_version(self, processor, version):
        """
        Configure ``processor`` to run ``version``.

        When ``version`` equals str(DEV_VERSION) the processor is given the
        empty string, which selects TRUNK.
        """
        # "" means: set to TRUNK
        effective = "" if version == str(DEV_VERSION) else version
        processor.set_version(effective)

    def purge_state_dir(self, processor):
        """
        Remove the processor's PERSISTENT_ROOT directory on its node
        (``rm -rf``); the ssh call is made with allow_fail=False.
        """
        account = processor.node.account
        wipe_cmd = "rm -rf " + processor.PERSISTENT_ROOT
        account.ssh(wipe_cmd, allow_fail=False)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
        """
        Bounce one processor: stop it, roll its logs, switch it to
        ``new_version`` and start it again.

        While the processor is down, and again after it is back, the two
        other entries of ``self.processors`` must print a processed-records
        line. After the restart their stderr files are also searched for the
        "unable to decode subscription data: version=2" error.

        :param processor: the service to bounce
        :param upgrade_from: value passed to processor.set_upgrade_from();
                             None marks the second round of rolling bounces
        :param new_version: version to restart the processor with
        :param counter: number appended to the rolled log file names
        :raises Exception: if one of the other processors logged the
                           metadata decoding error
        """
        expected_version = self.get_version_string(new_version)
        grep_metadata_error = 'grep "org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2" '

        def await_progress(log_watch, owner):
            # wait for a processed-records line in the watched stdout file
            log_watch.wait_until(
                self.processed_msg,
                timeout_sec=60,
                err_msg="Never saw output '%s' on " % self.processed_msg +
                str(owner.account))

        def fail_on_metadata_error(owner, peer):
            # any grep hit in the peer's stderr file fails the test
            hits = list(
                owner.account.ssh_capture(
                    grep_metadata_error + peer.STDERR_FILE, allow_fail=True))
            if len(hits) > 0:
                raise Exception(
                    "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                )

        # first peer: first entry that is not `processor`;
        # second peer: last such entry (None when there is only one)
        peers = [p for p in self.processors if p != processor]
        peer_a = peers[0] if peers else None
        peer_b = peers[-1] if len(peers) > 1 else None

        node = processor.node
        node_a = peer_a.node
        node_b = peer_b.node

        # stop processor and wait for rebalance of others
        with node_a.account.monitor_log(peer_a.STDOUT_FILE) as watch_a, \
                node_b.account.monitor_log(peer_b.STDOUT_FILE) as watch_b:
            processor.stop_node(processor.node)
            await_progress(watch_a, node_a)
            await_progress(watch_b, node_b)

        # NOTE(review): the value returned by ssh_capture is discarded here;
        # confirm the exit status is really checked without consuming it.
        node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        # ".1-" marks the second round of rolling bounces (upgrade disabled),
        # ".0-" the first round
        round_prefix = ".1-" if upgrade_from is None else ".0-"
        self.roll_logs(processor, round_prefix + str(counter))

        self.set_version(processor, new_version)
        processor.set_upgrade_from(upgrade_from)

        with node.account.monitor_log(processor.STDOUT_FILE) as own_watch, \
                node.account.monitor_log(processor.LOG_FILE) as own_log_watch, \
                node_a.account.monitor_log(peer_a.STDOUT_FILE) as watch_a, \
                node_b.account.monitor_log(peer_b.STDOUT_FILE) as watch_b:
            processor.start_node(processor.node)

            own_log_watch.wait_until(
                expected_version,
                timeout_sec=60,
                err_msg="Could not detect Kafka Streams version " +
                new_version + " on " + str(node.account))

            await_progress(watch_a, node_a)
            fail_on_metadata_error(node_a, peer_a)

            await_progress(watch_b, node_b)
            fail_on_metadata_error(node_b, peer_b)

            await_progress(own_watch, node)

    def roll_logs(self, processor, roll_suffix):
        """
        Rename the processor's stdout, stderr, log and config files on its
        node by appending ``roll_suffix`` to each path.

        The files are moved in that order, one ``mv`` per file, each with
        allow_fail=False.
        """
        for attr in ("STDOUT_FILE", "STDERR_FILE", "LOG_FILE", "CONFIG_FILE"):
            account = processor.node.account
            path = getattr(processor, attr)
            account.ssh("mv " + path + " " + path + roll_suffix,
                        allow_fail=False)
예제 #7
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata version changed between two releases, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0.
    """

    def __init__(self, test_context):
        """
        Initialise the test with its default topics ('echo' and 'data', five
        partitions each) and an empty leader-tracking state.
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {name: {'partitions': 5} for name in ('echo', 'data')}
        # leader bookkeeping, read and updated by update_leader()
        self.leader_counter = {}
        self.leader = None

    def perform_broker_upgrade(self, to_version):
        """
        Bounce the Kafka nodes one at a time: stop the node, set its version
        to ``to_version`` and start it again.
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        kafka = self.kafka
        for broker in kafka.nodes:
            kafka.stop_node(broker)
            broker.version = KafkaVersion(to_version)
            kafka.start_node(broker)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Run a smoke test driver and one smoke test client while the brokers
        are bounced from <from_version> to <to_version>.

        Nothing is done when both versions are equal. At the end the driver
        output is searched for ALL-RECORDS-DELIVERED and the client output
        for SMOKE-TEST-CLIENT-CLOSED.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2

        def topic_spec():
            # every topic gets its own dict with the same settings
            return {'partitions': self.partitions,
                    'replication-factor': self.replication,
                    'configs': {"min.insync.replicas": self.isr}}

        topic_names = ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                       'avg', 'wcnt', 'tagg')
        self.topics = {name: topic_spec() for name in topic_names}

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        driver_account = self.driver.node.account
        driver_account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE,
                           allow_fail=False)
        # NOTE(review): the value returned by ssh_capture is discarded here;
        # confirm the exit status is really checked without consuming it.
        client_account = self.processor1.node.account
        client_account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
                                   allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <from_version>, and bounces them
        one-by-one (in random order) to <to_version>.

        Nothing is done when both versions are equal.
        """

        if from_version == to_version:
            return

        ctx = self.test_context

        self.zk = ZookeeperService(ctx, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(ctx, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(ctx, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # upgrade one-by-one via rolling bounce; the bounce number (1, 2, 3)
        # ends up in the rolled log file names
        random.shuffle(self.processors)
        for bounce_no, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, None, to_version, bounce_no)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as close_watch:
                p.stop()
                close_watch.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and
        upgrades them to <to_version> with two rounds of rolling bounces.

        In the first round each instance is restarted with "upgrade from" set
        to <from_version> minus its last two characters; in the second round
        "upgrade from" is None.
        """

        ctx = self.test_context

        self.zk = ZookeeperService(ctx, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(ctx, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(ctx, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        random.seed()

        # first rolling bounce (bounce numbers 1..3)
        random.shuffle(self.processors)
        first_round_size = len(self.processors)
        for bounce_no, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, from_version[:-2], to_version, bounce_no)

        # second rolling bounce (numbering continues where the first stopped)
        random.shuffle(self.processors)
        for bounce_no, p in enumerate(self.processors, first_round_size + 1):
            self.do_stop_start_bounce(p, None, to_version, bounce_no)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as close_watch:
                p.stop()
                close_watch.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances on TRUNK, and bounces them one-by-one
        (in random order) to "future version" via do_rolling_bounce().
        """

        ctx = self.test_context

        self.zk = ZookeeperService(ctx, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(ctx, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(ctx, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(ctx, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("") # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = list(self.processors)
        self.upgraded_processors = []

        # update_leader() picks the processor whose log has exactly one more
        # "Finished assignment" line than recorded here
        self.leader_counter.update(dict.fromkeys(self.processors, 2))
        self.update_leader()

        # reset the bookkeeping: 3 for the leader, 0 for everyone else
        self.leader_counter.update(dict.fromkeys(self.processors, 0))
        self.leader_counter[self.leader] = 3

        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        for bounce_no, p in enumerate(self.processors, 1):
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(p, bounce_no, current_generation)

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as close_watch:
                p.stop()
                close_watch.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    def update_leader(self):
        """
        Find the current group leader and store it in ``self.leader``.

        A processor counts as leader when its log file contains exactly one
        more "Finished assignment for group" line than recorded for it in
        ``self.leader_counter``; the leader's counter is then incremented.
        Up to 10 passes over ``self.processors`` are made, sleeping 5 seconds
        after every pass that finds no leader.

        :raises Exception: if more than one processor qualifies, or if no
                           leader was found in any pass
        """
        self.leader = None
        for _ in range(10):
            for p in self.processors:
                hits = list(p.node.account.ssh_capture("grep \"Finished assignment for group\" %s" % p.LOG_FILE, allow_fail=True))
                if len(hits) != self.leader_counter[p] + 1:
                    continue
                if self.leader is not None:
                    raise Exception("Could not uniquely identify leader")
                self.leader = p
                self.leader_counter[p] += 1

            if self.leader is not None:
                return
            time.sleep(5)

        raise Exception("Could not identify leader")

    def start_all_nodes_with(self, version):
        """
        Start processor1, processor2 and processor3 one after another on
        ``version``.

        Each processor is prepared with prepare_for() and started. After each
        start the new processor's log must show the "Kafka version" line, and
        every processor started so far must print
        "processed 100 records from topic" on stdout.
        """
        def await_version(log_watch, node):
            log_watch.wait_until("Kafka version : " + version,
                                 timeout_sec=60,
                                 err_msg="Could not detect Kafka Streams version " + version + " " + str(node.account))

        def await_progress(stdout_watch, node):
            stdout_watch.wait_until("processed 100 records from topic",
                                    timeout_sec=60,
                                    err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account))

        first, second, third = self.processor1, self.processor2, self.processor3

        # start first with <version>
        self.prepare_for(first, version)
        node1 = first.node
        with node1.account.monitor_log(first.STDOUT_FILE) as watch1, \
                node1.account.monitor_log(first.LOG_FILE) as log_watch:
            first.start()
            await_version(log_watch, node1)
            await_progress(watch1, node1)

        # start second with <version>
        self.prepare_for(second, version)
        node2 = second.node
        with node1.account.monitor_log(first.STDOUT_FILE) as watch1, \
                node2.account.monitor_log(second.STDOUT_FILE) as watch2, \
                node2.account.monitor_log(second.LOG_FILE) as log_watch:
            second.start()
            await_version(log_watch, node2)
            await_progress(watch1, node1)
            await_progress(watch2, node2)

        # start third with <version>
        self.prepare_for(third, version)
        node3 = third.node
        with node1.account.monitor_log(first.STDOUT_FILE) as watch1, \
                node2.account.monitor_log(second.STDOUT_FILE) as watch2, \
                node3.account.monitor_log(third.STDOUT_FILE) as watch3, \
                node3.account.monitor_log(third.LOG_FILE) as log_watch:
            third.start()
            await_version(log_watch, node3)
            await_progress(watch1, node1)
            await_progress(watch2, node2)
            await_progress(watch3, node3)

    @staticmethod
    def prepare_for(processor, version):
        """
        Remove the processor's PERSISTENT_ROOT directory on its node and
        configure the version it should run.

        When ``version`` equals str(DEV_VERSION) the processor is given the
        empty string, which selects TRUNK.
        """
        account = processor.node.account
        account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        # "" means: set to TRUNK
        target = "" if version == str(DEV_VERSION) else version
        processor.set_version(target)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter):
        """
        Bounce one processor: stop it, roll its stdout/stderr/log files,
        switch it to ``new_version`` and start it again.

        While the processor is down, and again after it is back, the two
        other entries of ``self.processors`` must print
        "processed 100 records from topic". After the restart their stderr
        files are also searched for the
        "unable to decode subscription data: version=2" error.

        :param processor: the service to bounce
        :param upgrade_from: value passed to processor.set_upgrade_from();
                             None marks the second round of rolling bounces
        :param new_version: version to restart with; str(DEV_VERSION) is
                            mapped to "" (TRUNK)
        :param counter: number appended to the rolled file names
        :raises Exception: if one of the other processors logged the
                           metadata decoding error
        """
        grep_metadata_error = 'grep "org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2" '

        def await_progress(stdout_watch, owner):
            stdout_watch.wait_until("processed 100 records from topic",
                                    timeout_sec=60,
                                    err_msg="Never saw output 'processed 100 records from topic' on" + str(owner.account))

        def fail_on_metadata_error(owner, peer):
            # any grep hit in the peer's stderr file fails the test
            hits = list(owner.account.ssh_capture(grep_metadata_error + peer.STDERR_FILE, allow_fail=True))
            if len(hits) > 0:
                raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

        # first peer: first entry that is not `processor`;
        # second peer: last such entry (None when there is only one)
        peers = [p for p in self.processors if p != processor]
        peer_a = peers[0] if peers else None
        peer_b = peers[-1] if len(peers) > 1 else None

        node = processor.node
        node_a = peer_a.node
        node_b = peer_b.node

        # stop processor and wait for rebalance of others
        with node_a.account.monitor_log(peer_a.STDOUT_FILE) as watch_a, \
                node_b.account.monitor_log(peer_b.STDOUT_FILE) as watch_b:
            processor.stop()
            await_progress(watch_a, node_a)
            await_progress(watch_b, node_b)

        # NOTE(review): the value returned by ssh_capture is discarded here;
        # confirm the exit status is really checked without consuming it.
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)

        # ".1-" marks the second round of rolling bounces (upgrade disabled),
        # ".0-" the first round
        round_prefix = ".1-" if upgrade_from is None else ".0-"
        for attr in ("STDOUT_FILE", "STDERR_FILE", "LOG_FILE"):
            path = getattr(processor, attr)
            node.account.ssh("mv " + path + " " + path + round_prefix + str(counter), allow_fail=False)

        # "" means: set to TRUNK
        processor.set_version("" if new_version == str(DEV_VERSION) else new_version)
        processor.set_upgrade_from(upgrade_from)

        with node.account.monitor_log(processor.STDOUT_FILE) as own_watch, \
                node.account.monitor_log(processor.LOG_FILE) as own_log_watch, \
                node_a.account.monitor_log(peer_a.STDOUT_FILE) as watch_a, \
                node_b.account.monitor_log(peer_b.STDOUT_FILE) as watch_b:
            processor.start()

            own_log_watch.wait_until("Kafka version : " + new_version,
                                     timeout_sec=60,
                                     err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account))

            await_progress(watch_a, node_a)
            fail_on_metadata_error(node_a, peer_a)

            await_progress(watch_b, node_b)
            fail_on_metadata_error(node_b, peer_b)

            await_progress(own_watch, node)

    def do_rolling_bounce(self, processor, counter, current_generation):
        """
        Bounce one processor onto the "future_version" upgrade path and verify version probing.

        The processor is stopped, its stdout/stderr/log files are rotated using ``counter`` as
        suffix, and it is restarted after ``set_upgrade_to("future_version")``. The logs of the
        bounced processor and of the two other processors in ``self.processors`` are then checked
        for the expected sequence: a first rebalance, the version probing exchange between the
        bounced processor and the group leader, and a second rebalance triggered by the probing.

        This method expects ``self.processors`` to contain two processors besides ``processor``.

        :param processor: the processor to bounce; it is moved from ``self.old_processors`` to
                          ``self.upgraded_processors``
        :param counter: suffix appended to the rotated stdout/stderr/log file names
        :param current_generation: consumer group generation before the bounce
        :return: the consumer group generation after the bounce (two rebalances later)
        :raises Exception: if none of the three monitored processors is ``self.leader``
        """
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # Open the monitors on the other two processors before stopping anything so that the
        # rebalances caused by the bounce fall inside the monitored part of their logs.
        with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                # ssh_capture only checks the remote exit status once its output iterator has been
                # exhausted, so the result must be drained for allow_fail=False to actually fail
                # the test when the clean-shutdown marker is missing.
                list(node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False))

                # Rotate the output files of the stopped instance so the restarted one writes fresh files.
                for path in (processor.STDOUT_FILE, processor.STDERR_FILE, processor.LOG_FILE):
                    node.account.ssh("mv " + path + " " + path + "." + str(counter), allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                    monitors = {
                        processor: log_monitor,
                        first_other_processor: first_other_monitor,
                        second_other_processor: second_other_monitor,
                    }

                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    # restarting the processor triggers the first rebalance
                    current_generation += 1

                    log_monitor.wait_until("Kafka version : " + str(DEV_VERSION),
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account))
                    # NOTE(review): overrides the monitor's start offset before the next search,
                    # presumably so the config dump is looked up from (almost) the beginning of
                    # the fresh log file -- confirm why 5 rather than 0.
                    log_monitor.offset = 5
                    log_monitor.wait_until(r"partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                                           timeout_sec=60,
                                           err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account))

                    joined_msg = "Successfully joined group with generation " + str(current_generation)
                    for p in (processor, first_other_processor, second_other_processor):
                        monitors[p].wait_until(joined_msg,
                                               timeout_sec=60,
                                               err_msg="Never saw output '" + joined_msg + "' on" + str(p.node.account))

                    # Bouncing the leader forces a new leader to be determined; otherwise the
                    # current leader has been through one more rebalance.
                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[self.leader] += 1

                    leader_monitor = monitors.get(self.leader)
                    if leader_monitor is None:
                        raise Exception("Could not identify leader.")

                    leader_monitor.wait_until("Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                                              timeout_sec=60,
                                              err_msg="Could not detect 'version probing' attempt at leader " + str(self.leader.node.account))

                    if self.old_processors:
                        # Some members still run the old version: the bounced processor must downgrade.
                        log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                                               timeout_sec=60,
                                               err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account))
                    else:
                        # Last processor bounced: every member may now move to metadata version 5.
                        log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                                               timeout_sec=60,
                                               err_msg="Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account))
                        upgrade_msg = "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance."
                        first_other_monitor.wait_until(upgrade_msg,
                                                       timeout_sec=60,
                                                       err_msg="Never saw output 'Upgrade metadata to version 5' on" + str(first_other_node.account))
                        second_other_monitor.wait_until(upgrade_msg,
                                                        timeout_sec=60,
                                                        err_msg="Never saw output 'Upgrade metadata to version 5' on" + str(second_other_node.account))

                    log_monitor.wait_until("Version probing detected. Triggering new rebalance.",
                                           timeout_sec=60,
                                           err_msg="Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account))

                    # version probing should trigger second rebalance
                    current_generation += 1

                    joined_msg = "Successfully joined group with generation " + str(current_generation)
                    for p in self.processors:
                        monitors[p].wait_until(joined_msg,
                                               timeout_sec=60,
                                               err_msg="Never saw output '" + joined_msg + "' on" + str(p.node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[self.leader] += 1

                    # While any old-version processor remains, nobody may have announced an
                    # upgrade of its subscription metadata yet.
                    if self.old_processors:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation

    def verify_metadata_no_upgraded_yet(self):
        """
        Fail if any processor has already announced an upgrade of its subscription metadata.

        Greps the log file of every processor in ``self.processors`` for the message a group
        member logs when it decides to upgrade its subscription metadata version to 5.

        :raises Exception: if the message is present in any processor's log file
        """
        upgrade_msg = "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance."
        for p in self.processors:
            # allow_fail=True: a grep without matches is the expected outcome here; the output is
            # collected into a list so that any matching line is detected.
            found = list(p.node.account.ssh_capture("grep \"" + upgrade_msg + "\" " + p.LOG_FILE, allow_fail=True))
            if found:
                raise Exception("Kafka Streams failed with 'group member upgraded to metadata 5 too early'")