Example #1
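For context: each excerpt below omits its import section. A plausible header for this first test, assuming the usual layout of Kafka's system-test tree, would be the following; later excerpts additionally rely on names such as ZookeeperService, KafkaService, signal, randint, wait_until, matrix, and parametrize.

import time

from ducktape.mark.resource import cluster

from kafkatest.tests.kafka_test import KafkaTest
from kafkatest.services.streams import StreamsSmokeTestDriverService, \
    StreamsSmokeTestJobRunnerService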
class StreamsBounceTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsBounceTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 2 },
            'data' : { 'partitions': 5, 'replication-factor': 2 },
            'min' : { 'partitions': 5, 'replication-factor': 2 },
            'max' : { 'partitions': 5, 'replication-factor': 2 },
            'sum' : { 'partitions': 5, 'replication-factor': 2 },
            'dif' : { 'partitions': 5, 'replication-factor': 2 },
            'cnt' : { 'partitions': 5, 'replication-factor': 2 },
            'avg' : { 'partitions': 5, 'replication-factor': 2 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 2 },
            'tagg' : { 'partitions': 5, 'replication-factor': 2 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start a smoke test client, then abort (kill -9) and restart it a few times.
        Ensure that all records are delivered.
        """

        self.driver.start()

        self.processor1.start()

        time.sleep(15)

        self.processor1.abortThenRestart()

        time.sleep(15)

        # enable this after we add change log partition replicas
        #self.kafka.signal_leader("data")

        #time.sleep(15);

        self.processor1.abortThenRestart()

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
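The abortThenRestart() call above is what injects the failure; its definition lives in the service class, which this excerpt does not show. A minimal sketch of what such a method might look like, assuming the usual ducktape service conventions (stop_node/start_node, with a clean-shutdown flag deciding between SIGTERM and kill -9):

    def abortThenRestart(self):
        # Hypothetical sketch: hard-kill the streams process (kill -9), then bring it back.
        for node in self.nodes:
            self.stop_node(node, clean_shutdown=False)
        for node in self.nodes:
            self.start_node(node)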
Example #2
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=9)
    def test_streams(self):
        """
        Start a few smoke test clients, then repeatedly start a new one and cleanly stop a running one.
        Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """

        self.driver.start()

        self.processor1.start()
        self.processor2.start()

        time.sleep(15)

        self.processor3.start()
        self.processor1.stop()

        time.sleep(15)

        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        self.processor2.stop()
        self.processor3.stop()
        self.processor4.stop()

        node = self.driver.node
        node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
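A quick sanity check on the @cluster budget: each service instance occupies one cluster node, so this test needs 1 ZooKeeper + 3 brokers + 1 driver + 4 processors = 9 nodes, matching @cluster(num_nodes=9). Example #3, next, drops to num_brokers=2 and accordingly declares @cluster(num_nodes=8).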
Example #3
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=2, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=8)
    def test_streams(self):
        """
        Start a few smoke test clients, then repeatedly start a new one and cleanly stop a running one.
        Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """

        self.driver.start()

        self.processor1.start()
        self.processor2.start()

        time.sleep(15)

        self.processor3.start()
        self.processor1.stop()

        time.sleep(15)

        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        self.processor2.stop()
        self.processor3.stop()
        self.processor4.stop()

        node = self.driver.node
        node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
Example #4
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """
    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            '__consumer_offsets': {
                'partitions': 50,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            }
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics) - 1)
        topic = list(self.topics.keys())[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        # Clean shutdowns use SIGTERM; every other failure mode is a hard kill.
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # Note: this signals only num_failures - 1 brokers, leaving at least one untouched.
        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()
        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" %
                                          self.driver.STDOUT_FILE,
                                          allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type,
                                sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type,
                                         sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts.
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3.
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
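fail_broker_type() and fail_many_brokers() above depend on a module-level failures dispatch table and a signal_node() helper that the excerpt does not include. A rough sketch of what they might look like; the names, signatures, and KafkaService calls below are illustrative assumptions, not confirmed by the excerpt:

import signal

def signal_node(test, node, sig):
    # Illustrative helper: deliver a signal to the broker process on the given node.
    test.kafka.signal_node(node, sig)

def clean_shutdown(test, topic, broker_type):
    # Illustrative: SIGTERM the broker leading the topic, or the controller.
    node = test.kafka.leader(topic) if broker_type == "leader" else test.kafka.controller()
    signal_node(test, node, signal.SIGTERM)

def hard_shutdown(test, topic, broker_type):
    # Same target selection, but kill -9 instead of an orderly shutdown.
    node = test.kafka.leader(topic) if broker_type == "leader" else test.kafka.controller()
    signal_node(test, node, signal.SIGKILL)

failures = {
    "clean_shutdown": clean_shutdown,
    "hard_shutdown": hard_shutdown,
    # "clean_bounce" / "hard_bounce" would stop and then restart the same node.
}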
Example #5
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)

        processor = StreamsSmokeTestJobRunnerService(self.test_context,
                                                     self.kafka)

        with self.driver.node.account.monitor_log(
                self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node))

            connected_message = "Discovered group coordinator"
            with processor.node.account.monitor_log(
                    processor.LOG_FILE) as log_monitor:
                with processor.node.account.monitor_log(
                        processor.STDOUT_FILE) as stdout_monitor:
                    self.perform_broker_upgrade(to_version)

                    log_monitor.wait_until(
                        connected_message,
                        timeout_sec=120,
                        err_msg=("Never saw output '%s' on " %
                                 connected_message) +
                        str(processor.node.account))

                    stdout_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on " % self.processed_msg
                        + str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all
            # records for the verification step so this timeout is set to
            # 6 minutes (360 seconds) for consuming of verification records
            # and a very conservative additional 2 minutes (120 seconds) to process
            # the records in the verification step
            driver_monitor.wait_until(
                r'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                timeout_sec=480,
                err_msg="Never saw output '%s' on " %
                'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' +
                str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                           processor.STDOUT_FILE,
                                           allow_fail=False)
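Two names in this excerpt are defined elsewhere: confirm_topics_on_all_brokers() appears in Examples #8 and #9 below, while self.processed_msg is presumably a grep pattern for the processor's progress line, set somewhere in the test's setup. The exact pattern is an assumption; something along these lines:

        # Hypothetical: pattern grepped from the processor's stdout to detect progress.
        self.processed_msg = "processed [0-9]* records"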
Example #6
class StreamsUpgradeTest(Test):
    """
    Tests rolling upgrades and downgrades of the Kafka Streams library.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
        }

    def perform_streams_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling streams upgrade")

        # get the node running the streams app
        node = self.processor1.node
        self.processor1.stop()

        # change its version. This will automatically make it pick up a different
        # JAR when it starts again
        node.version = KafkaVersion(to_version)
        self.processor1.start()

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_1),
                 to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_10_2),
                 to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_11_0),
                 to_version=str(LATEST_0_10_2))
    @parametrize(from_version=str(DEV_BRANCH), to_version=str(LATEST_0_10_2))
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade (or downgrade) of the
        Kafka Streams library and ensure that all records are delivered.

        Note that, just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar into the right S3 bucket as shown in base.sh.
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_streams_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                         self.driver.STDOUT_FILE,
                         allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker. 
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                         self.driver.STDOUT_FILE,
                         allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)
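The version constants used by the @parametrize decorators above (LATEST_0_10_1, LATEST_0_10_2, LATEST_0_11_0, DEV_BRANCH) and the KafkaVersion wrapper are not imported in this excerpt; they normally come from kafkatest's version module:

from kafkatest.version import LATEST_0_10_1, LATEST_0_10_2, LATEST_0_11_0, \
    DEV_BRANCH, KafkaVersion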
Example #7
    def test_streams(self, processing_guarantee, crash, metadata_quorum=quorum.zk):
        processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)
        processor2 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)
        processor3 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)

        with processor1.node.account.monitor_log(processor1.STDOUT_FILE) as monitor1:
            processor1.start()
            monitor1.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=60,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor1.node.account)
                                )

            self.driver.start()

            monitor1.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor1.node.account)
                                )

            # make sure we're not already done processing (which would invalidate the test)
            self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False)

            processor1.stop_nodes(not crash)

        with processor2.node.account.monitor_log(processor2.STDOUT_FILE) as monitor2:
            processor2.start()
            monitor2.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=120,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor2.node.account)
                                )
            monitor2.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor2.node.account)
                                )

        # make sure we're not already done processing (which would invalidate the test)
        self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False)

        processor2.stop_nodes(not crash)

        with processor3.node.account.monitor_log(processor3.STDOUT_FILE) as monitor3:
            processor3.start()
            monitor3.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=120,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor3.node.account)
                                )
            # there should still be some data left for this processor to work on.
            monitor3.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor3.node.account)
                                )

        self.driver.wait()
        self.driver.stop()

        processor3.stop()

        if crash and processing_guarantee == 'at_least_once':
            self.driver.node.account.ssh("grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        else:
            self.driver.node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
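This excerpt starts at the test method, so its decorators are cropped out. Given the parameters the method accepts, the surrounding annotations plausibly look like the sketch below; the node count and matrix values are assumptions, not taken from the excerpt:

    @cluster(num_nodes=8)
    @matrix(processing_guarantee=["at_least_once", "exactly_once"],
            crash=[True, False])
    def test_streams(self, processing_guarantee, crash, metadata_quorum=quorum.zk):
        ...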
Example #8
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """

    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2}},
            'data' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} },
            'min' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'max' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} },
            'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} },
            '__consumer_offsets' : { 'partitions': 50, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} }
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics) - 1)
        topic = list(self.topics.keys())[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        # Clean shutdowns use SIGTERM; every other failure mode is a hard kill.
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # Note: this signals only num_failures - 1 brokers, leaving at least one untouched.
        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        for node in self.kafka.nodes:
            match_count = 0
            # kafka.list_topics() returns a Python generator, so values are fetched lazily.
            # We therefore can't compare the sets directly; we must iterate over what's returned.
            topic_list_generator = self.kafka.list_topics(node=node)
            for topic in topic_list_generator:
                if topic in expected_topic_set:
                    match_count += 1

            if len(expected_topic_set) != match_count:
                return False

        return True

    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication, zk=self.zk, topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        
        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            output_streams = self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE, allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)
            
        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture("grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system() 

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts.
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3.
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system() 

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system() 

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
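As an aside, confirm_topics_on_all_brokers() drains the lazy generator by hand; an equivalent, more compact check, sketched under the assumption that list_topics() yields bare topic names, would be:

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        # set() consumes the generator returned by list_topics(); issubset then
        # checks that every expected topic is visible on every broker.
        return all(expected_topic_set.issubset(set(self.kafka.list_topics(node=node)))
                   for node in self.kafka.nodes)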
Example #9
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """
    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            '__consumer_offsets': {
                'partitions': 50,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            }
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics) - 1)
        topic = list(self.topics.keys())[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        # Clean shutdowns use SIGTERM; every other failure mode is a hard kill.
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # Note: this signals only num_failures - 1 brokers, leaving at least one untouched.
        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        for node in self.kafka.nodes:
            match_count = 0
            # kafka.list_topics() returns a Python generator, so values are fetched lazily.
            # We therefore can't compare the sets directly; we must iterate over what's returned.
            topic_list_generator = self.kafka.list_topics(node=node)
            for topic in topic_list_generator:
                if topic in expected_topic_set:
                    match_count += 1

            if len(expected_topic_set) != match_count:
                return False

        return True

    def setup_system(self, start_processor=True, num_threads=3):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka, "at_least_once", num_threads)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" %
                                          self.driver.STDOUT_FILE,
                                          allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            broker_type=["leader", "controller"],
            num_threads=[1, 3],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type,
                                sleep_time_secs, num_threads):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received.
        Record whether records are delivered.
        We also run a single-threaded streams client to make sure all partitions can be reassigned in
        the next rebalance generation, verifying that the partition-lost event is triggered correctly.
        """
        self.setup_system(num_threads=num_threads)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type,
                                         sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts.
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3.
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record whether records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record whether records are delivered.
        """

        # Set min.insync.replicas to 1 because in the last stage of the test there is only one broker left.
        # Otherwise the last offset commit would never succeed; it would time out, potentially taking longer
        # than the duration passed to the close method of the Kafka Streams client.
        self.topics['__consumer_offsets'] = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {
                "min.insync.replicas": 1
            }
        }

        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
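For reference, system tests like these are normally launched through ducktape; a typical invocation, with the file path assumed to match Kafka's usual tests/kafkatest layout, looks like:

ducktape tests/kafkatest/tests/streams/streams_broker_bounce_test.py::StreamsBrokerBounceTest.test_broker_type_bounce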