Пример #1
0
    def start(self, add_principals=""):
        """Start the service and block until every broker is registered in ZooKeeper.

        Opens the ports for the client and inter-broker security protocols, calls
        ``start_minikdc`` and ``_ensure_zk_chroot``, starts the service nodes,
        waits for all nodes to report as registered and finally creates every
        topic listed in ``self.topics``.

        :param add_principals: forwarded unchanged to ``start_minikdc``.
        :raises: the timeout error raised by ``wait_until`` when not all brokers
            are registered within 30 seconds.
        """
        self.open_port(self.security_protocol)
        self.open_port(self.interbroker_security_protocol)

        self.start_minikdc(add_principals)
        self._ensure_zk_chroot()

        Service.start(self)

        self.logger.info("Waiting for brokers to register at ZK")

        # wait_until reports the timeout itself, so no separate retry counter
        # (which was never decremented) is needed to detect failure.
        wait_until(lambda: all(self.is_registered(node) for node in self.nodes),
                   timeout_sec=30, backoff_sec=1,
                   err_msg="Kafka servers didn't register at ZK within 30 seconds")

        # Create topics if necessary
        if self.topics is not None:
            for topic, topic_cfg in self.topics.items():
                if topic_cfg is None:
                    topic_cfg = {}

                # The topic name travels inside the config dict handed to create_topic
                topic_cfg["topic"] = topic
                self.create_topic(topic_cfg)
 def check_producing(self):
     """Wait until the producer has acked more than five further messages.

     The acked count is sampled once on entry; the wait succeeds as soon as
     ``self.producer.num_acked`` exceeds that sample by more than 5, and is
     given 30 seconds to do so.
     """
     acked_on_entry = self.producer.num_acked

     def still_producing():
         return self.producer.num_acked > acked_on_entry + 5

     wait_until(still_producing, timeout_sec=30,
                err_msg="Expected producer to still be producing.")
Пример #3
0
    def stop_node(self, node, clean_shutdown=True):
        """Signal every process listed by ``self.pids(node)`` and wait until none remain.

        :param node: node whose processes are signalled.
        :param clean_shutdown: send SIGTERM when true, SIGKILL otherwise.
        """
        running_pids = self.pids(node)
        if clean_shutdown:
            stop_signal = signal.SIGTERM
        else:
            stop_signal = signal.SIGKILL

        for running_pid in running_pids:
            node.account.signal(running_pid, stop_signal, allow_fail=False)

        def no_process_left():
            # Re-query the pids on every poll instead of reusing the initial list
            return len(self.pids(node)) == 0

        wait_until(no_process_left, timeout_sec=20, err_msg="Kafka node failed to stop")
Пример #4
0
    def start_node(self, node):
        """Upload the mirror maker configuration to ``node``, launch it and wait until it is alive.

        Creates the persistent and log directories, derives the client security
        config from the source cluster, writes the consumer, producer and log4j
        property files, runs the start command and then polls ``self.alive(node)``
        for up to 10 seconds.
        """
        for remote_dir in (MirrorMaker.PERSISTENT_ROOT, MirrorMaker.LOG_DIR):
            node.account.ssh("mkdir -p %s" % remote_dir, allow_fail=False)

        self.security_config = self.source.security_config.client_config()
        self.security_config.setup_node(node)

        # Consumer properties (source cluster): rendered template plus security settings
        consumer_cfg = self.render("mirror_maker_consumer.properties") + str(self.security_config)
        node.account.create_file(MirrorMaker.CONSUMER_CONFIG, consumer_cfg)
        self.logger.info("Mirrormaker consumer props:\n" + consumer_cfg)

        # Producer properties (target cluster): rendered template plus security settings
        producer_cfg = self.render('mirror_maker_producer.properties') + str(self.security_config)
        self.logger.info("Mirrormaker producer props:\n" + producer_cfg)
        node.account.create_file(MirrorMaker.PRODUCER_CONFIG, producer_cfg)

        # Logging properties
        log4j_cfg = self.render('tools_log4j.properties', log_file=MirrorMaker.LOG_FILE)
        node.account.create_file(MirrorMaker.LOG4J_CONFIG, log4j_cfg)

        # Launch mirror maker and wait for it to come up
        launch_cmd = self.start_cmd(node)
        self.logger.debug("Mirror maker command: %s", launch_cmd)
        node.account.ssh(launch_cmd, allow_fail=False)

        def mirror_maker_alive():
            return self.alive(node)

        wait_until(mirror_maker_alive, timeout_sec=10, backoff_sec=.5,
                   err_msg="Mirror maker took to long to start.")
        self.logger.debug("Mirror maker is alive")
Пример #5
0
    def test_log_compaction(self, security_protocol='PLAINTEXT'):
        """Start Kafka and the log compaction tool, then wait for the verifier to finish.

        :param security_protocol: used for both the client and the inter-broker
            protocol when starting Kafka, and passed to the compaction tool.
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_test_log_compaction_tool(security_protocol)

        def verification_finished():
            # NOTE(review): is_done is read as an attribute/property here — confirm
            # it is not a plain method, otherwise this would always be truthy.
            return self.compaction_verifier.is_done

        # Verify that compacted data verification completed in LogCompactionTester
        wait_until(verification_finished, timeout_sec=180,
                   err_msg="Timed out waiting to complete compaction")
    def test_lifecycle(self, security_protocol, new_consumer=True):
        """Check that console consumer starts/stops properly, and that we are capturing log output.

        :param security_protocol: applied to both the Kafka service and the consumer.
        :param new_consumer: stored on the consumer as ``new_consumer`` before it starts.
        """
        self.kafka.security_protocol = security_protocol
        self.kafka.start()

        self.consumer.security_protocol = security_protocol
        self.consumer.new_consumer = new_consumer

        started_at = time.time()
        self.consumer.start()
        consumer_node = self.consumer.nodes[0]

        def consumer_alive():
            return self.consumer.alive(consumer_node)

        wait_until(consumer_alive, timeout_sec=10, backoff_sec=.2,
                   err_msg="Consumer was too slow to start")
        elapsed = time.time() - started_at
        self.logger.info("consumer started in %s seconds " % str(elapsed))

        def log_file_present():
            return file_exists(consumer_node, ConsoleConsumer.LOG_FILE)

        # The log file must appear and contain at least one line
        wait_until(log_file_present, timeout_sec=10,
                   err_msg="Timed out waiting for logging to start.")
        assert line_count(consumer_node, ConsoleConsumer.LOG_FILE) > 0

        # Nothing should have been written to the captured stdout
        assert line_count(consumer_node, ConsoleConsumer.STDOUT_CAPTURE) == 0

        self.consumer.stop_node(consumer_node)
Пример #7
0
    def start_node(self, node):
        """Install and start MySQL on ``node``, then grant root access from any host.

        Steps, in order: create the data directory, upload ``my.cnf``, run
        ``mysql_install_db``, start the server, wait up to 5 seconds for
        ``self.alive(node)``, run ``mysql_config_editor`` and issue the GRANT query.
        """
        mkdir_cmd = "mkdir -p %s" % self.datadir
        self.logger.debug("Attpempting to create datadir %s", self.datadir)
        node.account.ssh(mkdir_cmd)

        config_path = "%s/my.cnf" % self.basedir
        config_body = self.render('my.cnf', basedir=self.basedir, datadir=self.datadir,
                                  pidfile=self.pidfile)
        node.account.create_file(config_path, config_body)

        install_cmd = "%s/scripts/mysql_install_db --basedir=%s --datadir=%s" % (
            self.basedir, self.basedir, self.datadir)
        self.logger.debug("Attempting to install mysql at %s with data at %s", self.basedir, self.datadir)
        node.account.ssh(install_cmd)

        server_start_cmd = "pushd %s; MYSQL_HOME=%s support-files/mysql.server start; popd" % (
            self.basedir, self.basedir)
        self.logger.debug("Attempting to start MysqlService on %s with command: %s",
                          str(node.account), server_start_cmd)
        node.account.ssh(server_start_cmd)

        def mysql_alive():
            return self.alive(node)

        wait_until(mysql_alive, timeout_sec=5, backoff_sec=.5,
                   err_msg="Mysql takes too long to start.")
        self.logger.debug("Mysql is successfully started.")

        editor_cmd = "%s/bin/mysql_config_editor set --host=%s --warn=FALSE" % (
            self.basedir, node.account.hostname)
        node.account.ssh(editor_cmd)

        grant_query = "GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY '' WITH GRANT OPTION"
        grant_cmd = "%s/bin/mysql -h %s -u root -s -N -e \"%s\"" % (
            self.basedir, node.account.hostname, grant_query)
        node.account.ssh(grant_cmd)
Пример #8
0
 def test_network_partition_fault(self):
     """
     Test that the network partition fault results in a true network partition between nodes.

     Sets up three agent nodes, creates a partition fault that isolates node 0
     from nodes 1 and 2, waits until the isolation is observable and then
     checks that nodes on the same side can still reach each other.

     Raises RuntimeError when a node that should stay reachable is not.
     """
     self.set_up_trogdor(3)
     spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                      [[self.agent_nodes[0]], self.agent_nodes[1:]])
     partitions = spec.message["partitions"]
     assert 2 == len(partitions)
     assert [self.agent_nodes[0].name] == partitions[0]
     assert [self.agent_nodes[1].name, self.agent_nodes[2].name] == partitions[1]
     self.trogdor.create_task("partition0", spec)

     def verify_nodes_partitioned():
         """Return True only when node 0 is cut off from the other side in the checked directions."""
         if node_is_reachable(self.agent_nodes[0], self.agent_nodes[1]):
             return False
         if node_is_reachable(self.agent_nodes[1], self.agent_nodes[0]):
             return False
         if node_is_reachable(self.agent_nodes[2], self.agent_nodes[0]):
             return False
         return True

     # Hand wait_until the function itself so that it is *called* on every poll.
     # Wrapping it in a lambda that merely returns the function object would
     # always be truthy and the partition would never actually be verified.
     wait_until(verify_nodes_partitioned,
                timeout_sec=10, backoff_sec=.2, err_msg="Failed to verify that the nodes were partitioned.")
     if not node_is_reachable(self.agent_nodes[0], self.agent_nodes[0]):
         raise RuntimeError("Node 0 must be reachable from itself.")
     if not node_is_reachable(self.agent_nodes[1], self.agent_nodes[2]):
         raise RuntimeError("Node 2 must be reachable from node 1.")
     if not node_is_reachable(self.agent_nodes[2], self.agent_nodes[1]):
         raise RuntimeError("Node 1 must be reachable from node 2.")
Пример #9
0
    def assert_consume(self, client_id, test_state, topic, num_messages=5, timeout_sec=60):
        """Start a consumer and wait until it has consumed at least ``num_messages``.

        :param client_id: passed to ``get_consumer``.
        :param test_state: label interpolated into the timeout message.
        :param topic: passed to ``get_consumer``.
        :param num_messages: minimum value ``total_consumed()`` must reach.
        :param timeout_sec: how long to wait before giving up.
        """
        consumer = self.get_consumer(client_id, topic, num_messages)
        consumer.start()

        timeout_msg = "At %s streams did not process messages in %s seconds " % (test_state, timeout_sec)

        def consumed_enough():
            return consumer.total_consumed() >= num_messages

        wait_until(consumed_enough, timeout_sec=timeout_sec, err_msg=timeout_msg)
Пример #10
0
 def start_producer(self, max_messages, acks, timeout):
     """Create and start a single-node VerifiableProducer and wait for its messages to be acked.

     The wait is satisfied immediately when ``acks`` is 0; otherwise it needs
     ``self.producer.num_acked`` to reach ``max_messages`` within ``timeout`` seconds.
     """
     # This will produce to kafka cluster
     acked_baseline = 0
     self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka,
                                        topic=TOPIC, throughput=1000, acks=acks,
                                        max_messages=max_messages)
     self.producer.start()

     def production_complete():
         if acks == 0:
             return True
         return self.producer.num_acked >= acked_baseline + max_messages

     wait_until(production_complete, timeout_sec=timeout,
                err_msg="Timeout awaiting messages to be produced and acked")
Пример #11
0
    def test_broker_failure(self, clean_shutdown, enable_autocommit):
        """Signal one broker while consuming and check the consumer carries on without a rebalance.

        :param clean_shutdown: send SIGTERM to the broker when true, SIGKILL otherwise.
        :param enable_autocommit: forwarded to the consumer setup; when false the
            last committed offset is checked against the consumed position as well.
        """
        partition = TopicPartition(self.STOPIC, 0)

        consumer = self._setup_consumer(self.STOPIC, enable_autocommit=enable_autocommit)
        producer = self._setup_producer(self.STOPIC)

        producer.start()
        consumer.start()
        self._await_all_members(consumer)

        rebalances_before = consumer.num_rebalances()

        # shutdown one of the brokers
        # TODO: we need a way to target the coordinator instead of picking arbitrarily
        victim = self.kafka.nodes[0]
        if clean_shutdown:
            broker_signal = signal.SIGTERM
        else:
            broker_signal = signal.SIGKILL
        self.kafka.signal_node(victim, broker_signal)

        # ensure that the consumers do some work after the broker failure
        consumed_at_failure = consumer.total_consumed()

        def consumed_more():
            return consumer.total_consumed() > consumed_at_failure + 1000

        wait_until(consumed_more, timeout_sec=20,
                   err_msg="Timed out waiting for additional records to be consumed after first consumer failed")

        # verify that there were no rebalances on failover
        assert rebalances_before == consumer.num_rebalances(), "Broker failure should not cause a rebalance"

        consumer.stop_all()

        # if the total records consumed matches the current position, we haven't seen any duplicates
        assert consumer.current_position(partition) == consumer.total_consumed(), \
            "Total consumed records did not match consumed position"

        # the committed offset can only be verified when autocommit is off
        if enable_autocommit:
            return
        assert consumer.last_commit(partition) == consumer.current_position(partition), \
            "Last committed offset did not match last consumed position"
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True):
        """Run the file source and sink connectors on the distributed cluster and check the output.

        Data appended to the input file must show up in the sink output, both
        before and after the Connect cluster is restarted.

        :param converter: converter class name, stored as both the key and the
            value converter template parameter; must not be None.
        :param schemas: stored as the ``schemas`` template parameter.
        """
        # Identity comparison is the correct None check
        assert converter is not None, "converter type must be set"
        # Template parameters
        self.key_converter = converter
        self.value_converter = converter
        self.schemas = schemas

        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))

        self.cc.start()

        self.logger.info("Creating connectors")
        for connector_props in [self.render("connect-file-source.properties"), self.render("connect-file-sink.properties")]:
            # Parse "key=value" lines, skipping blank lines and '#' comments
            stripped_lines = (line.strip() for line in connector_props.split('\n'))
            connector_config = dict(entry.split('=', 1) for entry in stripped_lines
                                    if entry and not entry.startswith('#'))
            self.cc.create_connector(connector_config)

        # Generating data on the source node should generate new records and create new output on the sink node. Timeouts
        # here need to be more generous than they are for standalone mode because a) it takes longer to write configs,
        # do rebalancing of the group, etc, and b) without explicit leave group support, rebalancing takes awhile
        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.FIRST_INPUTS) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT_LIST), timeout_sec=120, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.cc.restart()

        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.SECOND_INPUTS) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST), timeout_sec=120, err_msg="Sink output file never converged to the same state as the input file")
Пример #13
0
    def wait_for_http_service(self, port, headers, timeout=20, path='/'):
        """Wait until this service node is available/awake.

        Polls ``self._can_ping_url`` against
        ``http://<externally_routable_ip>:<port><path>`` every 0.25 seconds.

        :param port: port used to build the URL.
        :param headers: forwarded to ``_can_ping_url``.
        :param timeout: number of seconds to keep trying.
        :param path: URL path appended after the port.
        """
        url = "http://%s:%s%s" % (self.externally_routable_ip, str(port), path)

        failure_msg = ("Timed out trying to contact service on %s. " % url) + \
            "Either the service failed to start, or there is a problem with the url."

        def service_reachable():
            return self._can_ping_url(url, headers)

        wait_until(service_reachable, timeout_sec=timeout, backoff_sec=.25, err_msg=failure_msg)
Пример #14
0
 def move_start_offset(self):
     """We move the start offset of the topic by writing really old messages
     and waiting for them to be cleaned up.
     """
     ack_timeout_sec = 30
     seeder = VerifiableProducer(self.test_context, 1, self.kafka, self.topic,
                                 throughput=-1, enable_idempotence=True,
                                 create_time=1000)
     seeder.start()

     def first_ack_received():
         return seeder.num_acked > 0

     wait_until(first_ack_received,
                timeout_sec=ack_timeout_sec,
                err_msg="Failed to get an acknowledgement for %ds" % ack_timeout_sec)

     # Keep producing for 8 seconds so the topic is seeded with messages that
     # will be deleted. The 8 seconds matters: given the configured log roll
     # time and retention check interval, 2 segments should be deleted in
     # this period.
     time.sleep(8)
     seeder.stop()
     seeded_count = seeder.num_acked
     self.logger.info("Seeded topic with %d messages which will be deleted" % seeded_count)

     # The configured check interval is 5 seconds, so wait another 6 seconds
     # to allow at least one more cleaning pass, which deletes the last
     # segment. An alternative to fixed sleeps is to poll each partition until
     # its log start offset matches the end offset; that would be more robust.
     time.sleep(6)
Пример #15
0
    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic,
                                      num_copiers, num_messages_to_copy):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copy_timeout_sec = 120
        copiers = self.create_and_start_copiers(input_topic=input_topic,
                                                output_topic=output_topic,
                                                num_copiers=num_copiers)
        concurrent_consumer = self.start_consumer(output_topic,
                                                  group_id="concurrent_consumer")

        # Only the "clean_bounce" failure mode requests a clean shutdown
        clean_shutdown = (failure_mode == "clean_bounce")

        # Any other bounce target means nothing is bounced
        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            timeout_msg = "%s - Failed to copy all messages in  %ds." % \
                (copier.transactional_id, copy_timeout_sec)
            # The lambda is consumed inside this iteration, so it sees the current copier
            wait_until(lambda: copier.is_done,
                       timeout_sec=copy_timeout_sec,
                       err_msg=timeout_msg)
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)
Пример #16
0
 def start_producer(self):
     """Create and start a single-node VerifiableProducer and wait for MAX_MESSAGES acks.

     The acked count is sampled right after the producer is started; the wait
     succeeds once it has grown by at least ``MAX_MESSAGES`` and is given 10 seconds.
     """
     # This will produce to kafka cluster
     self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka,
                                        topic=TOPIC, throughput=1000,
                                        max_messages=MAX_MESSAGES)
     self.producer.start()
     acked_at_start = self.producer.num_acked

     def all_messages_acked():
         return self.producer.num_acked >= acked_at_start + MAX_MESSAGES

     wait_until(all_messages_acked, timeout_sec=10,
                err_msg="Timeout awaiting messages to be produced and acked")
Пример #17
0
    def reassign_partitions(self, bounce_brokers):
        """Shuffle the partition ids of ``self.topic`` and wait for the reassignment to complete.

        The current assignment is read from ``describe_topic``, the partition
        ids are permuted with a logged random seed, and the result is submitted
        with ``execute_reassign_partitions``.

        :param bounce_brokers: when true, ``clean_bounce_some_brokers`` is called
            after the reassignment has been submitted.
        """
        partition_info = self.kafka.parse_describe_topic(self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" + str(partition_info))

        # jumble partition assignment in dictionary
        seed = random.randint(0, 2 ** 31 - 1)
        self.logger.debug("Jumble partition assignment with seed " + str(seed))
        random.seed(seed)
        # random.shuffle needs a mutable sequence; on Python 3 ``range`` is immutable,
        # so build a real list first. The list may still be in order, but that's ok.
        shuffled_list = list(range(0, self.num_partitions))
        random.shuffle(shuffled_list)

        for i, new_partition in enumerate(shuffled_list):
            partition_info["partitions"][i]["partition"] = new_partition
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        # send reassign partitions command
        self.kafka.execute_reassign_partitions(partition_info)

        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        wait_until(lambda: self.kafka.verify_reassign_partitions(partition_info), timeout_sec=self.timeout_sec, backoff_sec=.5)
Пример #18
0
    def assert_produce(self, topic, test_state, num_messages=5, timeout_sec=60):
        """Start a producer and wait until at least ``num_messages`` have been acked.

        :param topic: passed to ``get_producer``.
        :param test_state: label interpolated into the timeout message.
        :param num_messages: minimum value ``num_acked`` must reach.
        :param timeout_sec: how long to wait before giving up.
        """
        producer = self.get_producer(topic, num_messages)
        producer.start()

        timeout_msg = "At %s failed to send messages " % test_state

        def acked_enough():
            return producer.num_acked >= num_messages

        wait_until(acked_enough, timeout_sec=timeout_sec, err_msg=timeout_msg)
Пример #19
0
    def test_simple_run(self, producer_version=DEV_BRANCH):
        """
        Test that we can start VerifiableProducer on the current branch snapshot version or against the 0.8.2 jar, and
        verify that we can produce a small number of messages.
        """
        producer_node = self.producer.nodes[0]
        producer_node.version = KafkaVersion(producer_version)
        self.producer.start()

        def producer_started():
            return self.producer.num_acked > 5

        wait_until(producer_started, timeout_sec=5,
                   err_msg="Producer failed to start in a reasonable amount of time.")

        # Comparing version.vstring (distutils.version.LooseVersion) is a trick that keeps
        # this check working with DEV_BRANCH.
        # A VerifiableProducer 0.8.X run should show both the current branch version and
        # 0.8.X, because verifiable producer pulls some development directories into its
        # classpath.
        #
        # If the test fails here because 'ps .. | grep' couldn't find the process, the
        # login and grep done by is_version() were slower than the producer needed to
        # produce its messages. The easy fix is to decrease throughput= above; the good
        # fix is to make the producer not terminate until explicitly killed in this case.
        if producer_node.version <= LATEST_0_8_2:
            expected_versions = [producer_node.version.vstring, DEV_BRANCH.vstring]
        else:
            expected_versions = [producer_node.version.vstring]
        assert is_version(producer_node, expected_versions, logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)
Пример #20
0
    def test_file_source_and_sink(self):
        """
        Tests that a basic file connector works across clean rolling bounces. This validates that the connector is
        correctly created, tasks instantiated, and as nodes restart the work is rebalanced across nodes.
        """

        def append_to_input(payload):
            # Append the payload to the input file on every Connect node
            for connect_node in self.cc.nodes:
                connect_node.account.ssh("echo -e -n " + repr(payload) + " >> " + self.INPUT_FILE)

        self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))

        self.cc.start()

        self.logger.info("Creating connectors")
        # Both templates are rendered before the first connector is created
        rendered_props = [self.render("connect-file-source.properties"),
                          self.render("connect-file-sink.properties")]
        for connector_props in rendered_props:
            # Parse "key=value" lines, skipping blank lines and '#' comments
            pairs = []
            for raw_line in connector_props.split('\n'):
                if raw_line.strip() and not raw_line.strip().startswith('#'):
                    pairs.append(raw_line.strip().split('=', 1))
            self.cc.create_connector(dict(pairs))

        # Generating data on the source node should generate new records and create new output on the sink node. Timeouts
        # here need to be more generous than they are for standalone mode because a) it takes longer to write configs,
        # do rebalancing of the group, etc, and b) without explicit leave group support, rebalancing takes awhile
        append_to_input(self.FIRST_INPUTS)
        wait_until(lambda: self._validate_file_output(self.FIRST_INPUT_LIST),
                   timeout_sec=70,
                   err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.cc.restart()

        append_to_input(self.SECOND_INPUTS)
        wait_until(lambda: self._validate_file_output(self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST),
                   timeout_sec=70,
                   err_msg="Sink output file never converged to the same state as the input file")
Пример #21
0
    def bounce(self, clean_shutdown=True):
        """Bounce mirror maker with a clean (kill -15) or hard (kill -9) shutdown"""

        def consumed_count():
            return len(self.consumer.messages_consumed[1])

        # Wait until messages start appearing in the target cluster
        wait_until(lambda: consumed_count() > 0, timeout_sec=15)

        # Wait for at least one offset to be committed.
        #
        # This step is necessary to prevent data loss with default mirror maker settings:
        # currently, if we don't have at least one committed offset,
        # and we bounce mirror maker, the consumer internals will throw OffsetOutOfRangeException, and the default
        # auto.offset.reset policy ("largest") will kick in, causing mirrormaker to start consuming from the largest
        # offset. As a result, any messages produced to the source cluster while mirrormaker was dead won't get
        # mirrored to the target cluster.
        # (see https://issues.apache.org/jira/browse/KAFKA-2759)
        #
        # This isn't necessary with kill -15 because mirror maker commits its offsets during graceful
        # shutdown.
        if not clean_shutdown:
            time.sleep(self.mirror_maker.offset_commit_interval_ms / 1000.0 + .5)

        for _ in range(3):
            self.logger.info("Bringing mirror maker nodes down...")
            for mm_node in self.mirror_maker.nodes:
                self.mirror_maker.stop_node(mm_node, clean_shutdown=clean_shutdown)

            consumed_before_restart = consumed_count()
            self.logger.info("Bringing mirror maker nodes back up...")
            for mm_node in self.mirror_maker.nodes:
                self.mirror_maker.start_node(mm_node)

            # Ensure new messages are once again showing up on the target cluster
            # new consumer requires higher timeout here
            wait_until(lambda: consumed_count() > consumed_before_restart + 100, timeout_sec=60)
 def start_producer_and_consumer(self):
     """Start the background producer, then the consumer, waiting for each to show progress.

     The producer must have more than 5 acked messages and the consumer must
     have consumed at least one message, each within 10 seconds.
     """
     def producer_running():
         return self.producer.num_acked > 5

     def consumer_running():
         return len(self.consumer.messages_consumed[1]) > 0

     # Start background producer and consumer
     self.producer.start()
     wait_until(producer_running, timeout_sec=10,
                err_msg="Producer failed to start in a reasonable amount of time.")
     self.consumer.start()
     wait_until(consumer_running, timeout_sec=10,
                err_msg="Consumer failed to start in a reasonable amount of time.")
Пример #23
0
    def check_wait_until_timeout(self):
        """Check that timeout throws exception

        The condition only becomes true after 5 seconds while the timeout is
        0.5 seconds, so ``wait_until`` is expected to raise an exception whose
        message is the supplied ``err_msg``.
        """
        start = time.time()

        try:
            wait_until(lambda: time.time() > start + 5, timeout_sec=.5, backoff_sec=.1, err_msg="Hello world")
        except Exception as e:
            # str(e) works on Python 2 and 3; ``e.message`` no longer exists on Python 3
            assert str(e) == "Hello world"
        else:
            # Raised outside the try body so this failure is not swallowed by the
            # handler above and reported as a confusing message mismatch.
            raise AssertionError("This should have timed out")
Пример #24
0
 def await_startup(self, min_records=5, timeout_sec=30):
     """Wait until the consumer has consumed at least ``min_records`` records.

     On any failure (``BaseException``) the logs are collected through
     ``_collect_all_logs`` and the original exception is re-raised.
     """
     def enough_records_delivered():
         return self.consumer.total_consumed() >= min_records

     try:
         timeout_msg = "Timed out after %ds while awaiting initial record delivery of %d records" % \
             (timeout_sec, min_records)
         wait_until(enough_records_delivered, timeout_sec=timeout_sec, err_msg=timeout_msg)
     except BaseException:
         # Grab the logs for diagnosis, then let the failure propagate unchanged
         self._collect_all_logs()
         raise
Пример #25
0
    def rolling_bounce_brokers(self, consumer, num_bounces=5, clean_shutdown=True):
        """Restart every broker in turn, ``num_bounces`` times, checking the consumer keeps going.

        After each broker restart, waits up to 30 seconds until all
        ``self.num_consumers`` consumers are joined and ``total_consumed()`` has
        grown past the value sampled before the restart.

        :param consumer: consumer service used to observe membership and progress.
        :param num_bounces: number of full passes over ``self.kafka.nodes``.
        :param clean_shutdown: forwarded to ``restart_node`` for every broker.
        """
        for _ in range(num_bounces):
            for node in self.kafka.nodes:
                total_consumed = consumer.total_consumed()

                # Honour the caller's shutdown mode rather than always restarting cleanly
                self.kafka.restart_node(node, clean_shutdown=clean_shutdown)

                wait_until(lambda: len(consumer.joined_nodes()) == self.num_consumers and consumer.total_consumed() > total_consumed,
                           timeout_sec=30,
                           err_msg="Timed out waiting for the broker to shutdown")
Пример #26
0
    def stop_node(self, node, clean_shutdown=True):
        """Signal all processes from ``self.pids(node)``, wait for each to exit, then remove the pid file.

        :param node: node whose processes are stopped.
        :param clean_shutdown: send SIGTERM when true, SIGKILL otherwise.
        """
        target_pids = self.pids(node)
        if clean_shutdown:
            stop_signal = signal.SIGTERM
        else:
            stop_signal = signal.SIGKILL

        def wait_for_exit(pid):
            wait_until(lambda: not node.account.alive(pid), timeout_sec=60,
                       err_msg="Kafka Connect standalone process took too long to exit")

        # Signal everything first, then wait for each process in turn
        for target_pid in target_pids:
            node.account.signal(target_pid, stop_signal, allow_fail=False)
        for target_pid in target_pids:
            wait_for_exit(target_pid)

        node.account.ssh("rm -f /mnt/connect.pid", allow_fail=False)
Пример #27
0
 def bounce_copiers(self, copiers, clean_shutdown):
     """Restart every copier three times, each time after it has reached 20% progress.

     :param copiers: copier services to bounce.
     :param clean_shutdown: forwarded to each copier's ``restart``.
     """
     for _ in range(3):
         for copier in copiers:
             # The progress figure in this message is sampled before the wait begins
             timeout_msg = "%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                 % (copier.transactional_id, str(copier.progress_percent()))

             def made_enough_progress():
                 return copier.progress_percent() >= 20.0

             wait_until(made_enough_progress, timeout_sec=30, err_msg=timeout_msg)

             progress_line = "%s - progress: %s" % (copier.transactional_id,
                                                    str(copier.progress_percent()))
             self.logger.info(progress_line)
             copier.restart(clean_shutdown)
Пример #28
0
 def bounce_brokers(self, clean_shutdown):
     """Bounce every Kafka broker in turn.

     With ``clean_shutdown`` each broker is restarted cleanly. Otherwise the
     broker is hard-stopped, the method waits until it has no pids left and
     is no longer registered, and only then starts it again.

     :param clean_shutdown: selects the clean restart or the hard-kill path.
     """
     for node in self.kafka.nodes:
         if clean_shutdown:
             self.kafka.restart_node(node, clean_shutdown=True)
         else:
             self.kafka.stop_node(node, clean_shutdown=False)
             # Allow the ZK session timeout plus a 5 second margin for deregistration.
             # The message is a single-line literal so no indentation whitespace
             # ends up embedded in the reported error.
             wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self.kafka.is_registered(node),
                        timeout_sec=self.kafka.zk_session_timeout + 5,
                        err_msg="Failed to see timely deregistration of hard-killed broker %s" % str(node.account))
             self.kafka.start_node(node)
Пример #29
0
    def stop_node(self, node, clean_shutdown=True):
        """Signal the Kafka Connect processes on ``node`` and remove the pid file.

        Signalling failures are tolerated (``allow_fail=True``). Process exit is
        only awaited for a clean shutdown.

        :param node: node whose processes are stopped.
        :param clean_shutdown: send SIGTERM and wait for exit when true; send
            SIGKILL without waiting otherwise.
        """
        mode = "Cleanly" if clean_shutdown else "Forcibly"
        self.logger.info(mode + " stopping Kafka Connect on " + str(node.account))
        target_pids = self.pids(node)
        if clean_shutdown:
            stop_signal = signal.SIGTERM
        else:
            stop_signal = signal.SIGKILL

        for target_pid in target_pids:
            node.account.signal(target_pid, stop_signal, allow_fail=True)

        # An empty tuple means there is nothing to wait for after a forced stop
        for target_pid in (target_pids if clean_shutdown else ()):
            wait_until(lambda: not node.account.alive(target_pid), timeout_sec=60,
                       err_msg="Kafka Connect process on " + str(node.account) + " took too long to exit")

        node.account.ssh("rm -f " + self.PID_FILE, allow_fail=False)
Пример #30
0
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True, security_protocol='PLAINTEXT'):
        """
        Validates basic end-to-end functionality of Connect standalone using the file source and sink converters. Includes
        parameterizations to test different converters (which also test per-connector converter overrides), schema/schemaless
        modes, and security support.

        :param converter: converter class name; must not be None. Any value other than the JsonConverter is stored
            as the key and value converter override.
        :param schemas: stored as the ``schemas`` template parameter and used to build the expected records.
        :param security_protocol: used for both the client and the inter-broker protocol of the Kafka service.
        """
        # Identity comparison is the correct None check
        assert converter is not None, "converter type must be set"
        # Template parameters. Note that we don't set key/value.converter. These default to JsonConverter and we validate
        # converter overrides via the connector configuration.
        if converter != "org.apache.kafka.connect.json.JsonConverter":
            self.override_key_converter = converter
            self.override_value_converter = converter
        self.schemas = schemas

        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                                  security_protocol=security_protocol, interbroker_security_protocol=security_protocol,
                                  topics=self.topics)

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC_TEST,
                                                  consumer_timeout_ms=10000)

        self.zk.start()
        self.kafka.start()

        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-source.properties")])
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-sink.properties")])

        self.source.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))
        self.sink.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT), timeout_sec=60, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT), timeout_sec=60, err_msg="Sink output file never converged to the same state as the input file")

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        # With schemas enabled every record is expected wrapped in a schema/payload envelope
        expected = json.dumps([line if not self.schemas else { "schema": self.SCHEMA, "payload": line } for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST])
        # Consumed messages are decoded as JSON only when a JsonConverter is in use
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps([decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (expected, actual)
Пример #31
0
 def wait_node(self, node, timeout_sec=None):
     """Wait for every process listed by ``self.pids(node)`` to exit.

     :param node: node whose processes are awaited.
     :param timeout_sec: forwarded unchanged to ``wait_until`` for each pid.
     """
     # NOTE(review): the default timeout_sec=None is passed straight through —
     # confirm wait_until accepts None as a timeout.
     exit_timeout_msg = "Streams Test process on " + str(node.account) + " took too long to exit"
     for running_pid in self.pids(node):
         wait_until(lambda: not node.account.alive(running_pid),
                    timeout_sec=timeout_sec,
                    err_msg=exit_timeout_msg)
Пример #32
0
 def check_producing(self):
     """Wait until the producer has acked more than five messages beyond the current count.

     The count is sampled once before waiting; the wait is given 30 seconds.
     """
     acked_baseline = self.producer.num_acked

     def producer_advanced():
         return self.producer.num_acked > acked_baseline + 5

     wait_until(
         producer_advanced,
         timeout_sec=30,
         err_msg="Expected producer to still be producing.",
     )
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.

        Returns immediately when ``from_version`` equals ``to_version``.
        Otherwise starts ZooKeeper and a three-node Kafka cluster at
        ``from_version``, runs the smoke test driver and job runner, calls
        ``self.perform_broker_upgrade(to_version)`` while the job runner's
        log and stdout are monitored, and finally waits for the driver to
        report that verification finished.

        :param from_version: broker version the cluster is started with
        :param to_version: version handed to ``perform_broker_upgrade``
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2

        # All smoke test topics share the same settings; the comprehension
        # still builds an independent dict per topic.
        topic_names = ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                       'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for name in topic_names
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)

        processor = StreamsSmokeTestJobRunnerService(self.test_context,
                                                     self.kafka)

        with self.driver.node.account.monitor_log(
                self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            # The job runner must be processing before the brokers are touched.
            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node))

            connected_message = "Discovered group coordinator"
            # Fresh monitors are opened here so that only output written
            # after the upgrade begins is considered.
            with processor.node.account.monitor_log(
                    processor.LOG_FILE) as log_monitor:
                with processor.node.account.monitor_log(
                        processor.STDOUT_FILE) as stdout_monitor:
                    self.perform_broker_upgrade(to_version)

                    log_monitor.wait_until(
                        connected_message,
                        timeout_sec=120,
                        err_msg=("Never saw output '%s' on " %
                                 connected_message) +
                        str(processor.node.account))

                    stdout_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on " %
                        self.processed_msg + str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all
            # records for the verification step so this timeout is set to
            # 6 minutes (360 seconds) for consuming of verification records
            # and a very conservative additional 2 minutes (120 seconds) to process
            # the records in the verification step
            #
            # Raw string: "\|" is not a valid Python escape, so the backslash
            # is spelled explicitly (presumably a grep-style alternation).
            driver_monitor.wait_until(
                r'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                timeout_sec=480,
                err_msg="Never saw output '%s' on " %
                'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' +
                str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        # NOTE(review): the ssh_capture result is discarded; if the exit
        # status is only checked while the output is consumed, a missing
        # SMOKE-TEST-CLIENT-CLOSED line would not fail the test -- confirm.
        processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                           processor.STDOUT_FILE,
                                           allow_fail=False)
Пример #34
0
    def test_standby_tasks_rebalance(self):
        """Check task assignment as three Streams instances start and stop.

        Instances are started and stopped in several orders; after each
        change the stdout file of every running instance is checked for the
        expected ACTIVE_TASKS/STANDBY_TASKS line. Afterwards both sink topics
        are consumed, the producer is drained, and the checkpoint logs are
        validated for monotonicity.
        """
        # TODO KIP-441: consider rewriting the test for HighAvailabilityTaskAssignor
        config_template = (
            ",sourceTopic=%s,sinkTopic1=%s,sinkTopic2=%s,"
            "internal.task.assignor.class="
            "org.apache.kafka.streams.processor.internals.assignment.StickyTaskAssignor"
        )
        configs = self.get_configs(
            config_template % (self.streams_source_topic,
                               self.streams_sink_topic_1,
                               self.streams_sink_topic_2))

        producer = self.get_producer(self.streams_source_topic,
                                     self.num_messages,
                                     throughput=15000,
                                     repeating_keys=6)
        producer.start()

        processor_1, processor_2, processor_3 = [
            StreamsStandbyTaskService(self.test_context, self.kafka, configs)
            for _ in range(3)
        ]

        # Expected status lines for one, two and three running instances.
        sole_owner = "ACTIVE_TASKS:6 STANDBY_TASKS:0"
        split_in_two = "ACTIVE_TASKS:3 STANDBY_TASKS:3"
        split_in_three = "ACTIVE_TASKS:2 STANDBY_TASKS:[1-3]"

        def expect(processor, status_line, **options):
            # Look for the status line in the instance's own stdout file.
            self.wait_for_verification(processor, status_line,
                                       processor.STDOUT_FILE, **options)

        processor_1.start()
        expect(processor_1, sole_owner)

        processor_2.start()
        expect(processor_1, split_in_two)
        expect(processor_2, split_in_two)

        processor_3.start()
        expect(processor_1, split_in_three)
        expect(processor_2, split_in_three)
        expect(processor_3, split_in_three)

        processor_1.stop()
        expect(processor_2, split_in_two, num_lines=2)
        expect(processor_3, split_in_two)

        processor_2.stop()
        expect(processor_3, sole_owner)

        processor_1.start()
        expect(processor_1, split_in_two)
        expect(processor_3, split_in_two, num_lines=2)

        processor_2.start()
        expect(processor_1, split_in_three)
        expect(processor_2, split_in_three)
        expect(processor_3, split_in_three, num_lines=2)

        processor_3.stop()
        expect(processor_1, split_in_two, num_lines=2)
        expect(processor_2, split_in_two)

        processor_1.stop()
        expect(processor_2, sole_owner)

        processor_3.start()
        expect(processor_3, split_in_two)
        expect(processor_2, split_in_two, num_lines=2)

        processor_1.start()
        expect(processor_1, split_in_three)
        expect(processor_2, split_in_three, num_lines=2)
        expect(processor_3, split_in_three)

        for sink_topic in (self.streams_sink_topic_1,
                           self.streams_sink_topic_2):
            self.assert_consume(
                self.client_id,
                "assert all messages consumed from %s" % sink_topic,
                sink_topic, self.num_messages)

        def everything_acked():
            return producer.num_acked >= self.num_messages

        wait_until(everything_acked,
                   timeout_sec=60,
                   err_msg="Failed to send all %s messages" %
                   str(self.num_messages))

        producer.stop()

        all_processors = (processor_1, processor_2, processor_3)
        for processor in all_processors:
            processor.stop()

        # Validate the checkpoint/restore logs for monotonicity
        # This was added to ensure that standby restoration resumes from the checkpoint
        # rather than the beginning of the changelog, as part of KAFKA-9169

        # First, process the logs to look for invariant violations
        for processor in all_processors:
            processor.node.account.ssh(
                validateMonotonicCheckpointsCmd(processor.LOG_FILE,
                                                processor.STDOUT_FILE))

        # Second, check to make sure no invariant violations were reported
        for processor in all_processors:
            processor.node.account.ssh("! grep ERROR " + processor.STDOUT_FILE,
                                       allow_fail=False)
Пример #35
0
 def wait(self):
     """Block until every process on every node of this service has exited.

     Each pid returned by ``self.pids(node)`` gets up to 600 seconds.
     """
     for node in self.nodes:
         for process_id in self.pids(node):
             failure_text = ("SimpleBenchmark process on " +
                             str(node.account) + " took too long to exit")

             def has_exited():
                 return not node.account.alive(process_id)

             wait_until(has_exited, timeout_sec=600, err_msg=failure_text)
Пример #36
0
 def stop_node(self, node):
     """Kill the java process on ``node`` and wait until it is no longer alive.

     The kill is allowed to fail; the wait polls ``self.alive(node)`` every
     0.2 seconds for at most 10 seconds.
     """
     node.account.kill_process("java", allow_fail=True)

     def is_stopped():
         return not self.alive(node)

     wait_until(is_stopped,
                timeout_sec=10,
                backoff_sec=0.2,
                err_msg="Timed out waiting for consumer to stop.")
Пример #37
0
    def prepare_cluster(self, use_tls, use_sasl):
        """Apply security settings, start the cluster and provision principals.

        mTLS identity is enabled only when TLS is used without SASL. With TLS,
        client certificates are created for the "morty", "cluster_describe"
        and "admin" common names. With SASL, the "base" and
        "cluster_describe" users are created and this method waits until
        every node lists them.

        :param use_tls: create certificates and install an mTLS provider
        :param use_sasl: enable SASL and create the SASL test users
        """
        self.security = SecurityConfig()
        self.security.enable_sasl = use_sasl
        self.security.enable_mtls_identity = use_tls and not use_sasl

        if use_tls:
            self.tls = tls.TLSCertManager(self.logger)

            # cert for principal with no explicitly granted permissions
            self.base_user_cert = self.tls.create_cert(socket.gethostname(),
                                                       common_name="morty",
                                                       name="base_client")

            # cert for principal with cluster describe permissions
            self.cluster_describe_user_cert = self.tls.create_cert(
                socket.gethostname(),
                common_name="cluster_describe",
                name="cluster_describe_client")

            # cert for admin user used to bootstrap
            self.admin_user_cert = self.tls.create_cert(
                socket.gethostname(),
                common_name="admin",
                name="test_admin_client")

            self.security.tls_provider = MTLSProvider(self.tls)

        self.redpanda.set_security_settings(self.security)
        self.redpanda.start()

        admin = Admin(self.redpanda)

        if self.security.enable_mtls_identity:
            feature_name = "mtls_authentication"
            admin.put_feature(feature_name, {"state": "active"})

            # wait for feature to be active so that tests don't have to retry
            def feature_is_active():
                return any(
                    f["name"] == feature_name and f["state"] == "active"
                    for f in admin.get_features()["features"])

            wait_until(feature_is_active,
                       timeout_sec=10,
                       backoff_sec=1,
                       err_msg="Feature %s did not become active" %
                       feature_name)

        if use_sasl:
            # base case user is not a superuser and has no configured ACLs
            admin.create_user("base", self.password, self.algorithm)

            # only grant cluster describe permission to user cluster_describe
            admin.create_user("cluster_describe", self.password,
                              self.algorithm)

        client = self.get_super_client()
        client.acl_create_allow_cluster("cluster_describe", "describe")

        # there is not a convenient interface for waiting for acls to propagate
        # to all nodes so when we are using mtls only for identity we inject a
        # sleep here to try to avoid any acl propagation races.
        if self.security.enable_mtls_identity:
            time.sleep(5)
            return

        if not use_sasl:
            # No SASL users were created above, so waiting for them to show
            # up on the nodes could only end in a timeout.
            return

        # wait for users to propagate to nodes
        def users_propagated():
            for node in self.redpanda.nodes:
                users = admin.list_users(node=node)
                if "base" not in users or "cluster_describe" not in users:
                    return False
            return True

        wait_until(users_propagated,
                   timeout_sec=10,
                   backoff_sec=1,
                   err_msg="Users did not propagate to all nodes")
Пример #38
0
 def await_members(self, consumer, num_consumers):
     """Wait until ``consumer`` reports exactly ``num_consumers`` joined nodes.

     The wait is limited to twice ``self.session_timeout_sec``.
     """
     def all_joined():
         return len(consumer.joined_nodes()) == num_consumers

     deadline_sec = self.session_timeout_sec * 2
     wait_until(all_joined,
                timeout_sec=deadline_sec,
                err_msg="Consumers failed to join in a reasonable amount of time")
Пример #39
0
 def await_consumed_messages(self, consumer, min_messages=1):
     """Wait until ``consumer`` has consumed ``min_messages`` more messages.

     The target is measured from the total consumed at the time of the
     call; the wait is limited to twice ``self.session_timeout_sec``.
     """
     target_total = consumer.total_consumed() + min_messages

     def reached_target():
         return consumer.total_consumed() >= target_total

     wait_until(reached_target,
                timeout_sec=self.session_timeout_sec * 2,
                err_msg="Timed out waiting for consumption")
Пример #40
0
    def test_transformations(self, connect_protocol):
        """Run a file source connector with two transforms and check its output.

        The connector hoists each input line into a struct field named
        'content' and inserts a timestamp field. Every consumed record must
        carry the expected schema, a known input line as content, and a
        timestamp field equal to the CreateTime printed by the consumer.
        """
        self.CONNECT_PROTOCOL = connect_protocol
        self.setup_services(timestamp_type='CreateTime')

        def render_worker_config(node):
            return self.render("connect-distributed.properties", node=node)

        self.cc.set_configs(render_worker_config)
        self.cc.start()

        ts_fieldname = 'the_timestamp'

        NamedConnector = namedtuple('Connector', ['name'])
        source_connector = NamedConnector(name='file-src')

        connector_config = {
            'name': source_connector.name,
            'connector.class': 'org.apache.kafka.connect.file.FileStreamSourceConnector',
            'tasks.max': 1,
            'file': self.INPUT_FILE,
            'topic': self.TOPIC,
            'transforms': 'hoistToStruct,insertTimestampField',
            'transforms.hoistToStruct.type': 'org.apache.kafka.connect.transforms.HoistField$Value',
            'transforms.hoistToStruct.field': 'content',
            'transforms.insertTimestampField.type': 'org.apache.kafka.connect.transforms.InsertField$Value',
            'transforms.insertTimestampField.timestamp.field': ts_fieldname,
        }
        self.cc.create_connector(connector_config)

        def source_is_running():
            return self.connector_is_running(source_connector)

        wait_until(
            source_is_running,
            timeout_sec=30,
            err_msg='Failed to see connector transition to the RUNNING state')

        # Append the first batch of input lines on every worker node.
        for worker in self.cc.nodes:
            append_cmd = "echo -e -n %s >> %s" % (repr(self.FIRST_INPUTS),
                                                  self.INPUT_FILE)
            worker.account.ssh(append_cmd)

        consumer = ConsoleConsumer(self.test_context,
                                   1,
                                   self.kafka,
                                   self.TOPIC,
                                   consumer_timeout_ms=15000,
                                   print_timestamp=True)
        consumer.run()

        assert len(consumer.messages_consumed[1]) == len(self.FIRST_INPUT_LIST)

        content_field = {
            'field': 'content',
            'type': 'string',
            'optional': False
        }
        timestamp_field = {
            'field': ts_fieldname,
            'name': 'org.apache.kafka.connect.data.Timestamp',
            'type': 'int64',
            'version': 1,
            'optional': True
        }
        expected_schema = {
            'type': 'struct',
            'fields': [content_field, timestamp_field],
            'optional': False
        }

        ts_prefix = 'CreateTime:'
        for record in consumer.messages_consumed[1]:
            # Each record is "<timestamp info>\t<json value>".
            ts_info, value = record.split('\t')

            assert ts_info.startswith(ts_prefix)
            ts = int(ts_info[len(ts_prefix):])

            decoded = json.loads(value)
            payload_check = decoded['schema'] == expected_schema
            assert payload_check
            assert decoded['payload']['content'] in self.FIRST_INPUT_LIST
            assert decoded['payload'][ts_fieldname] == ts
Пример #41
0
    def test_leadership_transfer(self):
        """Check that group offset metrics follow the consumer group leader.

        Starts a producer and a consumer (group "g0") for every topic with
        more than one partition, then repeatedly transfers leadership of
        partition 0 of ``__consumer_offsets`` to another replica and waits
        until the ``kafka_group_offset`` samples for the group all come from
        the new leader.
        """
        topics = [topic for topic in self.topics if topic.partition_count > 1]
        group = "g0"

        producers = []
        for topic in topics:
            producer = RpkProducer(self._ctx,
                                   self.redpanda,
                                   topic.name,
                                   msg_size=5,
                                   msg_count=1000)
            producer.start()
            producers.append(producer)

        consumers = []
        for topic in topics:
            consumer = RpkConsumer(self._ctx,
                                   self.redpanda,
                                   topic.name,
                                   group=group)
            consumer.start()
            consumers.append(consumer)

        # Wait until cluster starts producing metrics
        wait_until(
            lambda: self.redpanda.metrics_sample("kafka_group_offset"
                                                 ) is not None,
            timeout_sec=30,
            backoff_sec=5,
            err_msg="Cluster never reported kafka_group_offset metrics")

        admin = Admin(redpanda=self.redpanda)

        def get_group_partition():
            return admin.get_partitions(namespace="kafka",
                                        topic="__consumer_offsets",
                                        partition=0)

        def get_group_leader():
            return get_group_partition()['leader_id']

        def metrics_from_single_node(node):
            """
            Check that metrics are produced only by the given node.
            """
            metrics = self.redpanda.metrics_sample("kafka_group_offset")
            if not metrics:
                self.logger.debug("No metrics found")
                return False
            metrics = metrics.label_filter(dict(group=group)).samples
            for metric in metrics:
                self.logger.debug(
                    f"Retrieved metric from node={metric.node.account.hostname}: {metric}"
                )
            return all(metric.node.account.hostname == node.account.hostname
                       for metric in metrics)

        def transfer_leadership(new_leader):
            """
            Request leadership transfer of the internal consumer group partition
            and check that it completes successfully.
            """
            self.logger.debug(
                f"Transferring leadership to {new_leader.account.hostname}")
            admin.transfer_leadership_to(namespace="kafka",
                                         topic="__consumer_offsets",
                                         partition=0,
                                         target=self.redpanda.idx(new_leader))
            for _ in range(3):  # re-check a few times
                leader = get_group_leader()
                self.logger.debug(f"Current leader: {leader}")
                if leader != -1 and self.redpanda.get_node(
                        leader) == new_leader:
                    return True
                time.sleep(1)
            return False

        def partition_ready():
            """
            All replicas present and known leader
            """
            partition = get_group_partition()
            self.logger.debug(f"XXXXX: {partition}")
            return len(
                partition['replicas']) == 3 and partition['leader_id'] >= 0

        def select_next_leader():
            """
            Select a leader different than the current leader
            """
            wait_until(partition_ready,
                       timeout_sec=30,
                       backoff_sec=5,
                       err_msg="Group partition never had 3 replicas and a "
                       "known leader")
            partition = get_group_partition()
            replicas = partition['replicas']
            assert len(replicas) == 3
            leader = partition['leader_id']
            assert leader >= 0
            candidates = [r for r in replicas if r["node_id"] != leader]
            new_leader = random.choice(candidates)['node_id']
            return self.redpanda.get_node(new_leader)

        # repeat the following test a few times.
        #
        #  1. transfer leadership to a new node
        #  2. check that new leader reports metrics
        #  3. check that prev leader does not report
        #
        # note that here reporting does not mean that the node does not report
        # any metrics but that it does not report metrics for consumer groups
        # for which it is not leader.
        for _ in range(4):
            new_leader = select_next_leader()

            # The lambdas are invoked before the loop variable is rebound,
            # so capturing new_leader here is safe.
            wait_until(lambda: transfer_leadership(new_leader),
                       timeout_sec=30,
                       backoff_sec=5,
                       err_msg="Leadership transfer did not complete")

            wait_until(lambda: metrics_from_single_node(new_leader),
                       timeout_sec=30,
                       backoff_sec=5,
                       err_msg="Group metrics were not reported solely by "
                       "the new leader")

        for host in producers + consumers:
            host.stop()
            host.free()
Пример #42
0
    def test_bounce(self, clean, connect_protocol):
        """
        Validates that source and sink tasks that run continuously and produce a predictable sequence of messages
        run correctly and deliver messages exactly once when Kafka Connect workers undergo clean rolling bounces.
        """
        num_tasks = 3

        self.CONNECT_PROTOCOL = connect_protocol
        self.setup_services()
        self.cc.set_configs(lambda node: self.render(
            "connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc,
                                       topic=self.TOPIC,
                                       tasks=num_tasks,
                                       throughput=100)
        self.source.start()
        self.sink = VerifiableSink(self.cc,
                                   tasks=num_tasks,
                                   topics=[self.TOPIC])
        self.sink.start()

        for _ in range(3):
            for node in self.cc.nodes:
                started = time.time()
                self.logger.info("%s bouncing Kafka Connect on %s",
                                 clean and "Clean" or "Hard",
                                 str(node.account))
                self.cc.stop_node(node, clean_shutdown=clean)
                with node.account.monitor_log(self.cc.LOG_FILE) as monitor:
                    self.cc.start_node(node)
                    monitor.wait_until(
                        "Starting connectors and tasks using config offset",
                        timeout_sec=90,
                        err_msg=
                        "Kafka Connect worker didn't successfully join group and start work"
                    )
                self.logger.info(
                    "Bounced Kafka Connect on %s and rejoined in %f seconds",
                    node.account,
                    time.time() - started)

                # Give additional time for the consumer groups to recover. Even if it is not a hard bounce, there are
                # some cases where a restart can cause a rebalance to take the full length of the session timeout
                # (e.g. if the client shuts down before it has received the memberId from its initial JoinGroup).
                # If we don't give enough time for the group to stabilize, the next bounce may cause consumers to
                # be shut down before they have any time to process data and we can end up with zero data making it
                # through the test.
                time.sleep(15)

        # Wait at least scheduled.rebalance.max.delay.ms to expire and rebalance
        time.sleep(60)

        # Allow the connectors to startup, recover, and exit cleanly before
        # ending the test. It's possible for the source connector to make
        # uncommitted progress, and for the sink connector to read messages that
        # have not been committed yet, and fail a later assertion.
        wait_until(
            lambda: self.is_running(self.source),
            timeout_sec=30,
            err_msg="Failed to see connector transition to the RUNNING state")
        time.sleep(15)
        self.source.stop()
        # Ensure that the sink connector has an opportunity to read all
        # committed messages from the source connector.
        wait_until(
            lambda: self.is_running(self.sink),
            timeout_sec=30,
            err_msg="Failed to see connector transition to the RUNNING state")
        time.sleep(15)
        self.sink.stop()
        self.cc.stop()

        # Validate at least once delivery of everything that was reported as written since we should have flushed and
        # cleanly exited. Currently this only tests at least once delivery because the sink task may not have consumed
        # all the messages generated by the source task. This needs to be done per-task since seqnos are not unique across
        # tasks.
        success = True
        errors = []
        allow_dups = not clean
        src_messages = self.source.committed_messages()
        sink_messages = self.sink.flushed_messages()
        for task in range(num_tasks):
            # Validate source messages
            src_seqnos = [
                msg['seqno'] for msg in src_messages if msg['task'] == task
            ]
            # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because clean
            # bouncing should commit on rebalance.
            src_seqno_max = max(src_seqnos) if len(src_seqnos) else 0
            self.logger.debug("Max source seqno: %d", src_seqno_max)
            src_seqno_counts = Counter(src_seqnos)
            missing_src_seqnos = sorted(
                set(range(src_seqno_max)).difference(set(src_seqnos)))
            duplicate_src_seqnos = sorted(
                seqno for seqno, count in src_seqno_counts.items()
                if count > 1)

            if missing_src_seqnos:
                self.logger.error("Missing source sequence numbers for task " +
                                  str(task))
                errors.append(
                    "Found missing source sequence numbers for task %d: %s" %
                    (task, missing_src_seqnos))
                success = False
            if not allow_dups and duplicate_src_seqnos:
                self.logger.error(
                    "Duplicate source sequence numbers for task " + str(task))
                errors.append(
                    "Found duplicate source sequence numbers for task %d: %s" %
                    (task, duplicate_src_seqnos))
                success = False

            # Validate sink messages
            sink_seqnos = [
                msg['seqno'] for msg in sink_messages if msg['task'] == task
            ]
            # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because
            # clean bouncing should commit on rebalance.
            sink_seqno_max = max(sink_seqnos) if len(sink_seqnos) else 0
            self.logger.debug("Max sink seqno: %d", sink_seqno_max)
            sink_seqno_counts = Counter(sink_seqnos)
            missing_sink_seqnos = sorted(
                set(range(sink_seqno_max)).difference(set(sink_seqnos)))
            duplicate_sink_seqnos = sorted(
                seqno for seqno, count in iter(sink_seqno_counts.items())
                if count > 1)

            if missing_sink_seqnos:
                self.logger.error("Missing sink sequence numbers for task " +
                                  str(task))
                errors.append(
                    "Found missing sink sequence numbers for task %d: %s" %
                    (task, missing_sink_seqnos))
                success = False
            if not allow_dups and duplicate_sink_seqnos:
                self.logger.error("Duplicate sink sequence numbers for task " +
                                  str(task))
                errors.append(
                    "Found duplicate sink sequence numbers for task %d: %s" %
                    (task, duplicate_sink_seqnos))
                success = False

            # Validate source and sink match
            if sink_seqno_max > src_seqno_max:
                self.logger.error(
                    "Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d",
                    task, sink_seqno_max, src_seqno_max)
                errors.append(
                    "Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d"
                    % (task, sink_seqno_max, src_seqno_max))
                success = False
            if src_seqno_max < 1000 or sink_seqno_max < 1000:
                errors.append(
                    "Not enough messages were processed: source:%d sink:%d" %
                    (src_seqno_max, sink_seqno_max))
                success = False

        if not success:
            self.mark_for_collect(self.cc)
            # Also collect the data in the topic to aid in debugging
            consumer_validator = ConsoleConsumer(self.test_context,
                                                 1,
                                                 self.kafka,
                                                 self.source.topic,
                                                 consumer_timeout_ms=1000,
                                                 print_key=True)
            consumer_validator.run()
            self.mark_for_collect(consumer_validator, "consumer_stdout")

        assert success, "Found validation errors:\n" + "\n  ".join(errors)
Пример #43
0
    def test_node_operations(self, enable_failures):
        """Execute a random workload of node and topic operations on a live cluster.

        Starts a 5-node cluster, creates random topics, runs one producer and
        one consumer against a randomly chosen topic, then executes the
        operations returned by ``generate_random_workload`` (ADD,
        DECOMMISSION, ADD_TOPIC, DELETE_TOPIC) and finally calls
        ``run_validation``.

        :param enable_failures: when truthy, a daemon thread keeps injecting
            random failures until the workload has been executed.
        """
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        self.active_nodes = {self.redpanda.idx(n) for n in self.redpanda.nodes}
        # collect current mapping: node index -> node id (identical at start)
        self.ids_mapping = {}
        for n in self.redpanda.nodes:
            node_idx = self.redpanda.idx(n)
            self.ids_mapping[node_idx] = node_idx
        self.next_id = max(self.ids_mapping) + 1
        self.redpanda.logger.info(f"Initial ids mapping: {self.ids_mapping}")
        NODE_OP_TIMEOUT = 360

        def get_next_id():
            # hand out the next node id and advance the counter
            allocated = self.next_id
            self.next_id += 1
            return allocated

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            # enable_failures is read from the enclosing scope, so the
            # assignment made after the workload loop ends this loop too
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(
                        1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes))
                    node = self.redpanda.get_node(idx)

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(
                    NodeOperationFuzzyTest.min_inter_failure_time,
                    NodeOperationFuzzyTest.max_inter_failure_time)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(idx):
            node_id = self.ids_mapping[idx]
            self.logger.info(f"decommissioning node: {idx} with id: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if broker is already draining, it is success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if (b['node_id'] == node_id
                                and b['membership_status'] == 'draining'):
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except (requests.exceptions.RetryError,
                        requests.exceptions.ConnectionError,
                        requests.exceptions.HTTPError):
                    # admin API request failed; report False so wait_until retries
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            admin = Admin(self.redpanda)

            def is_node_removed(idx_to_query, node_id):
                try:
                    brokers = admin.get_brokers(
                        self.redpanda.get_node(idx_to_query))
                    return node_id not in [b['node_id'] for b in brokers]
                except Exception:
                    # best effort: any failure to query counts as "not removed
                    # yet", but KeyboardInterrupt/SystemExit still propagate
                    return False

            def node_removed():
                node_removed_cnt = 0
                for active_idx in self.active_nodes:
                    if is_node_removed(active_idx, node_id):
                        node_removed_cnt += 1

                node_count = len(self.redpanda.nodes)
                majority = node_count // 2 + 1
                self.redpanda.logger.debug(
                    f"node {node_id} removed on {node_removed_cnt} nodes, majority: {majority}"
                )
                return node_removed_cnt >= majority

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            self.redpanda.stop_node(self.redpanda.get_node(idx))

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            # count, per node id, how many partition replicas the metadata lists
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        replica_id = r['id']
                        node_replicas[replica_id] = node_replicas.get(
                            replica_id, 0) + 1

            return node_replicas

        def seed_servers_for(idx):
            # every node except the one at idx, as seed server entries
            own_hostname = self.redpanda.get_node(idx).account.hostname
            return [{
                "address": n.account.hostname,
                "port": 33145
            } for n in self.redpanda.nodes
                    if n.account.hostname != own_hostname]

        def add_node(idx, cleanup=True):
            new_id = get_next_id()
            self.logger.info(f"adding node: {idx} back with new id: {new_id}")
            self.ids_mapping[idx] = new_id
            self.redpanda.stop_node(self.redpanda.get_node(idx))
            if cleanup:
                self.redpanda.clean_node(self.redpanda.get_node(idx),
                                         preserve_logs=True)
            # we do not reuse previous node ids and override seed server list
            self.redpanda.start_node(
                self.redpanda.get_node(idx),
                timeout=NodeOperationFuzzyTest.min_inter_failure_time +
                NodeOperationFuzzyTest.max_suspend_duration_seconds + 30,
                override_cfg_params={
                    "node_id": new_id,
                    "seed_servers": seed_servers_for(idx)
                })

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return new_id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            return any(line.startswith(name) for line in lines)

        def create_topic(spec):
            # the create call may fail; what counts is whether the topic is
            # listed afterwards
            try:
                DefaultClient(self.redpanda).create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            # mirror of create_topic: succeed once the topic is no longer listed
            try:
                DefaultClient(self.redpanda).delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10,
                                             skip_nodes=set(),
                                             available_nodes=self.active_nodes)
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(
                f"executing - {op} - current ids: {self.ids_mapping}")
            if op_type == ADD:
                idx = op[1]
                self.active_nodes.add(idx)
                add_node(idx)
            elif op_type == DECOMMISSION:
                idx = op[1]
                self.active_nodes.remove(idx)
                decommission(idx)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

        # stops failure_injector_loop, which reads this variable via closure
        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Пример #44
0
 def start_and_wait_to_start_listening(self, node, worker_type, remote_connector_configs):
     """Start the worker on ``node`` and wait until ``self.listening(node)`` is true.

     The wait is bounded to 60 seconds; the arguments are passed unchanged to
     ``start_and_return_immediately``.
     """
     self.start_and_return_immediately(node, worker_type, remote_connector_configs)

     def is_listening():
         return self.listening(node)

     failure_msg = "Kafka Connect failed to start on node: %s in condition mode: %s" % (
         str(node.account), self.startup_mode)
     wait_until(is_listening, timeout_sec=60, err_msg=failure_msg)
    def test_rest_api(self):
        """Exercise the Connect REST API using the file source and sink connectors.

        Steps, in order: list connectors and plugins, validate both connector
        configs, create both connectors, push data through them, check that
        re-creating a connector is rejected, compare connector and task info
        against expectations, change the source connector's file, and finally
        delete both connectors.
        """
        # Template parameters
        json_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.key_converter = json_converter
        self.value_converter = json_converter
        self.schemas = True

        def render_worker_config(node):
            return self.render("connect-distributed.properties", node=node)

        def append_on_all_nodes(payload, path):
            # append the payload to the given file on every Connect node
            for worker in self.cc.nodes:
                worker.account.ssh("echo -e -n " + repr(payload) + " >> " +
                                   path)

        self.cc.set_configs(render_worker_config)
        self.cc.start()

        assert self.cc.list_connectors() == []

        plugin_classes = {
            plugin['class']
            for plugin in self.cc.list_connector_plugins()
        }
        assert plugin_classes == {
            self.FILE_SOURCE_CONNECTOR, self.FILE_SINK_CONNECTOR
        }

        source_props = self.render("connect-file-source.properties")
        sink_props = self.render("connect-file-sink.properties")

        self.logger.info("Validating connector configurations")
        source_cfg = self._config_dict_from_props(source_props)
        validation = self.cc.validate_config(self.FILE_SOURCE_CONNECTOR,
                                             source_cfg)
        self.verify_config(self.FILE_SOURCE_CONNECTOR,
                           self.FILE_SOURCE_CONFIGS, validation)

        sink_cfg = self._config_dict_from_props(sink_props)
        validation = self.cc.validate_config(self.FILE_SINK_CONNECTOR,
                                             sink_cfg)
        self.verify_config(self.FILE_SINK_CONNECTOR, self.FILE_SINK_CONFIGS,
                           validation)

        self.logger.info("Creating connectors")
        self.cc.create_connector(source_cfg, retries=120, retry_backoff=1)
        self.cc.create_connector(sink_cfg, retries=120, retry_backoff=1)

        # We should see the connectors appear
        created_names = {"local-file-source", "local-file-sink"}
        wait_until(
            lambda: set(self.cc.list_connectors(retries=5, retry_backoff=1)
                        ) == created_names,
            timeout_sec=10,
            err_msg=
            "Connectors that were just created did not appear in connector listing"
        )

        # We'll only do very simple validation that the connectors and tasks really ran.
        append_on_all_nodes(self.INPUTS, self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.INPUT_LIST),
            timeout_sec=120,
            err_msg=
            "Data added to input file was not seen in the output file in a reasonable amount of time."
        )

        # Trying to create the same connector again should cause an error
        try:
            self.cc.create_connector(
                self._config_dict_from_props(source_props))
        except ConnectRestError:
            pass  # expected
        else:
            assert False, "creating the same connector should have caused a conflict"

        # Validate that we can get info about connectors (source first, then sink)
        for connector_name, props in (("local-file-source", source_props),
                                      ("local-file-sink", sink_props)):
            expected_info = {
                'name': connector_name,
                'config': self._config_dict_from_props(props),
                'tasks': [{
                    'connector': connector_name,
                    'task': 0
                }]
            }
            actual_info = self.cc.get_connector(connector_name)
            assert expected_info == actual_info, "Incorrect info:" + json.dumps(
                actual_info)
            actual_config = self.cc.get_connector_config(connector_name)
            assert expected_info[
                'config'] == actual_config, "Incorrect config: " + json.dumps(
                    actual_config)

        # Validate that we can get info about tasks. This info should definitely be available now without waiting since
        # we've already seen data appear in files.
        # TODO: It would be nice to validate a complete listing, but that doesn't make sense for the file connectors
        expected_source_tasks = [{
            'id': {
                'connector': 'local-file-source',
                'task': 0
            },
            'config': {
                'task.class':
                'org.apache.kafka.connect.file.FileStreamSourceTask',
                'file': self.INPUT_FILE,
                'topic': self.TOPIC
            }
        }]
        actual_source_tasks = self.cc.get_connector_tasks("local-file-source")
        assert expected_source_tasks == actual_source_tasks, "Incorrect info:" + json.dumps(
            actual_source_tasks)

        expected_sink_tasks = [{
            'id': {
                'connector': 'local-file-sink',
                'task': 0
            },
            'config': {
                'task.class':
                'org.apache.kafka.connect.file.FileStreamSinkTask',
                'file': self.OUTPUT_FILE,
                'topics': self.TOPIC
            }
        }]
        actual_sink_tasks = self.cc.get_connector_tasks("local-file-sink")
        assert expected_sink_tasks == actual_sink_tasks, "Incorrect info:" + json.dumps(
            actual_sink_tasks)

        # Point the source connector at a second input file
        updated_source_cfg = self._config_dict_from_props(source_props)
        updated_source_cfg['file'] = self.INPUT_FILE2
        self.cc.set_connector_config("local-file-source", updated_source_cfg)

        # We should also be able to verify that the modified configs caused the tasks to move to the new file and pick up
        # more data.
        append_on_all_nodes(self.LONER_INPUTS, self.INPUT_FILE2)
        wait_until(
            lambda: self.validate_output(self.LONGER_INPUT_LIST),
            timeout_sec=120,
            err_msg=
            "Data added to input file was not seen in the output file in a reasonable amount of time."
        )

        for connector_name in ("local-file-source", "local-file-sink"):
            self.cc.delete_connector(connector_name,
                                     retries=120,
                                     retry_backoff=1)
        wait_until(
            lambda: len(self.cc.list_connectors(retries=5, retry_backoff=1)
                        ) == 0,
            timeout_sec=10,
            err_msg="Deleted connectors did not disappear from REST listing")
Пример #46
0
    def test_replication_with_disk_failure(self, bounce_broker, security_protocol, broker_type):
        """Replication test with one broker's log directory taken offline.

        Checks that data acked by brokers stays consumable when a log
        directory becomes unwritable.

        Setup (per the original test description): 1 zk, 3 kafka nodes, one
        topic with partitions=3, replication-factor=3, min.insync.replicas=2
        and another topic with partitions=3, replication-factor=3,
        min.insync.replicas=1.
            - Produce and consume topic2 in the background
            - Make the second log directory of the selected broker read-only
            - Optionally bounce that broker (``bounce_broker``)
            - Check it loses leadership and leaves the ISR of topic2 while staying alive
            - Stop the remaining brokers, then produce/consume topic1 through the
              broker with the offline directory
            - Validate after each produce/consume phase

        On any exception every service in the test context is marked for
        collection before the exception is re-raised.
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()

        try:
            # Initialize producer/consumer for topic2
            self.producer = VerifiableProducer(
                self.test_context, self.num_producers, self.kafka, self.topic2,
                throughput=self.producer_throughput)
            self.consumer = ConsoleConsumer(
                self.test_context, self.num_consumers, self.kafka, self.topic2,
                group_id="test-consumer-group-1",
                consumer_timeout_ms=60000, message_validator=is_int)
            self.start_producer_and_consumer()

            # Get a replica of the partition of topic2 and make its log directory offline by changing the log dir's permission.
            # We assume that partition of topic2 is created in the second log directory of respective brokers.
            broker_node = select_node(self, broker_type, self.topic2)
            broker_idx = self.kafka.idx(broker_node)
            assert broker_idx in self.kafka.isr_idx_list(self.topic2), \
                   "Broker %d should be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic2)))

            # Verify that topic1 and the consumer offset topic is in the first log directory and topic2 is in the second log directory
            expected_dirs = (
                KafkaService.DATA_LOG_DIR_1 + "/test_topic_1-0",
                KafkaService.DATA_LOG_DIR_2 + "/test_topic_2-0",
                KafkaService.DATA_LOG_DIR_1 + "/__consumer_offsets-0",
            )
            for log_path in expected_dirs:
                assert path_exists(broker_node, log_path), "%s should exist" % log_path

            self.logger.debug("Making log dir %s inaccessible" % (KafkaService.DATA_LOG_DIR_2))
            chmod_cmd = "chmod a-w %s -R" % (KafkaService.DATA_LOG_DIR_2)
            broker_node.account.ssh(chmod_cmd, allow_fail=False)

            if bounce_broker:
                self.kafka.restart_node(broker_node, clean_shutdown=True)

            # Verify the following:
            # 1) The broker with offline log directory is not the leader of the partition of topic2
            # 2) The broker with offline log directory is not in the ISR
            # 3) The broker with offline log directory is still online
            # 4) Messages can still be produced and consumed from topic2
            def lost_leadership():
                return self.kafka.leader(self.topic2, partition=0) != broker_node

            leader_msg = "Broker %d should not be leader of topic %s and partition 0" % (broker_idx, self.topic2)
            wait_until(lost_leadership, timeout_sec=60, err_msg=leader_msg)
            assert self.kafka.alive(broker_node), "Broker %d should be still online" % (broker_idx)

            def left_isr():
                return broker_idx not in self.kafka.isr_idx_list(self.topic2)

            # the message embeds the ISR as observed right before the wait starts
            isr_msg = "Broker %d should not be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic2)))
            wait_until(left_isr, timeout_sec=60, err_msg=isr_msg)

            self.stop_producer_and_consumer()
            self.validate()

            # Shutdown all other brokers so that the broker with offline log dir is the only online broker
            offline_nodes = [node for node in self.kafka.nodes if broker_node != node]
            for node in offline_nodes:
                self.logger.debug("Hard shutdown broker %d" % (self.kafka.idx(node)))
                self.kafka.stop_node(node)

            # Verify the following:
            # 1) The broker with offline directory is the only in-sync broker of the partition of topic1
            # 2) Messages can still be produced and consumed from topic1
            self.producer = VerifiableProducer(
                self.test_context, self.num_producers, self.kafka, self.topic1,
                throughput=self.producer_throughput, offline_nodes=offline_nodes)
            self.consumer = ConsoleConsumer(
                self.test_context, self.num_consumers, self.kafka, self.topic1,
                group_id="test-consumer-group-2",
                consumer_timeout_ms=90000, message_validator=is_int)
            self.consumer_start_timeout_sec = 90
            self.start_producer_and_consumer()

            assert self.kafka.isr_idx_list(self.topic1) == [broker_idx], \
                   "In-sync replicas of topic %s and partition 0 should be %s" % (self.topic1, str([broker_idx]))

            self.stop_producer_and_consumer()
            self.validate()

        except BaseException:
            # keep every service's logs for debugging, then propagate the failure
            for service in self.test_context.services:
                self.mark_for_collect(service)
            raise
Пример #47
0
 def wait_until_rejoin(self):
     """Wait, partition by partition, until each ISR has ``self.replication_factor`` members.

     Partitions ``0 .. self.partitions - 1`` of ``self.topic`` are checked in
     order; each one gets up to 60 seconds, polled every second.
     """
     def isr_is_full(partition_id):
         return len(self.kafka.isr_idx_list(self.topic, partition_id)) == self.replication_factor

     for partition_id in range(self.partitions):
         wait_until(lambda: isr_is_full(partition_id), timeout_sec=60, backoff_sec=1,
                    err_msg="Replicas did not rejoin the ISR in a reasonable amount of time")
Пример #48
0
    def test_skip_and_log_to_dlq(self, error_tolerance):
        """Feed alternating valid and malformed JSON lines through file source and sink connectors.

        1000 lines are written to the input file; even-numbered ones are valid
        JSON, odd-numbered ones are missing a quote after the key. With
        ``ErrorTolerance.NONE`` the clean output is expected NOT to show up
        within the timeout; otherwise it must show up. Afterwards the
        "my-connector-errors" topic is read and compared with the expected
        faulty records (all of them for ``ErrorTolerance.ALL``, else only the
        first one).

        :param error_tolerance: error tolerance setting under test.
        """
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)

        # set config props
        self.override_error_tolerance_props = error_tolerance
        self.enable_deadletterqueue = True

        good_lines, bad_lines, input_lines = [], [], []
        for seq in range(1000):
            if seq % 2:
                # badly formatted json records (missing a quote after the key)
                malformed = '{"some_key:' + str(seq) + '}'
                input_lines.append(malformed)
                bad_lines.append(malformed)
            else:
                input_lines.append('{"some_key":' + str(seq) + '}')
                good_lines.append('{some_key=' + str(seq) + '}')

        payload = "\n".join(input_lines) + "\n"
        expected_output = "\n".join(good_lines) + "\n"
        expected_dlq = ",".join(bad_lines) if error_tolerance == ErrorTolerance.ALL else bad_lines[0]

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])

        self.zk.start()
        self.kafka.start()

        def render_standalone(node):
            return self.render("connect-standalone.properties", node=node)

        def render_external(node):
            return self.render("connect-file-external.properties", node=node)

        # the source connector config is rendered while the String converter overrides are set
        string_converter = "org.apache.kafka.connect.storage.StringConverter"
        self.override_key_converter = string_converter
        self.override_value_converter = string_converter
        self.source.set_configs(render_standalone, [self.render("connect-file-source.properties")])

        # the sink connector config is rendered with schemaless JSON converter overrides
        json_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.override_key_converter = json_converter
        self.override_value_converter = json_converter
        self.override_key_converter_schemas_enable = False
        self.override_value_converter_schemas_enable = False
        self.sink.set_configs(render_standalone, [self.render("connect-file-sink.properties")])

        self.source.set_external_configs(render_external)
        self.sink.set_external_configs(render_external)

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(payload) + " >> " + self.INPUT_FILE)

        def clean_output_seen():
            return self.validate_output(expected_output)

        clean_msg = "Clean records added to input file were not seen in the output file in a reasonable amount of time."
        if error_tolerance != ErrorTolerance.NONE:
            wait_until(clean_output_seen, timeout_sec=15, err_msg=clean_msg)
        else:
            try:
                wait_until(clean_output_seen, timeout_sec=15, err_msg=clean_msg)
            except TimeoutError:
                self.logger.info("Caught expected exception")
            else:
                raise Exception("Expected to not find any results in this file.")

        if self.enable_deadletterqueue:
            self.logger.info("Reading records from deadletterqueue")
            dlq_consumer = ConsoleConsumer(self.test_context, 1, self.kafka, "my-connector-errors",
                                           consumer_timeout_ms=10000)
            dlq_consumer.run()
            actual = ",".join(dlq_consumer.messages_consumed[1])
            assert expected_dlq == actual, "Expected %s but saw %s in dead letter queue" % (expected_dlq, actual)
Пример #49
0
 def stop_node(self, node):
     """Kill the ``java`` process on ``node`` and wait until ``self.alive(node)`` is false.

     The kill is allowed to fail; the wait polls every 0.5 seconds for at most
     10 seconds.
     """
     node.account.kill_process("java", allow_fail=True)

     def has_exited():
         return not self.alive(node)

     wait_until(has_exited, timeout_sec=10, backoff_sec=.5,
                err_msg="Mirror maker took to long to stop.")
Пример #50
0
    def test_file_source_and_sink(
            self,
            converter="org.apache.kafka.connect.json.JsonConverter",
            schemas=True,
            security_protocol='PLAINTEXT'):
        """
        Validates basic end-to-end functionality of Connect standalone using the file source and sink converters. Includes
        parameterizations to test different converters (which also test per-connector converter overrides), schema/schemaless
        modes, and security support.
        """
        assert converter != None, "converter type must be set"
        # Template parameters. Note that we don't set key/value.converter. These default to JsonConverter and we validate
        # converter overrides via the connector configuration.
        if converter != "org.apache.kafka.connect.json.JsonConverter":
            self.override_key_converter = converter
            self.override_value_converter = converter
        self.schemas = schemas

        self.kafka = KafkaService(
            self.test_context,
            self.num_brokers,
            self.zk,
            security_protocol=security_protocol,
            interbroker_security_protocol=security_protocol,
            topics=self.topics)

        self.source = ConnectStandaloneService(
            self.test_context, self.kafka,
            [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(
            self.test_context, self.kafka,
            [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(self.test_context,
                                                  1,
                                                  self.kafka,
                                                  self.TOPIC_TEST,
                                                  consumer_timeout_ms=10000)

        self.zk.start()
        self.kafka.start()

        source_external_props = os.path.join(
            self.source.PERSISTENT_ROOT, "connect-file-external.properties")
        self.source.node.account.create_file(
            source_external_props,
            self.render('connect-file-external.properties'))
        self.source.set_configs(
            lambda node: self.render("connect-standalone.properties",
                                     node=node),
            [self.render("connect-file-source.properties")])

        sink_external_props = os.path.join(self.sink.PERSISTENT_ROOT,
                                           "connect-file-external.properties")
        self.sink.node.account.create_file(
            sink_external_props,
            self.render('connect-file-external.properties'))
        self.sink.set_configs(
            lambda node: self.render("connect-standalone.properties",
                                     node=node),
            [self.render("connect-file-sink.properties")])

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) +
                                     " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT),
            timeout_sec=60,
            err_msg=
            "Data added to input file was not seen in the output file in a reasonable amount of time."
        )

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) +
                                     " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT),
            timeout_sec=60,
            err_msg=
            "Sink output file never converged to the same state as the input file"
        )

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([
            line if not self.schemas else {
                "schema": self.SCHEMA,
                "payload": line
            } for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST
        ])
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps(
            [decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (
            expected, actual)
    def test_replication_with_replica_failure(self, metadata_quorum=quorum.zk):
        """
        Check that the ISR shrinks when a follower stops fetching and grows back once the follower
        is reachable again, and that data acked by the brokers is still available for consumption.

        Setup: 1 zk/KRaft controller, 3 kafka nodes, 1 topic with partitions=1, replication-factor=3,
        and min.insync.replicas=2

        Steps:
          - Produce and consume messages in the background
          - Cut one follower off from the other brokers with a Trogdor network partition
          - Wait until the ISR only contains the remaining replicas
          - Stop the partition task and wait until the ISR contains every replica again
          - Run the produce/consume validation

        :param metadata_quorum: not read in this method body; presumably consumed by the helpers
            that build the cluster -- TODO confirm
        """
        self.create_zookeeper_if_necessary()
        if self.zk:
            self.zk.start()

        lag_override = ["replica.lag.time.max.ms", "10000"]
        self.create_kafka(num_nodes=3,
                          server_prop_overrides=[lag_override],
                          controller_num_nodes_override=1)
        self.kafka.start()

        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka])
        self.trogdor.start()

        # With ZK the controller is listed first in the replica assignment so that the
        # partition leader is put on the controller node; this avoids partitioning the
        # controller later on in the test.
        if self.zk:
            controller = self.kafka.controller()
            ordered_brokers = [controller] + [
                broker for broker in self.kafka.nodes if broker != controller
            ]
        else:
            ordered_brokers = self.kafka.nodes
        assignment = [self.kafka.idx(broker) for broker in ordered_brokers]
        assignment_strs = [str(broker_idx) for broker_idx in assignment]

        self.topic = "test_topic"
        topic_cfg = {
            "topic": self.topic,
            "replica-assignment": ":".join(assignment_strs),
            "configs": {"min.insync.replicas": 2},
        }
        self.kafka.create_topic(topic_cfg)

        self.logger.info("Created topic %s with assignment %s", self.topic,
                         ", ".join(assignment_strs))

        self.create_producer()
        self.producer.start()

        self.create_consumer()
        self.consumer.start()

        self.await_startup()

        leader = self.kafka.leader(self.topic, partition=0)
        replicas = self.kafka.replicas(self.topic, partition=0)

        # The first non-leader replica is the one that gets isolated.
        followers = [replica for replica in replicas if replica != leader]
        isolated = followers[0]
        self.logger.info(
            "Partitioning follower %s (%s) from the other brokers",
            self.kafka.idx(isolated), isolated.name)

        reachable = [broker for broker in self.kafka.nodes if broker != isolated]
        # NOTE(review): the two leading numbers are presumably start time and duration in ms -- confirm.
        partition_spec = NetworkPartitionFaultSpec(0, 5 * 60 * 1000,
                                                   [[isolated], reachable])
        partition = self.trogdor.create_task("partition", partition_spec)

        def current_isr():
            # Due to the network partition, the kafka-topics command could fail if it tries
            # to connect to the partitioned broker. The error is swallowed here and an empty
            # set is returned so that the surrounding wait_until simply retries.
            try:
                isr_indices = self.kafka.isr_idx_list(self.topic,
                                                      partition=0,
                                                      node=leader,
                                                      offline_nodes=[isolated])
            except RemoteCommandError:
                return set()
            return set(isr_indices)

        # Verify that ISR is shrunk to every replica except the isolated follower.
        shrunk_isr = {
            self.kafka.idx(replica)
            for replica in replicas if replica != isolated
        }
        wait_until(lambda: current_isr() == shrunk_isr,
                   timeout_sec=120,
                   backoff_sec=1,
                   err_msg="ISR should have been shrunk.")

        # Remove the network partition and wait for the task to finish.
        partition.stop()
        partition.wait_for_done(timeout_sec=300)

        # Verify that ISR is expanded back to all replicas.
        full_isr = {self.kafka.idx(replica) for replica in replicas}
        wait_until(lambda: current_isr() == full_isr,
                   timeout_sec=120,
                   backoff_sec=1,
                   err_msg="ISR should have been expanded.")

        self.run_validation(producer_timeout_sec=120, min_records=25000)
Пример #52
0
 def await_produced_records(self, min_records, timeout_sec=30):
     """Block until the producer reports more than ``min_records`` acked messages.

     :param min_records: value that ``self.producer.num_acked`` must strictly exceed
     :param timeout_sec: timeout handed to ``wait_until``; also reported in the error message
     """
     failure_msg = "Producer failed to produce messages for %ds." % timeout_sec

     def enough_acked():
         return self.producer.num_acked > min_records

     wait_until(enough_acked, timeout_sec=timeout_sec, err_msg=failure_msg)
Пример #53
0
    def recover_dirty_replica(self, src_replica_idx, dst_replica_idx,
                              num_active_partitions, txn_per_client,
                              num_clients, interval, timeout):
        """
        A validate function to test offline recovery of a dirty replica.

        A randomly chosen partition is taken offline on the destination replica, recovered
        from the source replica, and then compared against the source replica's local low
        watermark.

        :param src_replica_idx: The index of source replica, where new replica recovers from
        :param dst_replica_idx: The index of destination replica
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval(millisecond) between transactions
        :param timeout: Test timeout
        """
        storage_service = self.waltz_storage
        port = storage_service.port
        admin_port = storage_service.admin_port

        src_hostname = storage_service.nodes[src_replica_idx].account.ssh_hostname
        src_storage = self.get_host(src_hostname, admin_port)
        dst_hostname = storage_service.nodes[dst_replica_idx].account.ssh_hostname
        dst_storage = self.get_host(dst_hostname, admin_port)
        partition = randrange(num_active_partitions)

        # Step 1: Submit transactions to all replicas.
        validate_cmd = self.client_cli.validate_txn_cmd(
            self.log_file_path, num_active_partitions, txn_per_client,
            num_clients, interval)
        self.verifiable_client.start(validate_cmd)

        def source_has_transactions():
            # -1 is handed through as the reference transaction ID.
            return self.is_max_transaction_id_updated(src_storage, port,
                                                      partition, -1)

        wait_until(source_has_transactions, timeout_sec=timeout)

        # Step 2: Mark destination replica offline for reads and writes
        self.storage_set_availability(storage=dst_storage,
                                      partition=partition,
                                      online=False)

        # Step 3: Trigger recovery to update source replicas' low watermark.
        self.trigger_recovery(bounce_node_idx=src_replica_idx)

        def recovery_completed():
            return self.is_triggered_recovery_completed()

        wait_until(recovery_completed, timeout_sec=timeout)
        expected_max_txn_id = self.get_storage_local_low_watermark(
            self.get_host(src_hostname, admin_port), partition)

        # Step 4: Run recovery operation on offline replica.
        # Source replica's partition low watermark will be used as target for recovery.
        self.storage_recover_partition(source_storage=src_storage,
                                       destination_storage=dst_storage,
                                       destination_storage_port=port,
                                       partition=partition,
                                       batch_size=20)

        # Step 5: Check if destination replica catch up with source replica.
        actual_max_txn_id = self.get_storage_max_transaction_id(
            self.get_host(dst_hostname, admin_port), port, partition, True)
        assert expected_max_txn_id == actual_max_txn_id, \
            "partition recovery failed on storage {}, expected max transaction ID = {}, actual max transaction ID = {}" \
            .format(dst_hostname, expected_max_txn_id, actual_max_txn_id)

        # Step 6: Wait until validation complete.
        incomplete_msg = "verifiable_client failed to complete task in %d seconds." % timeout

        def client_finished():
            return self.verifiable_client.task_complete() == True

        wait_until(client_finished,
                   timeout_sec=timeout,
                   err_msg=incomplete_msg)
Пример #54
0
def await_minimum_produced_records(redpanda: RedpandaService,
                                   producer: FranzGoVerifiableProducer,
                                   min_acked: int = 0) -> None:
    """Block until ``producer`` reports more than ``min_acked`` acked records.

    Polls ``producer.produce_status.acked`` through ``wait_until`` with a
    300 second timeout and a 5 second backoff.

    :param redpanda: not read in this function body
    :param producer: producer whose ``produce_status.acked`` counter is polled
    :param min_acked: value the acked counter must strictly exceed
    """
    def acked_past_threshold():
        return producer.produce_status.acked > min_acked

    wait_until(acked_past_threshold, timeout_sec=300, backoff_sec=5)
Пример #55
0
    def test_client_ssl_endpoint_validation_failure(
            self,
            security_protocol,
            interbroker_security_protocol,
            metadata_quorum=quorum.zk):
        """
        Test that an invalid hostname in the certificate results in connection failures.

        When security_protocol=SSL, the clients are expected to report SSLHandshakeException.
        When security_protocol=PLAINTEXT and interbroker_security_protocol=SSL, the clients
        are expected to report LEADER_NOT_AVAILABLE instead.
        With a Raft quorum and security_protocol=PLAINTEXT the clients are expected to keep
        working, so only the regular validation is run.

        :param security_protocol: protocol used by the clients
        :param interbroker_security_protocol: protocol used between brokers
        :param metadata_quorum: not read in this method body; presumably consumed by the helpers
            that build the cluster -- TODO confirm
        """

        # The cluster is first started with valid hostnames in the certs' SANs so that the
        # test topic can be created via the admin client.
        SecurityConfig.ssl_stores = TestSslStores(
            self.test_context.local_scratch_dir, valid_hostname=True)

        self.create_zookeeper_if_necessary()
        if self.zk:
            self.zk.start()

        self.create_kafka(
            security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol)
        if self.kafka.quorum_info.using_raft and interbroker_security_protocol == 'SSL':
            # The controller quorum uses the inter-broker security information by default.
            # Communication with it is tested separately below, so it is switched away
            # from TLS here to keep it out of the way.
            quorum_settings = self.kafka.controller_quorum
            quorum_settings.controller_security_protocol = 'PLAINTEXT'
            quorum_settings.intercontroller_security_protocol = 'PLAINTEXT'
        self.kafka.start()

        # Switch the certs to invalid hostnames; this is the state under test.
        SecurityConfig.ssl_stores.valid_hostname = False
        self.kafka.restart_cluster()

        # The inter-broker security protocol using TLS with a hostname verification failure
        # doesn't impact a producer in case of a single broker with a Raft Controller, so in
        # that combination the failure checks are skipped and only the validation runs.
        clients_unaffected = self.kafka.quorum_info.using_raft and security_protocol == 'PLAINTEXT'
        if not clients_unaffected:
            # More verbose logging is needed to catch the expected errors.
            self.create_and_start_clients(log_level="DEBUG")

            try:
                wait_until(lambda: self.producer.num_acked > 0, timeout_sec=30)
            except TimeoutError:
                # Expected: nothing may be acked while the hostnames are invalid.
                pass
            else:
                # Fail quickly if messages are successfully acked
                raise RuntimeError(
                    "Messages published successfully but should not have!"
                    " Endpoint validation did not fail with invalid hostname")

            if security_protocol == 'SSL':
                error = 'SSLHandshakeException'
            else:
                error = 'LEADER_NOT_AVAILABLE'
            wait_until(
                lambda: self.producer_consumer_have_expected_error(error),
                timeout_sec=30)
            self.producer.stop()
            self.consumer.stop()

            # Restore valid hostnames before the final validation run.
            SecurityConfig.ssl_stores.valid_hostname = True
            self.kafka.restart_cluster()

        self.create_and_start_clients(log_level="INFO")
        self.run_validation()
Пример #56
0
 def wait_for_messages(self, messages, timeout=30):
     """Block until ``self._messages`` holds at least ``messages`` entries.

     :param messages: minimum length of ``self._messages`` to wait for
     :param timeout: passed to ``wait_until`` as its second positional argument
     """
     def enough_collected():
         return len(self._messages) >= messages

     wait_until(enough_collected, timeout, backoff_sec=2)
Пример #57
0
 def stop_node(self, node):
     """Kill the "zookeeper" process on ``node`` and wait up to 5 seconds for it to stop.

     :param node: node whose process is killed
     """
     idx = self.idx(node)
     service_name = type(self).__name__
     hostname = node.account.hostname
     self.logger.info("Stopping %s node %d on %s" % (service_name, idx, hostname))
     node.account.kill_process("zookeeper", allow_fail=False)

     def stopped():
         return not self.alive(node)

     wait_until(stopped, timeout_sec=5, err_msg="Timed out waiting for zookeeper to stop.")
 def wait_for_verification(self, processor, message, file, num_lines=1):
     """Block until ``verify_from_file`` reports at least ``num_lines`` for ``message``.

     :param processor: forwarded to ``verify_from_file``; its ``node.account`` is named in the error
     :param message: text forwarded to ``verify_from_file``
     :param file: file forwarded to ``verify_from_file``
     :param num_lines: minimum value ``verify_from_file`` must return (presumably a count of
         matching lines -- confirm against ``verify_from_file``)
     """
     failure_msg = "Did expect to read '%s' from %s" % (message, processor.node.account)

     def seen_often_enough():
         return self.verify_from_file(processor, message, file) >= num_lines

     wait_until(seen_often_enough, timeout_sec=60, err_msg=failure_msg)
    def stop_node(self, node, clean_shutdown=True):
        """
        Stop the Kafka Connect process(es) on ``node`` and remove the PID file.

        Every PID returned by ``self.pids(node)`` is signalled (failures to signal are
        tolerated). For a clean shutdown each process is then awaited for up to 60 seconds.

        :param node: node on which Kafka Connect is stopped
        :param clean_shutdown: send SIGTERM and wait for the processes to exit when truthy;
            send SIGKILL and do not wait otherwise
        """
        manner = "Cleanly" if clean_shutdown else "Forcibly"
        self.logger.info(manner + " stopping Kafka Connect on " + str(node.account))
        pids = self.pids(node)
        sig = signal.SIGTERM if clean_shutdown else signal.SIGKILL

        for pid in pids:
            node.account.signal(pid, sig, allow_fail=True)
        if clean_shutdown:
            for pid in pids:
                # pid is bound as a default so the condition never depends on the loop variable.
                wait_until(lambda pid=pid: not node.account.alive(pid), timeout_sec=60, err_msg="Kafka Connect process on " + str(node.account) + " took too long to exit")

        node.account.ssh("rm -f " + self.PID_FILE, allow_fail=False)

    def restart(self):
        """Stop and then start Kafka Connect on each node in ``self.nodes``, one node at a time.

        No clean up is done in between; the process is only bounced.
        """
        for worker in self.nodes:
            account_desc = str(worker.account)
            self.logger.info("Restarting Kafka Connect on " + account_desc)
            self.stop_node(worker)
            self.start_node(worker)

    def clean_node(self, node):
        """Force-kill any "connect" process on ``node`` and delete the service's files.

        The removed paths are the fixed config/log/pid/stdout/stderr files, the generated
        connector config files and everything listed in ``self.files``.

        :param node: node to clean
        """
        node.account.kill_process("connect", clean_shutdown=False, allow_fail=True)

        service_files = [
            self.CONFIG_FILE,
            self.LOG4J_CONFIG_FILE,
            self.PID_FILE,
            self.LOG_FILE,
            self.STDOUT_FILE,
            self.STDERR_FILE,
        ]
        doomed = service_files + self.config_filenames() + self.files
        node.account.ssh("rm -rf " + " ".join(doomed), allow_fail=False)

    def config_filenames(self):
        """Return one connector properties path per entry in ``self.connector_config_templates``.

        Paths are ``connect-connector-<index>.properties`` under ``self.PERSISTENT_ROOT``;
        a missing or empty template list yields an empty list.
        """
        def path_for(position):
            filename = "connect-connector-" + str(position) + ".properties"
            return os.path.join(self.PERSISTENT_ROOT, filename)

        templates = self.connector_config_templates or []
        return [path_for(position) for position, _unused in enumerate(templates)]


    def list_connectors(self, node=None):
        """Send a GET for ``/connectors`` through ``self._rest`` and return its result.

        :param node: forwarded to ``_rest`` unchanged
        """
        connectors = self._rest('/connectors', node=node)
        return connectors

    def create_connector(self, config, node=None):
        """POST a new connector to ``/connectors`` through ``self._rest`` and return its result.

        The request body carries ``config['name']`` under 'name' and the whole ``config``
        under 'config'.

        :param config: connector configuration; must contain a 'name' entry
        :param node: forwarded to ``_rest`` unchanged
        """
        payload = dict(name=config['name'], config=config)
        return self._rest('/connectors', payload, node=node, method="POST")

    def get_connector(self, name, node=None):
        """Send a GET for ``/connectors/<name>`` through ``self._rest`` and return its result.

        :param name: connector name appended to the path
        :param node: forwarded to ``_rest`` unchanged
        """
        endpoint = '/connectors/' + name
        return self._rest(endpoint, node=node)

    def get_connector_config(self, name, node=None):
        """Send a GET for ``/connectors/<name>/config`` through ``self._rest`` and return its result.

        :param name: connector name inserted into the path
        :param node: forwarded to ``_rest`` unchanged
        """
        endpoint = '/connectors/' + name + '/config'
        return self._rest(endpoint, node=node)

    def set_connector_config(self, name, config, node=None):
        """PUT ``config`` to ``/connectors/<name>/config`` through ``self._rest`` and return its result.

        :param name: connector name inserted into the path
        :param config: object sent as the request body
        :param node: forwarded to ``_rest`` unchanged
        """
        endpoint = '/connectors/' + name + '/config'
        return self._rest(endpoint, config, node=node, method="PUT")

    def get_connector_tasks(self, name, node=None):
        """Send a GET for ``/connectors/<name>/tasks`` through ``self._rest`` and return its result.

        :param name: connector name inserted into the path
        :param node: forwarded to ``_rest`` unchanged
        """
        endpoint = '/connectors/' + name + '/tasks'
        return self._rest(endpoint, node=node)

    def delete_connector(self, name, node=None):
        """Send a DELETE for ``/connectors/<name>`` through ``self._rest`` and return its result.

        :param name: connector name appended to the path
        :param node: forwarded to ``_rest`` unchanged
        """
        endpoint = '/connectors/' + name
        return self._rest(endpoint, node=node, method="DELETE")

    def _rest(self, path, body=None, node=None, method="GET"):
        """
        Send one HTTP request to the Kafka Connect REST API and return the decoded response.

        :param path: URL path appended to the node's base URL, e.g. '/connectors'
        :param body: optional object sent as the JSON request body
        :param node: node to contact; a random entry of ``self.nodes`` is used when None
        :param method: HTTP method name; its lower-cased form selects the ``requests`` function
        :return: None for a 204 response, otherwise the parsed JSON body
        :raises ConnectRestError: if the response status code is 400 or above
        """
        if node is None:
            node = random.choice(self.nodes)

        meth = getattr(requests, method.lower())
        url = self._base_url(node) + path
        self.logger.debug("Kafka Connect REST request: %s %s %s %s", node.account.hostname, url, method, body)
        resp = meth(url, json=body)
        self.logger.debug("%s %s response: %d", url, method, resp.status_code)
        # Every 4xx/5xx status is an error. The comparison is inclusive so that a plain
        # 400 Bad Request is reported instead of being returned as if it had succeeded.
        if resp.status_code >= 400:
            raise ConnectRestError(resp.status_code, resp.text, resp.url)
        if resp.status_code == 204:
            # 204 carries no body to decode.
            return None
        return resp.json()


    def _base_url(self, node):
        """Return the ``http://<ip>:8083`` base URL for ``node``.

        :param node: node whose ``account.externally_routable_ip`` is used as the host
        """
        host = node.account.externally_routable_ip
        return ':'.join(('http://' + host, '8083'))
Пример #60
0
    def test_standby_tasks_rebalance(self):

        configs = self.get_configs(
            ",sourceTopic=%s,sinkTopic1=%s,sinkTopic2=%s" %
            (self.streams_source_topic, self.streams_sink_topic_1,
             self.streams_sink_topic_2))

        producer = self.get_producer(self.streams_source_topic,
                                     self.num_messages,
                                     throughput=15000,
                                     repeating_keys=6)
        producer.start()

        processor_1 = StreamsStandbyTaskService(self.test_context, self.kafka,
                                                configs)
        processor_2 = StreamsStandbyTaskService(self.test_context, self.kafka,
                                                configs)
        processor_3 = StreamsStandbyTaskService(self.test_context, self.kafka,
                                                configs)

        processor_1.start()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:6 STANDBY_TASKS:0",
                                   processor_1.STDOUT_FILE)

        processor_2.start()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_1.STDOUT_FILE)
        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_2.STDOUT_FILE)

        processor_3.start()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_1.STDOUT_FILE)
        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_3.STDOUT_FILE)

        processor_1.stop()

        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_2.STDOUT_FILE,
                                   num_lines=2)
        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_3.STDOUT_FILE)

        processor_2.stop()

        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:6 STANDBY_TASKS:0",
                                   processor_3.STDOUT_FILE)

        processor_1.start()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_1.STDOUT_FILE)
        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_3.STDOUT_FILE,
                                   num_lines=2)

        processor_2.start()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_1.STDOUT_FILE)
        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_3.STDOUT_FILE,
                                   num_lines=2)

        processor_3.stop()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_1.STDOUT_FILE,
                                   num_lines=2)
        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_2.STDOUT_FILE)

        processor_1.stop()

        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:6 STANDBY_TASKS:0",
                                   processor_2.STDOUT_FILE)

        processor_3.start()

        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_3.STDOUT_FILE)
        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:3 STANDBY_TASKS:3",
                                   processor_2.STDOUT_FILE,
                                   num_lines=2)

        processor_1.start()

        self.wait_for_verification(processor_1,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_1.STDOUT_FILE)
        self.wait_for_verification(processor_3,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_3.STDOUT_FILE)
        self.wait_for_verification(processor_2,
                                   "ACTIVE_TASKS:2 STANDBY_TASKS:2",
                                   processor_2.STDOUT_FILE,
                                   num_lines=2)

        self.assert_consume(
            self.client_id,
            "assert all messages consumed from %s" % self.streams_sink_topic_1,
            self.streams_sink_topic_1, self.num_messages)
        self.assert_consume(
            self.client_id,
            "assert all messages consumed from %s" % self.streams_sink_topic_2,
            self.streams_sink_topic_2, self.num_messages)

        wait_until(lambda: producer.num_acked >= self.num_messages,
                   timeout_sec=60,
                   err_msg="Failed to send all %s messages" %
                   str(self.num_messages))

        producer.stop()