Example #1
    def stress_test(self):
        for i in range(10):
            spec = TopicSpec(partition_count=2,
                             cleanup_policy=TopicSpec.CLEANUP_COMPACT)
            topic_name = spec.name
            self.client().create_topic(spec)

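            # Drive write traffic at the compacted topic; the positional
            # arguments after the topic name are the message size and count.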
            producer = RpkProducer(self.test_context, self.redpanda,
                                   topic_name, 1024, 100000)
            producer.start()

            metrics = [
                MetricCheck(self.logger, self.redpanda, n,
                            'vectorized_storage_log_compacted_segment_total',
                            {}, sum) for n in self.redpanda.nodes
            ]

            def check_compaction():
                return all([
                    m.evaluate([
                        ('vectorized_storage_log_compacted_segment_total',
                         lambda a, b: b > 3)
                    ]) for m in metrics
                ])

            wait_until(check_compaction,
                       timeout_sec=120,
                       backoff_sec=5,
                       err_msg="Segments were not compacted")

            self.client().delete_topic(topic_name)

            try:
                producer.stop()
            except Exception:
                # Ignore exceptions from rpk on stop: the topic was already
                # deleted, so the producer is expected to error out here.
                pass
            producer.free()

            def topic_storage_purged():
                storage = self.redpanda.storage()
                return all(
                    map(lambda n: topic_name not in n.ns["kafka"].topics,
                        storage.nodes))

            try:
                wait_until(topic_storage_purged,
                           timeout_sec=60,
                           backoff_sec=2,
                           err_msg="Topic storage was not removed")

            except:
                # On errors, dump listing of the storage location
                for node in self.redpanda.nodes:
                    self.logger.error(f"Storage listing on {node.name}:")
                    for line in node.account.ssh_capture(
                            f"find {self.redpanda.DATA_DIR}"):
                        self.logger.error(line.strip())

                raise
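The check_compaction/MetricCheck pattern above boils down to: snapshot a counter, then poll until a predicate over the old and new values holds. Below is a minimal, self-contained sketch of that idea, assuming each node exposes Prometheus-format metrics on the admin port (9644 by default); scrape_counter and wait_for_counter are illustrative helpers, not the real MetricCheck API.

import time

import requests


def scrape_counter(host, name, port=9644):
    # Sum every sample of the counter `name` (one line per shard/label set)
    # from a Prometheus-format /metrics page.
    text = requests.get(f"http://{host}:{port}/metrics", timeout=10).text
    total = 0.0
    for line in text.splitlines():
        if line.startswith(name):
            total += float(line.rsplit(" ", 1)[1])
    return total


def wait_for_counter(host, name, predicate, timeout_sec=120, backoff_sec=5):
    # Poll until predicate(initial_value, current_value) is satisfied,
    # mirroring the wait_until + MetricCheck.evaluate combination above.
    initial = scrape_counter(host, name)
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if predicate(initial, scrape_counter(host, name)):
            return True
        time.sleep(backoff_sec)
    raise TimeoutError(f"{name} did not satisfy the predicate in time")

With this, wait_for_counter(host, 'vectorized_storage_log_compacted_segment_total', lambda a, b: b > 3) expresses the same per-node condition as check_compaction above.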
Example #2
    def test_leader_transfers_recovery(self, acks):
        """
        Validate that leadership transfers complete successfully
        under acks=1 writes that prompt the leader to frequently
        activate recovery_stm.

        When acks=1, this is a reproducer for
        https://github.com/vectorizedio/redpanda/issues/2580

        When acks=-1, this is a reproducer for
        https://github.com/vectorizedio/redpanda/issues/2606
        """

        leader_node_id, replicas = self._wait_for_leader()

        if acks == -1:
            producer = RpkProducer(self._ctx,
                                   self.redpanda,
                                   self.topic,
                                   16384,
                                   sys.maxsize,
                                   acks=acks)
        else:
            # To reproduce acks=1 issue, we need an intermittent producer that
            # waits long enough between messages to let recovery_stm go to sleep
            # waiting for follower_state_change

            # KafProducer is intermittent because it starts a fresh process for
            # each message, whereas RpkProducer writes a continuous stream.
            # TODO: create a test traffic generator that has inter-message
            # delay as an explicit parameter, rather than relying on implementation
            # details of the producer helpers.
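            # (A rough sketch of such a generator follows this example.)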
            producer = KafProducer(self._ctx, self.redpanda, self.topic)

        producer.start()

        # Pass leadership around in a ring
        self.logger.info(f"Initial leader of {self.topic} is {leader_node_id}")

        transfer_count = 50

        # FIXME: with a transfer count >100, we tend to see
        # reactor stalls and corresponding nondeterministic behaviour/failures.
        # This appears unrelated to the functionality under test; something else
        # is tripping up the cluster when we have so many leadership transfers.
        # https://github.com/vectorizedio/redpanda/issues/2623

        admin = Admin(self.redpanda)

        initial_leader_id = leader_node_id
        for n in range(transfer_count):
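            # Walk the target around the cluster in a ring; node ids are
            # assumed to be 1-based, hence the +1 below.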
            target_idx = (initial_leader_id + n) % len(self.redpanda.nodes)
            target_node_id = target_idx + 1

            self.logger.info(f"Starting transfer to {target_node_id}")
            admin.partition_transfer_leadership("kafka", self.topic, 0,
                                                target_node_id)

            self._wait_for_leader(
                lambda l: l is not None and l == target_node_id,
                timeout=ELECTION_TIMEOUT * 2)
            self.logger.info(f"Completed transfer to {target_node_id}")

        self.logger.info(f"Completed {transfer_count} transfers successfully")

        # Explicit stop of producer so that we see any errors
        producer.stop()
        producer.wait()
        producer.free()
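The TODO in the KafProducer branch asks for a traffic generator with an explicit inter-message delay rather than one that is intermittent only as a side effect of its implementation. Here is a rough sketch of what that could look like, assuming kafka-python is available on the test host; the function name, parameters, and the use of redpanda.brokers() as a bootstrap string are illustrative assumptions, not an existing helper.

import time

from kafka import KafkaProducer  # assumption: kafka-python client


def produce_with_delay(brokers, topic, msg_size, count, delay_sec, acks=1):
    # Send `count` fixed-size messages, sleeping `delay_sec` between sends so
    # the leader's recovery_stm has time to go idle between writes.
    producer = KafkaProducer(bootstrap_servers=brokers, acks=acks)
    payload = b"x" * msg_size
    for _ in range(count):
        producer.send(topic, payload)
        producer.flush()  # make each write visible before the pause
        time.sleep(delay_sec)
    producer.close()

In the test above, something like produce_with_delay(self.redpanda.brokers(), self.topic, 16384, 1000, delay_sec=2.0) could stand in for the KafProducer branch, with the pause chosen long enough for recovery_stm to park on follower_state_change between messages.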