Exemplo n.º 1
0
    def test_fetch_after_committed_offset_was_removed(self,
                                                      transactions_enabled):
        """
        Test fetching when consumer offset was deleted by retention
        """

        self.redpanda._extra_rp_conf[
            "enable_transactions"] = transactions_enabled
        self.redpanda._extra_rp_conf[
            "enable_idempotence"] = transactions_enabled
        self.redpanda.start()

        topic = TopicSpec(partition_count=1,
                          replication_factor=3,
                          cleanup_policy=TopicSpec.CLEANUP_DELETE)
        self.client().create_topic(topic)

        kafka_tools = KafkaCliTools(self.redpanda)

        # produce until segments have been compacted
        produce_until_segments(
            self.redpanda,
            topic=topic.name,
            partition_idx=0,
            count=10,
        )
        consumer_group = 'test'
        rpk = RpkTool(self.redpanda)

        def consume(n=1):

            out = rpk.consume(topic.name, group=consumer_group, n=n)
            split = out.split('}')
            split = filter(lambda s: "{" in s, split)

            return map(lambda s: json.loads(s + "}"), split)

        #consume from the beggining
        msgs = consume(10)
        last = list(msgs).pop()
        offset = last['offset']

        # change retention time
        kafka_tools.alter_topic_config(
            topic.name, {
                TopicSpec.PROPERTY_RETENTION_BYTES: 2 * self.segment_size,
            })

        wait_for_segments_removal(self.redpanda,
                                  topic.name,
                                  partition_idx=0,
                                  count=5)

        partitions = list(rpk.describe_topic(topic.name))
        p = partitions[0]
        assert p.start_offset > offset
        # consume from the offset that doesn't exists,
        # the one that was committed previously was already removed
        out = list(consume(1))
        assert out[0]['offset'] == p.start_offset
Exemplo n.º 2
0
    def test_bootstrapping_after_move(self):
        """
        Move partitions with active consumer / producer
        """
        self.start_redpanda(num_nodes=3)
        spec = TopicSpec(name="topic", partition_count=3, replication_factor=3)
        self.client().create_topic(spec)
        self.topic = spec.name
        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        # execute single move
        self._move_and_verify()
        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)

        # snapshot offsets
        rpk = RpkTool(self.redpanda)
        partitions = rpk.describe_topic(spec.name)
        offset_map = {}
        for p in partitions:
            offset_map[p.id] = p.high_watermark

        # restart all the nodes
        self.redpanda.restart_nodes(self.redpanda.nodes)

        def offsets_are_recovered():

            return all([
                offset_map[p.id] == p.high_watermark
                for p in rpk.describe_topic(spec.name)
            ])

        wait_until(offsets_are_recovered, 30, 2)
Exemplo n.º 3
0
 def _restore_topic(self, topic_spec, overrides={}):
     """Restore individual topic"""
     self.logger.info(f"Restore topic called. Topic-manifest: {topic_spec}")
     conf = {
         'redpanda.remote.recovery': 'true',
         #'redpanda.remote.write': 'true',
     }
     conf.update(overrides)
     self.logger.info(f"Confg: {conf}")
     topic = topic_spec.name
     npart = topic_spec.partition_count
     nrepl = topic_spec.replication_factor
     rpk = RpkTool(self.redpanda)
     rpk.create_topic(topic, npart, nrepl, conf)
     time.sleep(10)
     rpk.describe_topic(topic)
     rpk.describe_topic_configs(topic)
    def _all_have_leaders(self, topic):
        rpk = RpkTool(self.redpanda)
        partitions = rpk.describe_topic(topic)

        for p in partitions:
            self.logger.debug(f"rpk partition: {p}")
            if p.leader is None or p.leader == -1:
                return False
        return True
Exemplo n.º 5
0
    def test_querying_remote_partitions(self):
        topic = TopicSpec(redpanda_remote_read=True,
                          redpanda_remote_write=True)
        epoch_offsets = {}
        rpk = RpkTool(self.redpanda)
        self.client().create_topic(topic)
        rpk.alter_topic_config(topic.name, "redpanda.remote.read", 'true')
        rpk.alter_topic_config(topic.name, "redpanda.remote.write", 'true')

        def wait_for_topic():
            wait_until(lambda: len(list(rpk.describe_topic(topic.name))) > 0,
                       30,
                       backoff_sec=2)

        # restart whole cluster 6 times to trigger term rolls
        for i in range(0, 6):
            wait_for_topic()
            produce_until_segments(
                redpanda=self.redpanda,
                topic=topic.name,
                partition_idx=0,
                count=2 * i,
            )
            res = list(rpk.describe_topic(topic.name))
            epoch_offsets[res[0].leader_epoch] = res[0].high_watermark
            self.redpanda.restart_nodes(self.redpanda.nodes)

        self.logger.info(f"ledear epoch high watermarks: {epoch_offsets}")

        wait_for_topic()

        rpk.alter_topic_config(topic.name, TopicSpec.PROPERTY_RETENTION_BYTES,
                               OffsetForLeaderEpochArchivalTest.segment_size)
        wait_for_segments_removal(redpanda=self.redpanda,
                                  topic=topic.name,
                                  partition_idx=0,
                                  count=7)
        kcl = KCL(self.redpanda)

        for epoch, offset in epoch_offsets.items():
            self.logger.info(f"querying partition epoch {epoch} end offsets")
            epoch_end_offset = kcl.offset_for_leader_epoch(
                topics=topic.name, leader_epoch=epoch)[0].epoch_end_offset
            self.logger.info(
                f"epoch {epoch} end_offset: {epoch_end_offset}, expected offset: {offset}"
            )
            assert epoch_end_offset == offset
Exemplo n.º 6
0
class ManyPartitionsTest(PreallocNodesTest):
    """
    Validates basic functionality in the presence of larger numbers
    of partitions than most other tests.
    """
    topics = ()

    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        super(ManyPartitionsTest, self).__init__(
            test_ctx,
            *args,
            num_brokers=6,
            node_prealloc_count=1,
            extra_rp_conf={
                # Disable leader balancer initially, to enable us to check for
                # stable leadership during initial elections and post-restart
                # elections.  We will switch it on later, to exercise it during
                # the traffic stress test.
                'enable_leader_balancer': False,
            },
            # Usually tests run with debug or trace logs, but when testing resource
            # limits we want to test in a more production-like configuration.
            log_level='info',
            **kwargs)
        self.rpk = RpkTool(self.redpanda)

    def _all_elections_done(self, topic_names: list[str], p_per_topic: int):
        any_incomplete = False
        for tn in topic_names:
            partitions = list(self.rpk.describe_topic(tn))
            if len(partitions) < p_per_topic:
                self.logger.info(f"describe omits partitions for topic {tn}")
                any_incomplete = True
                continue

            assert len(partitions) == p_per_topic
            for p in partitions:
                if p.leader == -1:
                    self.logger.info(
                        f"partition {tn}/{p.id} has no leader yet")
                    any_incomplete = True

        return not any_incomplete

    def _consume_all(self, topic_names: list[str], msg_count_per_topic: int,
                     timeout_per_topic: int):
        """
        Don't do anything with the messages, just consume them to demonstrate
        that doing so does not exhaust redpanda resources.
        """
        def consumer_saw_msgs(consumer):
            self.logger.info(
                f"Consumer message_count={consumer.message_count} / {msg_count_per_topic}"
            )
            # Tolerate greater-than, because if there were errors during production
            # there can have been retries.
            return consumer.message_count >= msg_count_per_topic

        for tn in topic_names:
            consumer = RpkConsumer(self._ctx,
                                   self.redpanda,
                                   tn,
                                   save_msgs=False,
                                   fetch_max_bytes=BIG_FETCH,
                                   num_msgs=msg_count_per_topic)
            consumer.start()
            wait_until(lambda: consumer_saw_msgs(consumer),
                       timeout_sec=timeout_per_topic,
                       backoff_sec=5)
            consumer.stop()
            consumer.free()

    def setUp(self):
        # defer redpanda startup to the test, it might want to tweak
        # ResourceSettings based on its parameters.
        pass

    @cluster(num_nodes=7, log_allow_list=RESTART_LOG_ALLOW_LIST)
    def test_many_partitions(self):
        """
        Validate that redpanda works with partition counts close to its resource
        limits.

        This test should evolve over time as we improve efficiency and can reliably
        run with higher partition counts.  It should roughly track the values we
        use for topic_memory_per_partition and topic_fds_per_partition.

        * Check topic can be created.
        * Check leadership election succeeds for all partitions.
        * Write in enough data such that an unlimited size fetch
          would exhaust ram (check enforcement of kafka_max_bytes_per_fetch).
        * Consume all the data from the topic

        * Restart nodes several times (check that recovery works, and that the additional
          log segments created by rolling segments on restart do not cause us
          to exhaust resources.

        * Run a general produce+consume workload to check that the system remains in
          a functional state.
        """

        # This test requires dedicated system resources to run reliably.
        #assert self.redpanda.dedicated_nodes

        # Scale tests are not run on debug builds
        assert not self.debug_mode

        replication_factor = 3
        node_count = len(self.redpanda.nodes)

        # If we run on nodes with more memory than our HARD_PARTITION_LIMIT, then
        # artificially throttle the nodes' memory to avoid the test being too easy.
        # We are validating that the system works up to the limit, and that it works
        # up to the limit within the default per-partition memory footprint.
        node_memory = self.redpanda.get_node_memory_mb()

        # HARD_PARTITION_LIMIT is for a 3 node cluster, adjust according to
        # the number of nodes in this cluster.
        partition_limit = HARD_PARTITION_LIMIT * (node_count / 3)

        mb_per_partition = 1

        # How much memory to reserve for internal partitions, such as
        # id_allocator.  This is intentionally higher than needed, to
        # avoid having to update this test each time a new internal topic
        # is added.
        internal_partition_slack = 10

        # Emulate seastar's policy for default reserved memory
        reserved_memory = max(1536, int(0.07 * node_memory) + 1)
        effective_node_memory = node_memory - reserved_memory

        # TODO: calculate an appropriate segment size for the disk space divided
        # by the partition count, then set an appropriate retention.bytes and
        # enable compaction, so that during the final stress period of the test,
        # we are exercising compaction.

        # Clamp memory if nodes have more memory than should be required
        # to exercise the partition limit.
        if effective_node_memory > HARD_PARTITION_LIMIT / mb_per_partition:
            clamp_memory = mb_per_partition * (
                (HARD_PARTITION_LIMIT + internal_partition_slack) +
                reserved_memory)

            # Handy if hacking HARD_PARTITION_LIMIT to something low to run on a workstation
            clamp_memory = max(clamp_memory, 500)

            resource_settings = ResourceSettings(memory_mb=clamp_memory)
            self.redpanda.set_resource_settings(resource_settings)
        elif effective_node_memory < HARD_PARTITION_LIMIT / mb_per_partition:
            raise RuntimeError(
                f"Node memory is too small ({node_memory}MB - {reserved_memory}MB)"
            )

        # Run with one huge topic: this is the more stressful case for Redpanda, compared
        # with multiple modestly-sized topics, so it's what we test to find the system's limits.
        n_topics = 1

        # Partitions per topic
        n_partitions = int(partition_limit / n_topics)

        self.logger.info(
            f"Running partition scale test with {n_partitions} partitions on {n_topics} topics"
        )

        self.redpanda.start()

        self.logger.info("Entering topic creation")
        topic_names = [f"scale_{i:06d}" for i in range(0, n_topics)]
        for tn in topic_names:
            self.logger.info(
                f"Creating topic {tn} with {n_partitions} partitions")
            self.rpk.create_topic(tn,
                                  partitions=n_partitions,
                                  replicas=replication_factor)

        self.logger.info(f"Awaiting elections...")
        wait_until(lambda: self._all_elections_done(topic_names, n_partitions),
                   timeout_sec=60,
                   backoff_sec=5)
        self.logger.info(f"Initial elections done.")

        for node in self.redpanda.nodes:
            files = self.redpanda.lsof_node(node)
            file_count = sum(1 for _ in files)
            self.logger.info(
                f"Open files after initial selection on {node.name}: {file_count}"
            )

        # Assume fetches will be 10MB, the franz-go default
        fetch_mb_per_partition = 10 * 1024 * 1024

        # * Need enough data that if a consumer tried to fetch it all at once
        # in a single request, it would run out of memory.  OR the amount of
        # data that would fill a 10MB max_bytes per partition in a fetch, whichever
        # is lower (avoid writing excessive data for tests with fewer partitions).
        # * Then apply a factor of two to make sure we have enough data to drive writes
        # to disk during consumption, not just enough data to hold it all in the batch
        # cache.
        write_bytes_per_topic = min(
            int((self.redpanda.get_node_memory_mb() * 1024 * 1024) / n_topics),
            fetch_mb_per_partition * n_partitions) * 2

        if self.scale.release:
            # Release tests can be much longer running: 10x the amount of
            # data we fire through the system
            write_bytes_per_topic *= 10

        msg_size = 128 * 1024
        msg_count_per_topic = int((write_bytes_per_topic / msg_size))

        # Approx time to write or read all messages, for timeouts
        # Pessimistic bandwidth guess, accounting for the sub-disk bandwidth
        # that a single-threaded consumer may see
        expect_bandwidth = 50 * 1024 * 1024

        expect_transmit_time = int(write_bytes_per_topic / expect_bandwidth)
        expect_transmit_time = max(expect_transmit_time, 30)

        self.logger.info("Entering initial produce")
        for tn in topic_names:
            t1 = time.time()
            producer = FranzGoVerifiableProducer(
                self.test_context,
                self.redpanda,
                tn,
                msg_size,
                msg_count_per_topic,
                custom_node=self.preallocated_nodes)
            producer.start()
            producer.wait(timeout_sec=expect_transmit_time)
            self.free_preallocated_nodes()
            duration = time.time() - t1
            self.logger.info(
                f"Wrote {write_bytes_per_topic} bytes to {tn} in {duration}s, bandwidth {(write_bytes_per_topic / duration)/(1024 * 1024)}MB/s"
            )

        def get_fd_counts():
            counts = {}
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=node_count) as executor:
                futs = {}
                for node in self.redpanda.nodes:
                    futs[node.name] = executor.submit(
                        lambda: sum(1 for _ in self.redpanda.lsof_node(node)))

                for node_name, fut in futs.items():
                    file_count = fut.result()
                    counts[node_name] = file_count

            return counts

        for node_name, file_count in get_fd_counts().items():
            self.logger.info(
                f"Open files before restarts on {node_name}: {file_count}")

        # Over large partition counts, the startup time is linear with the
        # amount of data we played in, because no one partition gets far
        # enough to snapshot.
        expect_start_time = expect_transmit_time

        # Measure the impact of restarts on resource utilization on an idle system:
        # at time of writing we know that the used FD count will go up substantially
        # on each restart (https://github.com/redpanda-data/redpanda/issues/4057)
        restart_count = 2

        self.logger.info("Entering restart stress test")
        for i in range(1, restart_count + 1):
            self.logger.info(f"Cluster restart {i}/{restart_count}...")

            # Normal restarts are rolling restarts, but because replay takes substantial time,
            # on an idle system it is helpful to do a concurrent global restart rather than
            # waiting for each node one by one.
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=node_count) as executor:
                futs = []
                for node in self.redpanda.nodes:
                    futs.append(
                        executor.submit(self.redpanda.restart_nodes,
                                        nodes=[node],
                                        start_timeout=expect_start_time))

                for f in futs:
                    # Raise on error
                    f.result()
            self.logger.info(
                f"Restart {i}/{restart_count} complete.  Waiting for elections..."
            )

            wait_until(
                lambda: self._all_elections_done(topic_names, n_partitions),
                timeout_sec=60,
                backoff_sec=5)
            self.logger.info(f"Post-restart elections done.")

            for node_name, file_count in get_fd_counts().items():
                self.logger.info(
                    f"Open files after {i} restarts on {node_name}: {file_count}"
                )

        # With increased overhead from all those segment rolls during restart,
        # check that consume still works.
        self._consume_all(topic_names, msg_count_per_topic,
                          expect_transmit_time)

        # Now that we've tested basic ability to form consensus and survive some
        # restarts, move on to a more general stress test.

        self.logger.info("Entering traffic stress test")
        target_topic = topic_names[0]

        stress_msg_size = 32768
        stress_data_size = 1024 * 1024 * 1024 * 100
        stress_msg_count = int(stress_data_size / stress_msg_size)
        fast_producer = FranzGoVerifiableProducer(
            self.test_context,
            self.redpanda,
            target_topic,
            stress_msg_size,
            stress_msg_count,
            custom_node=self.preallocated_nodes)
        fast_producer.start()

        # Don't start consumers until the producer has written out its first
        # checkpoint with valid ranges.
        wait_until(lambda: fast_producer.produce_status.acked > 0,
                   timeout_sec=30,
                   backoff_sec=1.0)

        rand_consumer = FranzGoVerifiableRandomConsumer(
            self.test_context,
            self.redpanda,
            target_topic,
            0,
            100,
            10,
            nodes=self.preallocated_nodes)
        rand_consumer.start(clean=False)
        rand_consumer.shutdown()
        rand_consumer.wait()

        fast_producer.wait()

        seq_consumer = FranzGoVerifiableSeqConsumer(self.test_context,
                                                    self.redpanda,
                                                    target_topic, 0,
                                                    self.preallocated_nodes)
        seq_consumer.start(clean=False)
        seq_consumer.shutdown()
        seq_consumer.wait()
        assert seq_consumer.consumer_status.invalid_reads == 0
        assert seq_consumer.consumer_status.valid_reads == stress_msg_count + msg_count_per_topic

        self.logger.info("Entering leader balancer stress test")

        # Enable the leader balancer and check that the system remains stable
        # under load.  We do not leave the leader balancer on for most of the test, because
        # it makes reads _much_ slower, because the consumer keeps stalling and waiting for
        # elections: at any moment in a 10k partition topic, it's highly likely at least
        # one partition is offline for a leadership migration.
        self.redpanda.set_cluster_config({'enable_leader_balancer': True},
                                         expect_restart=False)
        lb_stress_period = 120
        lb_stress_produce_bytes = expect_bandwidth * lb_stress_period
        lb_stress_message_count = int(lb_stress_produce_bytes /
                                      stress_msg_size)
        fast_producer = FranzGoVerifiableProducer(
            self.test_context,
            self.redpanda,
            target_topic,
            stress_msg_size,
            lb_stress_message_count,
            custom_node=self.preallocated_nodes)
        fast_producer.start()
        rand_consumer.start()
        time.sleep(lb_stress_period
                   )  # Let the system receive traffic for a set time period
        rand_consumer.shutdown()
        rand_consumer.wait()
        fast_producer.wait()
    def test_offset_for_leader_epoch(self):
        replication_factors = [1, 3, 5]
        cleanup_policies = [
            TopicSpec.CLEANUP_COMPACT, TopicSpec.CLEANUP_DELETE
        ]
        topics = []

        for i in range(0, 10):
            topics.append(
                TopicSpec(
                    partition_count=random.randint(1, 50),
                    replication_factor=random.choice(replication_factors),
                    cleanup_policy=random.choice(cleanup_policies)))

        topic_names = [t.name for t in topics]
        # create test topics
        self.client().create_topic(topics)
        kcl = KCL(self.redpanda)
        for t in topics:
            self._produce(t.name, 20)

        def list_offsets_map():
            offsets_map = {}
            offsets = kcl.list_offsets(topic_names)
            self.logger.info(f"offsets_list: {offsets}")
            for p in offsets:
                offsets_map[(p.topic, p.partition)] = int(p.end_offset)
            self.logger.info(f"offsets_map: {offsets_map}")
            return offsets_map

        initial_offsets = list_offsets_map()

        leader_epoch_offsets = kcl.offset_for_leader_epoch(topics=topic_names,
                                                           leader_epoch=1)

        for o in leader_epoch_offsets:
            assert initial_offsets[(o.topic,
                                    o.partition)] == o.epoch_end_offset

        # restart all the nodes to force leader election,
        # increase start timeout as partition count may get large
        self.redpanda.restart_nodes(self.redpanda.nodes, start_timeout=30)
        # produce more data
        for t in topics:
            self._produce(t.name, 20)

        # check epoch end offsets for term 1
        leader_epoch_offsets = kcl.offset_for_leader_epoch(topics=topic_names,
                                                           leader_epoch=1)

        for o in leader_epoch_offsets:
            assert initial_offsets[(o.topic,
                                    o.partition)] == o.epoch_end_offset

        last_offsets = list_offsets_map()
        rpk = RpkTool(self.redpanda)
        for t in topics:
            tp_desc = rpk.describe_topic(t.name)
            for p in tp_desc:
                for o in kcl.offset_for_leader_epoch(
                        topics=f"{t.name}:{p.id}",
                        leader_epoch=p.leader_epoch,
                        current_leader_epoch=p.leader_epoch):
                    assert last_offsets[(o.topic,
                                         o.partition)] == o.epoch_end_offset

        # test returning unknown leader epoch error, we use large leader epoch value

        leader_epoch_offsets = kcl.offset_for_leader_epoch(
            topics=topic_names, leader_epoch=1, current_leader_epoch=1000)

        for o in leader_epoch_offsets:
            assert o.error is not None and "UNKNOWN_LEADER_EPOCH" in o.error