Example #1
 def _get_partition_leaders(self):
     kcat = KafkaCat(self.redpanda)
     m = kcat.metadata()
     self.logger.info(f"kcat.metadata() == {m}")
     brokers = {}
     for b in m['brokers']:
         id = b['id']
         ip = b['name']
         ip = ip[:ip.index(':')]
         for n in self.redpanda.nodes:
             n_ip = n.account.hostname
             self.logger.debug(f"matching {n_ip} over {ip}")
             if n_ip == ip:
                 brokers[id] = n
                 break
     self.logger.debug(f"found brokers {brokers}")
     assert len(brokers) == 3
     leaders = {}
     for topic in m['topics']:
         if topic['topic'] == ArchivalTest.s3_topic_name:
             for part in topic['partitions']:
                 leader_id = part['leader']
                 partition_id = part['partition']
                 leader = brokers[leader_id]
                 leaders[partition_id] = leader
     return leaders
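
kcat reports each broker's `name` as `host:port`, which is why the snippet above strips everything after the colon before matching against node hostnames. The same parse in isolation, on a made-up metadata entry:

    # Hypothetical broker entry, shaped like kcat.metadata()['brokers'] items.
    b = {"id": 1, "name": "10.10.0.5:9092"}
    host = b["name"][:b["name"].index(":")]
    assert host == "10.10.0.5"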
Example #2
 def controller(self):
     kc = KafkaCat(self)
     cid = kc.metadata()["controllerid"]
     self.logger.debug("Controller reported with id: {}".format(cid))
     if cid != -1:
         node = self.get_node(cid)
         self.logger.debug("Controller node found: {}".format(node))
         return node
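
`controller()` implicitly returns None while the cluster reports no controller (controllerid == -1), so callers typically poll. A minimal sketch, assuming ducktape's wait_until helper and a `redpanda` service object exposing the method above:

    from ducktape.utils.util import wait_until

    def wait_for_controller(redpanda, timeout_sec=30):
        # Poll controller() until a node is returned; it yields None
        # while kcat reports controllerid == -1.
        result = {}

        def has_controller():
            result['node'] = redpanda.controller()
            return result['node'] is not None

        wait_until(has_controller, timeout_sec=timeout_sec, backoff_sec=1)
        return result['node']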
Example #3
 def registered(self, node):
     idx = self.idx(node)
     self.logger.debug("Checking if broker %d/%s is registered", idx, node)
     kc = KafkaCat(self)
     brokers = kc.metadata()["brokers"]
     brokers = {b["id"]: b for b in brokers}
     broker = brokers.get(idx, None)
     self.logger.debug("Found broker info: %s", broker)
     return broker is not None
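
A typical caller of `registered`, sketched with ducktape's wait_until; `redpanda` here stands for whatever service instance owns the method above:

    from ducktape.utils.util import wait_until

    # Block until every node appears in kcat metadata before proceeding.
    for node in redpanda.nodes:
        wait_until(lambda n=node: redpanda.registered(n),
                   timeout_sec=30,
                   backoff_sec=1,
                   err_msg="broker never appeared in metadata")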
Example #4
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = create_topic_names(1)[0]
        data = '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA==", "partition": 0},
                {"value": "cGFuZGFwcm94eQ==", "partition": 1},
                {"value": "bXVsdGlicm9rZXI=", "partition": 2}
            ]
        }'''

        self.logger.info(f"Producing to non-existant topic: {name}")
        produce_result_raw = self._produce_topic(name, data)
        assert produce_result_raw.status_code == requests.codes.ok
        produce_result = produce_result_raw.json()
        for o in produce_result["offsets"]:
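            # Kafka error code 3 == UNKNOWN_TOPIC_OR_PARTITION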
            assert o["error_code"] == 3
            assert o["offset"] == -1

        self.logger.info(f"Creating test topic: {name}")
        self._create_topics([name], partitions=3)

        self.logger.info(f"Producing to topic: {name}")
        produce_result_raw = self._produce_topic(name, data)
        assert produce_result_raw.status_code == requests.codes.ok
        assert produce_result_raw.headers[
            "Content-Type"] == "application/vnd.kafka.v2+json"

        produce_result = produce_result_raw.json()
        for o in produce_result["offsets"]:
            assert o["offset"] == 0, f'error_code {o["error_code"]}'

        self.logger.info(f"Consuming from topic: {name}")
        kc = KafkaCat(self.redpanda)
        assert kc.consume_one(name, 0, 0)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 0)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 0)["payload"] == "multibroker"

        self.logger.info(f"Producing to topic without partition: {name}")
        produce_result_raw = self._produce_topic(
            name, '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA=="},
                {"value": "cGFuZGFwcm94eQ=="},
                {"value": "bXVsdGlicm9rZXI="}
            ]
        }''')

        assert produce_result_raw.status_code == requests.codes.ok
        produce_result = produce_result_raw.json()
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'
Example #5
 def registered(self, node):
     idx = self.idx(node)
     self.logger.debug(
         f"Checking if broker {idx} ({node.name}) is registered")
     kc = KafkaCat(self)
     brokers = kc.metadata()["brokers"]
     brokers = {b["id"]: b for b in brokers}
     broker = brokers.get(idx, None)
     self.logger.debug(f"Found broker info: {broker}")
     return broker is not None
Example #6
 def _registered(self, service, node):
     idx = service.idx(node)
     service.logger.debug("Checking if broker %d/%s is registered", idx,
                          node)
     kc = KafkaCat(RedpandaMuServiceServiceProxy(service, self))
     brokers = kc.metadata()["brokers"]
     brokers = {b["id"]: b for b in brokers}
     broker = brokers.get(idx, None)
     service.logger.debug("Found broker info: %s", broker)
     return broker is not None
Example #7
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = "pandaproxy-topic-{}".format(uuid.uuid4())
        self.logger.debug("Topic name %s", name)

        prev = set(self._get_topics())
        self.logger.debug("Existing topics %s", prev)
        assert name not in prev

        data = '{"records": [{"value": "dmVjdG9yaXplZA==", "partition": 0},{"value": "cGFuZGFwcm94eQ==", "partition": 1},{"value": "bXVsdGlicm9rZXI=", "partition": 2}]}'

        self.logger.debug("Producing to non-existant topic")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        kc = KafkaCat(self.redpanda)

        self.logger.debug("Creating test topic")
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_tools.create_topic(
            TopicSpec(name=name, replication_factor=1, partition_count=3))

        self.logger.debug("Waiting for leaders to settle")
        has_leaders = False
        while not has_leaders:
            topics = kc.metadata()["topics"]
            maybe_leaders = True
            for t in topics:
                if t["topic"] == name:
                    for p in t["partitions"]:
                        if p["leader"] == -1:
                            maybe_leaders = False
            has_leaders = maybe_leaders
        # TODO:
        #  Despite the above test, Pandaproxy can still get back no leaders
        #  Query Pandaproxy metadata to see when leaders have settled
        #  The retry logic for produce should have sufficient time for this
        #  additional settle time.

        self.logger.debug("Producing to topic")
        produce_result = self._produce_topic(name, data)
        self.logger.debug("Producing to topic: %s", produce_result)
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'

        self.logger.debug(f"Consuming topic: {name}")
        assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
Example #8
            def done():
                kcat = KafkaCat(self.redpanda)
                ts = 1638748800  # 2021-12-06 UTC - old timestamp, query first offset
                offset = kcat.query_offset(self.topic, 0, ts)
                # assert that offset is valid
                assert offset >= 0

                # segments_count() may yield lazily; materialize it once and
                # check that every partition holds at most 5 segments.
                topic_partitions = list(
                    segments_count(self.redpanda, self.topic, 0))
                return all(p <= 5 for p in topic_partitions)
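
The hard-coded timestamp above corresponds to 2021-12-06 00:00:00 UTC; a sketch of deriving it instead of inlining the magic number:

    import calendar
    import time

    # Epoch seconds for 2021-12-06 00:00:00 UTC.
    ts = calendar.timegm(time.strptime("2021-12-06", "%Y-%m-%d"))
    assert ts == 1638748800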
Example #9
 def _wait_for_topic(self, name):
     kc = KafkaCat(self.redpanda)
     has_leaders = False
     while not has_leaders:
         topics = kc.metadata()["topics"]
         maybe_leaders = True
         for t in topics:
             if t["topic"] == name:
                 for p in t["partitions"]:
                     if p["leader"] == -1:
                         maybe_leaders = False
         has_leaders = maybe_leaders
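
The loop above spins forever, with no backoff, if the topic never elects leaders. A bounded variant of the same wait, sketched with ducktape's wait_until:

    from ducktape.utils.util import wait_until

    def _wait_for_topic(self, name, timeout_sec=30):
        kc = KafkaCat(self.redpanda)

        def all_partitions_have_leaders():
            for t in kc.metadata()["topics"]:
                if t["topic"] == name:
                    return all(p["leader"] != -1 for p in t["partitions"])
            return False  # topic not visible in metadata yet

        wait_until(all_partitions_have_leaders,
                   timeout_sec=timeout_sec,
                   backoff_sec=1)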
Example #10
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = create_topic_names(1)[0]
        data = '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA==", "partition": 0},
                {"value": "cGFuZGFwcm94eQ==", "partition": 1},
                {"value": "bXVsdGlicm9rZXI=", "partition": 2}
            ]
        }'''

        self.logger.info(f"Producing to non-existant topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        kc = KafkaCat(self.redpanda)

        self.logger.info(f"Creating test topic: {name}")
        self._create_topics([name], partitions=3)

        self.logger.debug("Waiting for leaders to settle")
        has_leaders = False
        while not has_leaders:
            topics = kc.metadata()["topics"]
            maybe_leaders = True
            for t in topics:
                if t["topic"] == name:
                    for p in t["partitions"]:
                        if p["leader"] == -1:
                            maybe_leaders = False
            has_leaders = maybe_leaders
        # TODO:
        #  Despite the above test, Pandaproxy can still get back no leaders
        #  Query Pandaproxy metadata to see when leaders have settled
        #  The retry logic for produce should have sufficient time for this
        #  additional settle time.

        self.logger.info(f"Producing to topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'

        self.logger.info(f"Consuming from topic: {name}")
        assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
Example #11
    def _ping_pong(self):
        kc = KafkaCat(self.redpanda)
        rpk = RpkTool(self.redpanda)

        payload = str(random.randint(0, 1000))
        start = time.time()
        offset = rpk.produce(self.topic, "tkey", payload, timeout=5)
        consumed = kc.consume_one(self.topic, 0, offset)
        latency = time.time() - start
        self.logger.info(
            f"_ping_pong produced '{payload}' consumed '{consumed}' in {(latency)*1000.0:.2f} ms"
        )
        if consumed['payload'] != payload:
            raise RuntimeError(f"expected '{payload}' got '{consumed}'")
Example #12
    def __init__(self, test_context):
        extra_rp_conf = dict(
            log_segment_size=1048576,
            retention_bytes=3145728,
            log_compaction_interval_ms=1000,
            enable_leader_balancer=False,
        )

        super(PrefixTruncateRecoveryTest,
              self).__init__(test_context=test_context,
                             num_brokers=3,
                             extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.kafka_cat = KafkaCat(self.redpanda)
Example #13
    def test_adding_nodes_to_cluster(self):
        self.redpanda = RedpandaService(
            self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1})
        # start single node cluster
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])
        # create some topics
        topics = []
        # include __consumer_offsets topic replica
        total_replicas = 1
        for partition_count in range(1, 5):
            name = f"topic{len(topics)}"
            spec = TopicSpec(name=name,
                             partition_count=partition_count,
                             replication_factor=1)
            total_replicas += partition_count
            topics.append(spec)

        for spec in topics:
            DefaultClient(self.redpanda).create_topic(spec)
            self.topic = spec.name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        # add second node
        self.redpanda.start_node(self.redpanda.nodes[1])
        kafkacat = KafkaCat(self.redpanda)

        def _replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def partitions_rebalanced():
            per_node = _replicas_per_node()
            self.redpanda.logger.info(f"replicas per node: {per_node}")
            if len(per_node) < len(self.redpanda.started_nodes()):
                return False

            replicas = sum(per_node.values())
            if replicas != total_replicas:
                return False

            return all(count > 1 for count in per_node.values())

        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)
        # add third node
        self.redpanda.start_node(self.redpanda.nodes[2])
        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
Example #14
    def partitions(self, topic):
        """
        Return partition metadata for the topic.
        """
        kc = KafkaCat(self)
        md = kc.metadata()
        topic = next(filter(lambda t: t["topic"] == topic, md["topics"]))

        def make_partition(p):
            index = p["partition"]
            leader_id = p["leader"]
            leader = None if leader_id == -1 else self.get_node(leader_id)
            replicas = [self.get_node(r["id"]) for r in p["replicas"]]
            return Partition(index, leader, replicas)

        return [make_partition(p) for p in topic["partitions"]]
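
A usage sketch, assuming the Partition objects built above expose the index, leader, and replicas they are constructed with (leader is None while no leader is elected):

    # Hypothetical usage: list partition indexes of "my-topic" that
    # currently have no elected leader.
    leaderless = [p.index for p in self.partitions("my-topic")
                  if p.leader is None]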
Example #15
    def test_controller_recovery(self):
        kc = KafkaCat(self.redpanda)

        # choose a partition and a target node
        partition = self._get_partition(kc)
        target_node_id = next(
            filter(lambda r: r["id"] != partition["leader"],
                   partition["replicas"]))["id"]
        self.logger.debug(
            f"Transfering leader from {partition['leader']} to {target_node_id}"
        )

        # build the transfer url
        meta = kc.metadata()
        brokers = meta["brokers"]
        source_broker = next(
            filter(lambda b: b["id"] == partition["leader"], brokers))
        target_broker = next(
            filter(lambda b: b["id"] == target_node_id, brokers))
        self.logger.debug(f"Source broker {source_broker}")
        self.logger.debug(f"Target broker {target_broker}")
        host = source_broker["name"]
        host = host.split(":")[0]
        partition_id = partition["partition"]
        url = "http://{}:9644/v1/kafka/{}/{}/transfer_leadership?target={}".format(
            host, self.topic, partition["partition"], target_node_id)

        def try_transfer():
            self.logger.debug(url)
            res = requests.post(url)
            self.logger.debug(res.text)
            for _ in range(3):  # just give it a moment
                time.sleep(1)
                meta = kc.metadata()
                partition = next(
                    filter(lambda p: p["partition"] == partition_id,
                           meta["topics"][0]["partitions"]))
                if partition["leader"] == target_node_id:
                    return True
            return False

        wait_until(try_transfer,
                   timeout_sec=30,
                   backoff_sec=5,
                   err_msg="Transfer did not complete")
Example #16
    def test_controller_recovery(self):
        kc = KafkaCat(self.redpanda)

        # choose a partition and a target node
        partition = self._get_partition(kc)
        target_node_id = next(
            filter(lambda r: r["id"] != partition["leader"],
                   partition["replicas"]))["id"]
        self.logger.debug(
            f"Transfering leader from {partition['leader']} to {target_node_id}"
        )

        # build the transfer url
        meta = kc.metadata()
        brokers = meta["brokers"]
        source_broker = next(
            filter(lambda b: b["id"] == partition["leader"], brokers))
        target_broker = next(
            filter(lambda b: b["id"] == target_node_id, brokers))
        self.logger.debug(f"Source broker {source_broker}")
        self.logger.debug(f"Target broker {target_broker}")

        # Send the request to any host, they should redirect to
        # the leader of the partition.
        partition_id = partition['partition']

        admin = Admin(self.redpanda)
        admin.partition_transfer_leadership("kafka", self.topic, partition_id,
                                            target_node_id)

        def transfer_complete():
            for _ in range(3):  # just give it a moment
                time.sleep(1)
                meta = kc.metadata()
                partition = next(
                    filter(lambda p: p["partition"] == partition_id,
                           meta["topics"][0]["partitions"]))
                if partition["leader"] == target_node_id:
                    return True
            return False

        wait_until(transfer_complete,
                   timeout_sec=30,
                   backoff_sec=5,
                   err_msg="Transfer did not complete")
Example #17
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = create_topic_names(1)[0]
        data = '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA==", "partition": 0},
                {"value": "cGFuZGFwcm94eQ==", "partition": 1},
                {"value": "bXVsdGlicm9rZXI=", "partition": 2}
            ]
        }'''

        self.logger.info(f"Producing to non-existant topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        self.logger.info(f"Creating test topic: {name}")
        self._create_topics([name], partitions=3)

        self.logger.debug("Waiting for leaders to settle")
        self._wait_for_topic(name)

        self.logger.info(f"Producing to topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'

        self.logger.info(f"Consuming from topic: {name}")
        kc = KafkaCat(self.redpanda)
        assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
Example #18
class PrefixTruncateRecoveryTest(RedpandaTest):
    """
    The purpose of this test is to exercise recovery of partitions which have
    had data reclaimed based on retention policy. The testing strategy is:

       1. Stop 1 out 3 nodes
       2. Produce until retention policy reclaims data
       3. Restart the stopped node
       4. Verify that the stopped node recovers

    Leadership balancing is disabled in this test because the final verification
    step tries to force leadership so that verification may query metadata from
    specific nodes where the kafka protocol only returns state from leaders.
    """
    topics = (TopicSpec(cleanup_policy=TopicSpec.CLEANUP_DELETE), )

    def __init__(self, test_context):
        extra_rp_conf = dict(
            log_segment_size=1048576,
            retention_bytes=3145728,
            log_compaction_interval_ms=1000,
            enable_leader_balancer=False,
        )

        super(PrefixTruncateRecoveryTest,
              self).__init__(test_context=test_context,
                             num_brokers=3,
                             extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.kafka_cat = KafkaCat(self.redpanda)

    def fully_replicated(self, nodes):
        """
        Check that each of the specified nodes reports no under-replicated
        partitions for the test topic.
        """
        metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                              nodes)
        metric = metric.label_filter(dict(namespace="kafka", topic=self.topic))
        assert len(metric.samples) == len(nodes)
        return all(map(lambda s: s.value == 0, metric.samples))

    def get_segments_deleted(self, nodes):
        """
        Return the values of the log segments removed metric.
        """
        metric = self.redpanda.metrics_sample("log_segments_removed", nodes)
        metric = metric.label_filter(dict(namespace="kafka", topic=self.topic))
        assert len(metric.samples) == len(nodes)
        return [s.value for s in metric.samples]

    def produce_until_reclaim(self, initial_deleted, acks):
        """
        Produce data until we observe that segments have been deleted. The
        initial_deleted parameter is the max number of segments deleted across
        nodes, and we wait for all nodes to report at least initial + 2
        deletions so that all nodes have experienced some deletion.
        """
        deleted = self.get_segments_deleted(self.redpanda.nodes[1:])
        if all(map(lambda d: d >= initial_deleted + 2, deleted)):
            return True
        self.kafka_tools.produce(self.topic, 1024, 1024, acks=acks)
        return False

    @cluster(num_nodes=3, log_allow_list=LOG_ALLOW_LIST)
    @matrix(acks=[-1, 1], start_empty=[True, False])
    def test_prefix_truncate_recovery(self, acks, start_empty):
        # cover boundary conditions of partition being empty/non-empty
        if not start_empty:
            self.kafka_tools.produce(self.topic, 2048, 1024, acks=acks)
            wait_until(lambda: self.fully_replicated(self.redpanda.nodes),
                       timeout_sec=90,
                       backoff_sec=5)

        # stop this unfortunate node
        stopped_node = self.redpanda.nodes[0]
        self.redpanda.stop_node(stopped_node)

        # produce data into the topic until segments are reclaimed
        # by the configured retention policy
        deleted = max(self.get_segments_deleted(self.redpanda.nodes[1:]))
        wait_until(lambda: self.produce_until_reclaim(deleted, acks),
                   timeout_sec=90,
                   backoff_sec=5)

        # we should now observe an under replicated state
        wait_until(lambda: not self.fully_replicated(self.redpanda.nodes[1:]),
                   timeout_sec=90,
                   backoff_sec=5)

        # finally restart the node and wait until fully replicated
        self.redpanda.start_node(stopped_node)
        wait_until(lambda: self.fully_replicated(self.redpanda.nodes),
                   timeout_sec=90,
                   backoff_sec=5)

        self.verify_offsets()

    def verify_offsets(self):
        """
        Test that the ending offset for the partition, as seen on each
        node, is identical. Since we can only query this from the leader, we
        disable auto leadership balancing, and manually transfer leadership
        before querying.

        Note that because each node applies retention policy independently to a
        prefix of the log we can't reliably compare the starting offsets.
        """
        admin = Admin(self.redpanda)
        offsets = []
        for node in self.redpanda.nodes:
            admin.transfer_leadership_to(namespace="kafka",
                                         topic=self.topic,
                                         partition=0,
                                         target=node)
            # % ERROR: offsets_for_times failed: Local: Unknown partition
            # may occur here presumably because there is an interaction
            # with leadership transfer. the built-in retries in list_offsets
            # appear to deal with this gracefully and we still pass.
            offsets.append(self.kafka_cat.list_offsets(self.topic, 0))
        assert all(map(lambda o: o[1] == offsets[0][1], offsets))
Example #19
    def test_disabling_transactions_after_they_being_used(self):
        '''
        Validate that transactions can be safely disabled after
        the feature has been used
        '''
        # start redpanda with transactions enabled; we use
        # replication factor 1 for the group topic to make
        # it unavailable when one of the nodes is down
        self.start_redpanda(num_nodes=3,
                            extra_rp_conf={
                                "transaction_coordinator_replication": 3,
                                "id_allocator_replication": 3,
                                "enable_idempotence": True,
                                "enable_transactions": True,
                                "default_topic_replications": 1,
                                "default_topic_partitions": 1,
                                "health_manager_tick_interval": 3600000
                            })

        tx_topic = TopicSpec(name="tx-topic",
                             partition_count=1,
                             replication_factor=3)
        self.client().create_topic(tx_topic)

        # produce some messages to tx_topic

        kcat = KafkaCat(self.redpanda)
        kcat.produce_one(tx_topic.name, msg='test-msg', tx_id='test-tx-id')

        # disable transactions by restarting every node with a new config
        self.redpanda.stop()

        for n in self.redpanda.nodes:
            self.redpanda.start_node(n,
                                     override_cfg_params={
                                         "transaction_coordinator_replication":
                                         3,
                                         "id_allocator_replication": 3,
                                         "enable_idempotence": False,
                                         "enable_transactions": False,
                                         "transactional_id_expiration_ms":
                                         1000,
                                         "default_topic_replications": 3,
                                         "default_topic_partitions": 1
                                     })

        # create topic for test
        tester = TopicSpec(name="tester",
                           partition_count=1,
                           replication_factor=3)
        self.client().create_topic(tester)
        self.topic = tester
        self.start_producer(2, throughput=10000)
        self.start_consumer(1)
        self.await_startup()

        self.run_validation(min_records=100000,
                            producer_timeout_sec=300,
                            consumer_timeout_sec=300)

        # make sure that all redpanda nodes are up and running
        for n in self.redpanda.nodes:
            assert self.redpanda.redpanda_pid(n) is not None
Example #20
 def _get_leader(self):
     """
     :returns: 2 tuple of (leader, [replica ids])
     """
     return KafkaCat(self.redpanda).get_partition_leader(self.topic, 0)
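
A usage sketch of the helper above; it returns a (leader, [replica ids]) tuple, with the leader slot presumably None while no leader is elected:

    leader, replicas = self._get_leader()
    if leader is None:
        self.logger.debug("partition 0 has no leader yet")
    else:
        self.logger.info(f"partition 0 leader: {leader}, replicas: {replicas}")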
Example #21
 def partition_ready():
     return KafkaCat(self.redpanda).get_partition_leader(
         name, 0)[0] is not None
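
The predicate is typically handed to ducktape's wait_until, e.g.:

    # Sketch: block until partition 0 of `name` has an elected leader.
    wait_until(partition_ready, timeout_sec=30, backoff_sec=1,
               err_msg=f"no leader elected for {name}/0")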
Example #22
    def test_node_recovery(self, recovery_type):
        self.start_redpanda(num_nodes=3)
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_cat = KafkaCat(self.redpanda)
        # create topics
        topics = []
        for _ in range(0, 6):
            topics.append(TopicSpec(partition_count=random.randint(1, 10)))
        # choose one topic to run the main workload
        DefaultClient(self.redpanda).create_topic(topics)
        self.topic = random.choice(topics).name

        self.start_producer(1)
        self.start_consumer(2)
        self.await_startup()

        # choose another topic and populate it with data
        prepopulated_topic = random.choice(topics)

        while self.topic == prepopulated_topic.name:
            prepopulated_topic = random.choice(topics)

        # populate topic with data
        kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

        def list_offsets():
            offsets = {}
            for p in range(0, prepopulated_topic.partition_count):
                offsets[p] = kafka_cat.list_offsets(prepopulated_topic.name, p)
            return offsets

        # store offsets
        offsets = list_offsets()

        self.redpanda.logger.info(f"Topic offsets: {offsets}")
        # stop one of the nodes and remove its data
        stopped = random.choice(self.redpanda.nodes)
        # prepare seed servers list
        seeds = map(lambda n: {
            "address": n.account.hostname,
            "port": 33145
        }, self.redpanda.nodes)
        seeds = list(
            filter(lambda n: n['address'] != stopped.account.hostname, seeds))

        self.redpanda.stop_node(stopped)
        if recovery_type == FullNodeRecoveryTest.FULL_RECOVERY:
            self.redpanda.clean_node(stopped, preserve_logs=True)

        # produce some more data to make sure that stopped node is behind
        kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

        # start the node with the same node id and a non-empty seed server
        # list; give the node more time to start as it has to recover
        self.redpanda.start_node(stopped,
                                 override_cfg_params={'seed_servers': seeds},
                                 timeout=90)

        def all_topics_recovered():
            metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                                  self.redpanda.nodes)
            under_replicated = filter(lambda s: s.value == 1, metric.samples)
            under_replicated = list(
                map(
                    lambda s: (s.labels['namespace'], s.labels['topic'], s.
                               labels['partition']), under_replicated))
            self.redpanda.logger.info(
                f"under replicated partitions: {list(under_replicated)}")
            return len(under_replicated) == 0

        # wait for prepopulated topic to recover
        wait_until(all_topics_recovered, 60, 1)

        self.run_validation(min_records=20000,
                            enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)

        # validate prepopulated topic offsets
        assert offsets == list_offsets()
Example #23
    def test_recreated_topic_metadata_are_valid(self, replication_factor):
        """
        Test recreated topic metadata are valid across all the nodes
        """

        topic = 'tp-test'
        partition_count = 5
        rpk = RpkTool(self.redpanda)
        kcat = KafkaCat(self.redpanda)
        admin = Admin(self.redpanda)
        # create topic with the requested replication factor
        rpk.create_topic(topic='tp-test',
                         partitions=partition_count,
                         replicas=replication_factor)

        # produce some data to the topic

        def wait_for_leader(partition, expected_leader):
            leader, _ = kcat.get_partition_leader(topic, partition)
            return leader == expected_leader

        def transfer_all_leaders():
            partitions = rpk.describe_topic(topic)
            for p in partitions:
                replicas = set(p.replicas)
                replicas.remove(p.leader)
                target = random.choice(list(replicas))
                admin.partition_transfer_leadership("kafka", topic, p.id,
                                                    target)
                wait_until(lambda: wait_for_leader(p.id, target),
                           timeout_sec=30,
                           backoff_sec=1)
            msg_cnt = 100
            producer = RpkProducer(self.test_context,
                                   self.redpanda,
                                   topic,
                                   16384,
                                   msg_cnt,
                                   acks=-1)

            producer.start()
            producer.wait()
            producer.free()

        # transfer leadership to grow the term
        for i in range(0, 10):
            transfer_all_leaders()

        # recreate the topic
        rpk.delete_topic(topic)
        rpk.create_topic(topic='tp-test',
                         partitions=partition_count,
                         replicas=3)

        def metadata_consistent():
            # validate leadership information on each node
            for p in range(0, partition_count):
                leaders = set()
                for n in self.redpanda.nodes:
                    admin_partition = admin.get_partitions(topic=topic,
                                                           partition=p,
                                                           namespace="kafka",
                                                           node=n)
                    self.logger.info(
                        f"node: {n.account.hostname} partition: {admin_partition}"
                    )
                    leaders.add(admin_partition['leader_id'])

                self.logger.info(f"{topic}/{p} leaders: {leaders}")
                if len(leaders) != 1:
                    return False
            return True

        wait_until(metadata_consistent, 45, backoff_sec=2)
Example #24
    def test_node_operations(self):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        # start the cluster
        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")
            admin = Admin(self.redpanda)
            admin.decommission_broker(id=node_id)

            def node_removed():
                admin = Admin(self.redpanda)
                brokers = admin.get_brokers()
                for b in brokers:
                    if b['node_id'] == node_id:
                        return False
                return True

            wait_until(node_removed, timeout_sec=240, backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1])
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])
            admin = Admin(self.redpanda)
            admin.set_log_level("cluster", "trace")

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)

        admin = Admin(self.redpanda)
        admin.set_log_level("cluster", "trace")
        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")
            if op_type == ADD:
                id = op[1]
                restart_node(id)
            elif op_type == DECOMMISSION:
                id = op[1]
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])

                self.redpanda.create_topic(spec)
            elif op_type == DELETE_TOPIC:
                self.redpanda.delete_topic(op[1])

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
Example #25
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        self.active_nodes = set(
            [self.redpanda.idx(n) for n in self.redpanda.nodes])
        # collect current mapping
        self.ids_mapping = {}
        for n in self.redpanda.nodes:
            self.ids_mapping[self.redpanda.idx(n)] = self.redpanda.idx(n)
        self.next_id = sorted(list(self.ids_mapping.keys()))[-1] + 1
        self.redpanda.logger.info(f"Initial ids mapping: {self.ids_mapping}")
        NODE_OP_TIMEOUT = 360

        def get_next_id():
            id = self.next_id
            self.next_id += 1
            return id

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(
                        1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (so as not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes))
                    node = self.redpanda.get_node(idx)

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(
                    NodeOperationFuzzyTest.min_inter_failure_time,
                    NodeOperationFuzzyTest.max_inter_failure_time)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(idx):
            node_id = self.ids_mapping[idx]
            self.logger.info(f"decommissioning node: {idx} with id: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, it is a success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            admin = Admin(self.redpanda)

            def is_node_removed(idx_to_query, node_id):
                try:
                    brokers = admin.get_brokers(
                        self.redpanda.get_node(idx_to_query))
                    ids = map(lambda broker: broker['node_id'], brokers)
                    return node_id not in ids
                except:
                    return False

            def node_removed():
                node_removed_cnt = 0
                for idx in self.active_nodes:
                    if is_node_removed(idx, node_id):
                        node_removed_cnt += 1

                node_count = len(self.redpanda.nodes)
                majority = int(node_count / 2) + 1
                self.redpanda.logger.debug(
                    f"node {node_id} removed on {node_removed_cnt} nodes, majority: {majority}"
                )
                return node_removed_cnt >= majority

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            self.redpanda.stop_node(self.redpanda.get_node(idx))

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def seed_servers_for(idx):
            seeds = map(
                lambda n: {
                    "address": n.account.hostname,
                    "port": 33145
                }, self.redpanda.nodes)

            return list(
                filter(
                    lambda n: n['address'] != self.redpanda.get_node(idx).
                    account.hostname, seeds))

        def add_node(idx, cleanup=True):
            id = get_next_id()
            self.logger.info(f"adding node: {idx} back with new id: {id}")
            self.ids_mapping[idx] = id
            self.redpanda.stop_node(self.redpanda.get_node(idx))
            if cleanup:
                self.redpanda.clean_node(self.redpanda.get_node(idx),
                                         preserve_logs=True)
            # we do not reuse previous node ids and override seed server list
            self.redpanda.start_node(
                self.redpanda.get_node(idx),
                timeout=NodeOperationFuzzyTest.min_inter_failure_time +
                NodeOperationFuzzyTest.max_suspend_duration_seconds + 30,
                override_cfg_params={
                    "node_id": id,
                    "seed_servers": seed_servers_for(idx)
                })

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                DefaultClient(self.redpanda).create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                DefaultClient(self.redpanda).delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10,
                                             skip_nodes=set(),
                                             available_nodes=self.active_nodes)
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(
                f"executing - {op} - current ids: {self.ids_mapping}")
            if op_type == ADD:
                idx = op[1]
                self.active_nodes.add(idx)
                add_node(idx)
            elif op_type == DECOMMISSION:
                idx = op[1]
                self.active_nodes.remove(idx)
                decommission(idx)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Example #26
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        self.active_nodes = set([1, 2, 3, 4, 5])

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        NODE_OP_TIMEOUT = 360

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(1, 10)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (so as not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes)) - 1
                    node = self.redpanda.nodes[idx]

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, it is a success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

            def node_removed():
                admin = Admin(self.redpanda)
                try:
                    brokers = admin.get_brokers(node=self.redpanda.nodes[0])
                    for b in brokers:
                        if b['node_id'] == node_id:
                            return False
                    return True
                except:
                    return False

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1],
                                         preserve_logs=True)
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                self.redpanda.create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                self.redpanda.delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")

            if op_type == ADD:
                id = op[1]
                self.active_nodes.add(id)
                restart_node(id)
            elif op_type == DECOMMISSION:
                id = op[1]
                self.active_nodes.remove(id)
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Example #27
 def _get_leaders_by_node(self):
     kc = KafkaCat(self.redpanda)
     md = kc.metadata()
     topic = next(filter(lambda t: t["topic"] == self.topic, md["topics"]))
     leaders = (p["leader"] for p in topic["partitions"])
     return collections.Counter(leaders)
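
A short sketch of consuming the Counter, e.g. to log the leadership distribution and flag the degenerate case where a single node leads everything (the check is illustrative):

    leaders_by_node = self._get_leaders_by_node()
    self.logger.info(f"leaders per node: {dict(leaders_by_node)}")
    assert len(leaders_by_node) > 1, "all partition leaders on one node"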