Example #1
    def test_adding_nodes_to_cluster(self):
        self.redpanda = RedpandaService(
            self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1})
        # start single node cluster
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])
        # create some topics
        topics = []
        # include __consumer_offsets topic replica
        total_replicas = 1
        for partition_count in range(1, 5):
            name = f"topic{len(topics)}"
            spec = TopicSpec(name=name,
                             partition_count=partition_count,
                             replication_factor=1)
            total_replicas += partition_count
            topics.append(spec)

        for spec in topics:
            DefaultClient(self.redpanda).create_topic(spec)
            self.topic = spec.name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        # add second node
        self.redpanda.start_node(self.redpanda.nodes[1])
        kafkacat = KafkaCat(self.redpanda)

        def _replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def partitions_rebalanced():
            per_node = _replicas_per_node()
            self.redpanda.logger.info(f"replicas per node: {per_node}")
            if len(per_node) < len(self.redpanda.started_nodes()):
                return False

            replicas = sum(per_node.values())
            if replicas != total_replicas:
                return False

            return all(count > 1 for count in per_node.values())

        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)
        # add third node
        self.redpanda.start_node(self.redpanda.nodes[2])
        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
Example #2
 def _get_partition_leaders(self):
     kcat = KafkaCat(self.redpanda)
     m = kcat.metadata()
     self.logger.info(f"kcat.metadata() == {m}")
     brokers = {}
     for b in m['brokers']:
         id = b['id']
         ip = b['name']
         ip = ip[:ip.index(':')]
         for n in self.redpanda.nodes:
             n_ip = n.account.hostname
             self.logger.debug(f"matching {n_ip} over {ip}")
             if n_ip == ip:
                 brokers[id] = n
                 break
     self.logger.debug(f"found brokers {brokers}")
     assert len(brokers) == 3
     leaders = {}
     for topic in m['topics']:
         if topic['topic'] == ArchivalTest.s3_topic_name:
             for part in topic['partitions']:
                 leader_id = part['leader']
                 partition_id = part['partition']
                 leader = brokers[leader_id]
                 leaders[partition_id] = leader
     return leaders
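
A minimal follow-on sketch, assuming the same test class as above: it walks the partition-to-node map returned by _get_partition_leaders and restarts each leader using the stop_node/start_node calls seen in the other examples. The method name _restart_partition_leaders is hypothetical.

 def _restart_partition_leaders(self):
     # hypothetical helper: restart the leader node of every partition
     leaders = self._get_partition_leaders()
     for partition_id, node in leaders.items():
         self.logger.info(f"restarting leader of partition {partition_id}")
         self.redpanda.stop_node(node)
         self.redpanda.start_node(node)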
Example #3
 def controller(self):
     kc = KafkaCat(self)
     cid = kc.metadata()["controllerid"]
     self.logger.debug("Controller reported with id: {}".format(cid))
     if cid != -1:
         node = self.get_node(cid)
         self.logger.debug("Controller node found: {}".format(node))
         return node
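
The helper above returns None while no controller is elected (controllerid == -1). A hedged sketch of how a caller might poll it with ducktape's wait_until, which the other examples already use; wait_for_controller is a hypothetical name:

 from ducktape.utils.util import wait_until

 def wait_for_controller(redpanda, timeout_sec=30):
     # hypothetical helper: block until controller() reports an elected node
     wait_until(lambda: redpanda.controller() is not None,
                timeout_sec=timeout_sec,
                backoff_sec=1,
                err_msg="no controller elected")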
Example #4
 def registered(self, node):
     idx = self.idx(node)
     self.logger.debug("Checking if broker %d/%s is registered", idx, node)
     kc = KafkaCat(self)
     brokers = kc.metadata()["brokers"]
     brokers = {b["id"]: b for b in brokers}
     broker = brokers.get(idx, None)
     self.logger.debug("Found broker info: %s", broker)
     return broker is not None
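
A hedged usage sketch, assuming registered() is defined on the service as above: wait until every broker appears in metadata before continuing. The method name wait_until_registered is hypothetical.

 def wait_until_registered(self, timeout_sec=60):
     # hypothetical helper: poll registered() for each node
     from ducktape.utils.util import wait_until
     for node in self.nodes:
         # bind `node` as a default argument so each lambda checks its own node
         wait_until(lambda n=node: self.registered(n),
                    timeout_sec=timeout_sec,
                    backoff_sec=2,
                    err_msg=f"broker {self.idx(node)} never registered")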
Example #5
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = create_topic_names(1)[0]
        data = '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA==", "partition": 0},
                {"value": "cGFuZGFwcm94eQ==", "partition": 1},
                {"value": "bXVsdGlicm9rZXI=", "partition": 2}
            ]
        }'''

        self.logger.info(f"Producing to non-existant topic: {name}")
        produce_result_raw = self._produce_topic(name, data)
        assert produce_result_raw.status_code == requests.codes.ok
        produce_result = produce_result_raw.json()
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        self.logger.info(f"Creating test topic: {name}")
        self._create_topics([name], partitions=3)

        self.logger.info(f"Producing to topic: {name}")
        produce_result_raw = self._produce_topic(name, data)
        assert produce_result_raw.status_code == requests.codes.ok
        assert produce_result_raw.headers[
            "Content-Type"] == "application/vnd.kafka.v2+json"

        produce_result = produce_result_raw.json()
        for o in produce_result["offsets"]:
            assert o["offset"] == 0, f'error_code {o["error_code"]}'

        self.logger.info(f"Consuming from topic: {name}")
        kc = KafkaCat(self.redpanda)
        assert kc.consume_one(name, 0, 0)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 0)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 0)["payload"] == "multibroker"

        self.logger.info(f"Producing to topic without partition: {name}")
        produce_result_raw = self._produce_topic(
            name, '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA=="},
                {"value": "cGFuZGFwcm94eQ=="},
                {"value": "bXVsdGlicm9rZXI="}
            ]
        }''')

        assert produce_result_raw.status_code == requests.codes.ok
        produce_result = produce_result_raw.json()
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'
Example #6
 def _registered(self, service, node):
     idx = service.idx(node)
     service.logger.debug("Checking if broker %d/%s is registered", idx,
                          node)
     kc = KafkaCat(RedpandaMuServiceServiceProxy(service, self))
     brokers = kc.metadata()["brokers"]
     brokers = {b["id"]: b for b in brokers}
     broker = brokers.get(idx, None)
     service.logger.debug("Found broker info: %s", broker)
     return broker is not None
Example #7
 def registered(self, node):
     idx = self.idx(node)
     self.logger.debug(
         f"Checking if broker {idx} ({node.name} is registered")
     kc = KafkaCat(self)
     brokers = kc.metadata()["brokers"]
     brokers = {b["id"]: b for b in brokers}
     broker = brokers.get(idx, None)
     self.logger.debug(f"Found broker info: {broker}")
     return broker is not None
Example #8
            def done():
                kcat = KafkaCat(self.redpanda)
                ts = 1638748800  # 2021-12-06, an old timestamp; query the first offset
                offset = kcat.query_offset(self.topic, 0, ts)
                # assert that offset is valid
                assert offset >= 0

                segments = segments_count(self.redpanda, self.topic, 0)
                return all(count <= 5 for count in segments)
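
This predicate is also meant for polling; a hedged sketch of the surrounding call, matching the wait_until pattern used throughout these examples (timeout values are illustrative):

            wait_until(done,
                       timeout_sec=120,
                       backoff_sec=5,
                       err_msg="segment count did not settle")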
Example #9
 def _wait_for_topic(self, name):
     kc = KafkaCat(self.redpanda)
     has_leaders = False
     while not has_leaders:
         topics = kc.metadata()["topics"]
         maybe_leaders = True
         for t in topics:
             if t["topic"] == name:
                 for p in t["partitions"]:
                     if p["leader"] == -1:
                         maybe_leaders = False
         has_leaders = maybe_leaders
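
The loop above spins with no timeout or backoff. A hedged alternative under the same assumptions, bounded by ducktape's wait_until; _wait_for_topic_leaders is a hypothetical name:

 def _wait_for_topic_leaders(self, name, timeout_sec=30):
     # hypothetical variant: same leader check, but with a timeout and backoff
     from ducktape.utils.util import wait_until
     kc = KafkaCat(self.redpanda)

     def topic_has_leaders():
         for t in kc.metadata()["topics"]:
             if t["topic"] == name:
                 return all(p["leader"] != -1 for p in t["partitions"])
         return False

     wait_until(topic_has_leaders,
                timeout_sec=timeout_sec,
                backoff_sec=1,
                err_msg=f"no leaders elected for topic {name}")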
Example #10
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = "pandaproxy-topic-{}".format(uuid.uuid4())
        self.logger.debug("Topic name %s", name)

        prev = set(self._get_topics())
        self.logger.debug("Existing topics %s", prev)
        assert name not in prev

        data = '{"records": [{"value": "dmVjdG9yaXplZA==", "partition": 0},{"value": "cGFuZGFwcm94eQ==", "partition": 1},{"value": "bXVsdGlicm9rZXI=", "partition": 2}]}'

        self.logger.debug("Producing to non-existant topic")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        kc = KafkaCat(self.redpanda)

        self.logger.debug("Creating test topic")
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_tools.create_topic(
            TopicSpec(name=name, replication_factor=1, partition_count=3))

        self.logger.debug("Waiting for leaders to settle")
        has_leaders = False
        while not has_leaders:
            topics = kc.metadata()["topics"]
            maybe_leaders = True
            for t in topics:
                if t["topic"] == name:
                    for p in t["partitions"]:
                        if p["leader"] == -1:
                            maybe_leaders = False
            has_leaders = maybe_leaders
        # TODO:
        #  Despite the above test, Pandaproxy can still get back no leaders
        #  Query Pandaproxy metadata to see when leaders have settled
        #  The retry logic for produce should have sufficient time for this
        #  additional settle time.

        self.logger.debug("Producing to topic")
        produce_result = self._produce_topic(name, data)
        self.logger.debug("Producing to topic: %s", produce_result)
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'

        self.logger.debug(f"Consuming topic: {name}")
        assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
Example #11
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = create_topic_names(1)[0]
        data = '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA==", "partition": 0},
                {"value": "cGFuZGFwcm94eQ==", "partition": 1},
                {"value": "bXVsdGlicm9rZXI=", "partition": 2}
            ]
        }'''

        self.logger.info(f"Producing to non-existant topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        kc = KafkaCat(self.redpanda)

        self.logger.info(f"Creating test topic: {name}")
        self._create_topics([name], partitions=3)

        self.logger.debug("Waiting for leaders to settle")
        has_leaders = False
        while not has_leaders:
            topics = kc.metadata()["topics"]
            maybe_leaders = True
            for t in topics:
                if t["topic"] == name:
                    for p in t["partitions"]:
                        if p["leader"] == -1:
                            maybe_leaders = False
            has_leaders = maybe_leaders
        # TODO:
        #  Despite the above test, Pandaproxy can still get back no leaders
        #  Query Pandaproxy metadata to see when leaders have settled
        #  The retry logic for produce should have sufficient time for this
        #  additional settle time.

        self.logger.info(f"Producing to topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'

        self.logger.info(f"Consuming from topic: {name}")
        assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
Example #12
    def _ping_pong(self):
        kc = KafkaCat(self.redpanda)
        rpk = RpkTool(self.redpanda)

        payload = str(random.randint(0, 1000))
        start = time.time()
        offset = rpk.produce(self.topic, "tkey", payload, timeout=5)
        consumed = kc.consume_one(self.topic, 0, offset)
        latency = time.time() - start
        self.logger.info(
            f"_ping_pong produced '{payload}' consumed '{consumed}' in {(latency)*1000.0:.2f} ms"
        )
        if consumed['payload'] != payload:
            raise RuntimeError(f"expected '{payload}' got '{consumed}'")
Example #13
    def __init__(self, test_context):
        extra_rp_conf = dict(
            log_segment_size=1048576,
            retention_bytes=3145728,
            log_compaction_interval_ms=1000,
            enable_leader_balancer=False,
        )

        super(PrefixTruncateRecoveryTest,
              self).__init__(test_context=test_context,
                             num_brokers=3,
                             extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.kafka_cat = KafkaCat(self.redpanda)
Example #14
    def partitions(self, topic):
        """
        Return partition metadata for the topic.
        """
        kc = KafkaCat(self)
        md = kc.metadata()
        topic_md = next(filter(lambda t: t["topic"] == topic, md["topics"]))

        def make_partition(p):
            index = p["partition"]
            leader_id = p["leader"]
            leader = None if leader_id == -1 else self.get_node(leader_id)
            replicas = [self.get_node(r["id"]) for r in p["replicas"]]
            return Partition(index, leader, replicas)

        return [make_partition(p) for p in topic_md["partitions"]]
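
A hedged usage sketch of partitions(), assuming the Partition objects it builds expose the leader field passed to their constructor; assert_all_leaders_elected is a hypothetical name:

 def assert_all_leaders_elected(redpanda, topic):
     # hypothetical check: every partition of `topic` has an elected leader
     leaderless = [p for p in redpanda.partitions(topic) if p.leader is None]
     assert not leaderless, f"leaderless partitions: {leaderless}"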
Example #15
    def test_controller_recovery(self):
        kc = KafkaCat(self.redpanda)

        # choose a partition and a target node
        partition = self._get_partition(kc)
        target_node_id = next(
            filter(lambda r: r["id"] != partition["leader"],
                   partition["replicas"]))["id"]
        self.logger.debug(
            f"Transfering leader from {partition['leader']} to {target_node_id}"
        )

        # look up the source and target brokers
        meta = kc.metadata()
        brokers = meta["brokers"]
        source_broker = next(
            filter(lambda b: b["id"] == partition["leader"], brokers))
        target_broker = next(
            filter(lambda b: b["id"] == target_node_id, brokers))
        self.logger.debug(f"Source broker {source_broker}")
        self.logger.debug(f"Target broker {target_broker}")

        # Send the request to any host, they should redirect to
        # the leader of the partition.
        partition_id = partition['partition']

        admin = Admin(self.redpanda)
        admin.partition_transfer_leadership("kafka", self.topic, partition_id,
                                            target_node_id)

        def transfer_complete():
            for _ in range(3):  # just give it a moment
                time.sleep(1)
                meta = kc.metadata()
                partition = next(
                    filter(lambda p: p["partition"] == partition_id,
                           meta["topics"][0]["partitions"]))
                if partition["leader"] == target_node_id:
                    return True
            return False

        wait_until(lambda: transfer_complete(),
                   timeout_sec=30,
                   backoff_sec=5,
                   err_msg="Transfer did not complete")
Example #16
    def test_controller_recovery(self):
        kc = KafkaCat(self.redpanda)

        # choose a partition and a target node
        partition = self._get_partition(kc)
        target_node_id = next(
            filter(lambda r: r["id"] != partition["leader"],
                   partition["replicas"]))["id"]
        self.logger.debug(
            f"Transfering leader from {partition['leader']} to {target_node_id}"
        )

        # build the transfer url
        meta = kc.metadata()
        brokers = meta["brokers"]
        source_broker = next(
            filter(lambda b: b["id"] == partition["leader"], brokers))
        target_broker = next(
            filter(lambda b: b["id"] == target_node_id, brokers))
        self.logger.debug(f"Source broker {source_broker}")
        self.logger.debug(f"Target broker {target_broker}")
        host = source_broker["name"]
        host = host.split(":")[0]
        partition_id = partition["partition"]
        url = "http://{}:9644/v1/kafka/{}/{}/transfer_leadership?target={}".format(
            host, self.topic, partition["partition"], target_node_id)

        def try_transfer():
            self.logger.debug(url)
            res = requests.post(url)
            self.logger.debug(res.text)
            for _ in range(3):  # just give it a moment
                time.sleep(1)
                meta = kc.metadata()
                partition = next(
                    filter(lambda p: p["partition"] == partition_id,
                           meta["topics"][0]["partitions"]))
                if partition["leader"] == target_node_id:
                    return True
            return False

        wait_until(lambda: try_transfer(),
                   timeout_sec=30,
                   backoff_sec=5,
                   err_msg="Transfer did not complete")
Example #17
    def test_produce_topic(self):
        """
        Create a topic and verify that pandaproxy can produce to it.
        """
        name = create_topic_names(1)[0]
        data = '''
        {
            "records": [
                {"value": "dmVjdG9yaXplZA==", "partition": 0},
                {"value": "cGFuZGFwcm94eQ==", "partition": 1},
                {"value": "bXVsdGlicm9rZXI=", "partition": 2}
            ]
        }'''

        self.logger.info(f"Producing to non-existant topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["error_code"] == 3
            assert o["offset"] == -1

        self.logger.info(f"Creating test topic: {name}")
        self._create_topics([name], partitions=3)

        self.logger.debug("Waiting for leaders to settle")
        self._wait_for_topic(name)

        self.logger.info(f"Producing to topic: {name}")
        produce_result = self._produce_topic(name, data)
        for o in produce_result["offsets"]:
            assert o["offset"] == 1, f'error_code {o["error_code"]}'

        self.logger.info(f"Consuming from topic: {name}")
        kc = KafkaCat(self.redpanda)
        assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
        assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
        assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
Example #18
    def test_node_operations(self):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        # start the cluster
        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")
            admin = Admin(self.redpanda)
            admin.decommission_broker(id=node_id)

            def node_removed():
                admin = Admin(self.redpanda)
                brokers = admin.get_brokers()
                for b in brokers:
                    if b['node_id'] == node_id:
                        return False
                return True

            wait_until(node_removed, timeout_sec=240, backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1])
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])
            admin = Admin(self.redpanda)
            admin.set_log_level("cluster", "trace")

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)

        admin = Admin(self.redpanda)
        admin.set_log_level("cluster", "trace")
        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")
            if op_type == ADD:
                id = op[1]
                restart_node(id)
            elif op_type == DECOMMISSION:
                id = op[1]
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])

                self.redpanda.create_topic(spec)
            elif op_type == DELETE_TOPIC:
                self.redpanda.delete_topic(op[1])

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
Example #19
 def partition_ready():
     return KafkaCat(self.redpanda).get_partition_leader(
         name, 0)[0] is not None
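
This predicate is meant to be polled; a hedged sketch of the surrounding call, using wait_until as the other examples do (`name` is the topic being waited on in the original test, and the timeout is illustrative):

 wait_until(partition_ready,
            timeout_sec=30,
            backoff_sec=2,
            err_msg=f"no leader elected for {name}/0")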
Example #20
    def test_node_recovery(self, recovery_type):
        self.start_redpanda(num_nodes=3)
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_cat = KafkaCat(self.redpanda)
        # create topics
        topics = []
        for _ in range(0, 6):
            topics.append(TopicSpec(partition_count=random.randint(1, 10)))
        # choose one topic to run the main workload
        DefaultClient(self.redpanda).create_topic(topics)
        self.topic = random.choice(topics).name

        self.start_producer(1)
        self.start_consumer(2)
        self.await_startup()

        # choose another topic and populate it with data
        prepopulated_topic = random.choice(topics)

        while self.topic == prepopulated_topic.name:
            prepopulated_topic = random.choice(topics)

        # populate topic with data
        kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

        def list_offsets():
            offsets = {}
            for p in range(0, prepopulated_topic.partition_count):
                offsets[p] = kafka_cat.list_offsets(prepopulated_topic.name, p)
            return offsets

        # store offsets
        offsets = list_offsets()

        self.redpanda.logger.info(f"Topic offsets: {offsets}")
        # stop one of the nodes and remove its data
        stopped = random.choice(self.redpanda.nodes)
        # prepare seed servers list
        seeds = map(lambda n: {
            "address": n.account.hostname,
            "port": 33145
        }, self.redpanda.nodes)
        seeds = list(
            filter(lambda n: n['address'] != stopped.account.hostname, seeds))

        self.redpanda.stop_node(stopped)
        if recovery_type == FullNodeRecoveryTest.FULL_RECOVERY:
            self.redpanda.clean_node(stopped, preserve_logs=True)

        # produce some more data to make sure that stopped node is behind
        kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

        # start the node with the same node id and a non-empty seed server
        # list; give it more time to start as it has to recover
        self.redpanda.start_node(stopped,
                                 override_cfg_params={'seed_servers': seeds},
                                 timeout=90)

        def all_topics_recovered():
            metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                                  self.redpanda.nodes)
            under_replicated = filter(lambda s: s.value == 1, metric.samples)
            under_replicated = list(
                map(
                    lambda s: (s.labels['namespace'], s.labels['topic'], s.
                               labels['partition']), under_replicated))
            self.redpanda.logger.info(
                f"under replicated partitions: {list(under_replicated)}")
            return len(under_replicated) == 0

        # wait for prepopulated topic to recover
        wait_until(all_topics_recovered, 60, 1)

        self.run_validation(min_records=20000,
                            enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)

        # validate prepopulated topic offsets
        assert offsets == list_offsets()
Example #21
    def test_recreated_topic_metadata_are_valid(self, replication_factor):
        """
        Test recreated topic metadata are valid across all the nodes
        """

        topic = 'tp-test'
        partition_count = 5
        rpk = RpkTool(self.redpanda)
        kcat = KafkaCat(self.redpanda)
        admin = Admin(self.redpanda)
        # create topic with the requested replication factor
        rpk.create_topic(topic='tp-test',
                         partitions=partition_count,
                         replicas=replication_factor)

        # data is produced inside transfer_all_leaders below

        def wait_for_leader(partition, expected_leader):
            leader, _ = kcat.get_partition_leader(topic, partition)
            return leader == expected_leader

        def transfer_all_leaders():
            partitions = rpk.describe_topic(topic)
            for p in partitions:
                replicas = set(p.replicas)
                replicas.remove(p.leader)
                target = random.choice(list(replicas))
                admin.partition_transfer_leadership("kafka", topic, p.id,
                                                    target)
                wait_until(lambda: wait_for_leader(p.id, target),
                           timeout_sec=30,
                           backoff_sec=1)
            msg_cnt = 100
            producer = RpkProducer(self.test_context,
                                   self.redpanda,
                                   topic,
                                   16384,
                                   msg_cnt,
                                   acks=-1)

            producer.start()
            producer.wait()
            producer.free()

        # transfer leadership to grow the term
        for i in range(0, 10):
            transfer_all_leaders()

        # recreate the topic
        rpk.delete_topic(topic)
        rpk.create_topic(topic='tp-test',
                         partitions=partition_count,
                         replicas=3)

        def metadata_consistent():
            # validate leadership information on each node
            for p in range(0, partition_count):
                leaders = set()
                for n in self.redpanda.nodes:
                    admin_partition = admin.get_partitions(topic=topic,
                                                           partition=p,
                                                           namespace="kafka",
                                                           node=n)
                    self.logger.info(
                        f"node: {n.account.hostname} partition: {admin_partition}"
                    )
                    leaders.add(admin_partition['leader_id'])

                self.logger.info(f"{topic}/{p} leaders: {leaders}")
                if len(leaders) != 1:
                    return False
            return True

        wait_until(metadata_consistent, 45, backoff_sec=2)
Example #22
    def test_disabling_transactions_after_they_being_used(self):
        '''
        Validate that transactions can be safely disabled after
        the feature has been used
        '''
        # start redpanda with transactions enabled; we use
        # replication factor 1 for the group topic to make
        # it unavailable when one of the nodes is down
        self.start_redpanda(num_nodes=3,
                            extra_rp_conf={
                                "transaction_coordinator_replication": 3,
                                "id_allocator_replication": 3,
                                "enable_idempotence": True,
                                "enable_transactions": True,
                                "default_topic_replications": 1,
                                "default_topic_partitions": 1,
                                "health_manager_tick_interval": 3600000
                            })

        tx_topic = TopicSpec(name="tx-topic",
                             partition_count=1,
                             replication_factor=3)
        self.client().create_topic(tx_topic)

        # produce some messages to tx_topic

        kcat = KafkaCat(self.redpanda)
        kcat.produce_one(tx_topic.name, msg='test-msg', tx_id='test-tx-id')

        # disable transactions and restart every node
        self.redpanda.stop()

        for n in self.redpanda.nodes:
            self.redpanda.start_node(n,
                                     override_cfg_params={
                                         "transaction_coordinator_replication":
                                         3,
                                         "id_allocator_replication": 3,
                                         "enable_idempotence": False,
                                         "enable_transactions": False,
                                         "transactional_id_expiration_ms":
                                         1000,
                                         "default_topic_replications": 3,
                                         "default_topic_partitions": 1
                                     })

        # create topic for test
        tester = TopicSpec(name="tester",
                           partition_count=1,
                           replication_factor=3)
        self.client().create_topic(tester)
        self.topic = tester.name
        self.start_producer(2, throughput=10000)
        self.start_consumer(1)
        self.await_startup()

        self.run_validation(min_records=100000,
                            producer_timeout_sec=300,
                            consumer_timeout_sec=300)

        # make sure that all redpanda nodes are up and running
        for n in self.redpanda.nodes:
            assert self.redpanda.redpanda_pid(n) is not None
Example #23
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        self.active_nodes = set(
            [self.redpanda.idx(n) for n in self.redpanda.nodes])
        # collect current mapping
        self.ids_mapping = {}
        for n in self.redpanda.nodes:
            self.ids_mapping[self.redpanda.idx(n)] = self.redpanda.idx(n)
        self.next_id = sorted(list(self.ids_mapping.keys()))[-1] + 1
        self.redpanda.logger.info(f"Initial ids mapping: {self.ids_mapping}")
        NODE_OP_TIMEOUT = 360

        def get_next_id():
            id = self.next_id
            self.next_id += 1
            return id

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(
                        1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes))
                    node = self.redpanda.get_node(idx)

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(
                    NodeOperationFuzzyTest.min_inter_failure_time,
                    NodeOperationFuzzyTest.max_inter_failure_time)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(idx):
            node_id = self.ids_mapping[idx]
            self.logger.info(f"decommissioning node: {idx} with id: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, it is a success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            admin = Admin(self.redpanda)

            def is_node_removed(idx_to_query, node_id):
                try:
                    brokers = admin.get_brokers(
                        self.redpanda.get_node(idx_to_query))
                    ids = map(lambda broker: broker['node_id'], brokers)
                    return node_id not in ids
                except:
                    return False

            def node_removed():
                node_removed_cnt = 0
                for idx in self.active_nodes:
                    if is_node_removed(idx, node_id):
                        node_removed_cnt += 1

                node_count = len(self.redpanda.nodes)
                majority = node_count // 2 + 1
                self.redpanda.logger.debug(
                    f"node {node_id} removed on {node_removed_cnt} nodes, majority: {majority}"
                )
                return node_removed_cnt >= majority

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            self.redpanda.stop_node(self.redpanda.get_node(idx))

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def seed_servers_for(idx):
            seeds = map(
                lambda n: {
                    "address": n.account.hostname,
                    "port": 33145
                }, self.redpanda.nodes)

            return list(
                filter(
                    lambda n: n['address'] != self.redpanda.get_node(idx).
                    account.hostname, seeds))

        def add_node(idx, cleanup=True):
            id = get_next_id()
            self.logger.info(f"adding node: {idx} back with new id: {id}")
            self.ids_mapping[idx] = id
            self.redpanda.stop_node(self.redpanda.get_node(idx))
            if cleanup:
                self.redpanda.clean_node(self.redpanda.get_node(idx),
                                         preserve_logs=True)
            # we do not reuse previous node ids and override seed server list
            self.redpanda.start_node(
                self.redpanda.get_node(idx),
                timeout=NodeOperationFuzzyTest.min_inter_failure_time +
                NodeOperationFuzzyTest.max_suspend_duration_seconds + 30,
                override_cfg_params={
                    "node_id": id,
                    "seed_servers": seed_servers_for(idx)
                })

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                DefaultClient(self.redpanda).create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                DefaultClient(self.redpanda).delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10,
                                             skip_nodes=set(),
                                             available_nodes=self.active_nodes)
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(
                f"executing - {op} - current ids: {self.ids_mapping}")
            if op_type == ADD:
                idx = op[1]
                self.active_nodes.add(idx)
                add_node(idx)
            elif op_type == DECOMMISSION:
                idx = op[1]
                self.active_nodes.remove(idx)
                decommission(idx)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Example #24
 def _get_leader(self):
     """
     :returns: 2 tuple of (leader, [replica ids])
     """
     return KafkaCat(self.redpanda).get_partition_leader(self.topic, 0)
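
A hedged follow-on sketch, assuming _get_leader returns the (leader, replica ids) tuple documented above: wait for leadership of partition 0 to move away from a known node. _wait_for_new_leader is a hypothetical name:

 def _wait_for_new_leader(self, old_leader, timeout_sec=30):
     # hypothetical helper: poll until partition 0 reports a different leader
     from ducktape.utils.util import wait_until

     def leader_changed():
         leader, _ = self._get_leader()
         return leader is not None and leader != old_leader

     wait_until(leader_changed,
                timeout_sec=timeout_sec,
                backoff_sec=1,
                err_msg="leadership did not move")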
Example #25
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        self.active_nodes = set([1, 2, 3, 4, 5])

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        NODE_OP_TIMEOUT = 360

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(1, 10)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes)) - 1
                    node = self.redpanda.nodes[idx]

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, it is a success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

            def node_removed():
                admin = Admin(self.redpanda)
                try:
                    brokers = admin.get_brokers(node=self.redpanda.nodes[0])
                    for b in brokers:
                        if b['node_id'] == node_id:
                            return False
                    return True
                except:
                    return False

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1],
                                         preserve_logs=True)
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                self.redpanda.create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                self.redpanda.delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")

            if op_type == ADD:
                id = op[1]
                self.active_nodes.add(id)
                restart_node(id)
            if op_type == DECOMMISSION:
                id = op[1]
                self.active_nodes.remove(id)
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Example #26
 def _get_leaders_by_node(self):
     kc = KafkaCat(self.redpanda)
     md = kc.metadata()
     topic = next(filter(lambda t: t["topic"] == self.topic, md["topics"]))
     leaders = (p["leader"] for p in topic["partitions"])
     return collections.Counter(leaders)
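
A hedged usage sketch for the Counter above: check that no single node holds more than a given share of the topic's partition leaders. _leaders_balanced and its threshold parameter are hypothetical:

 def _leaders_balanced(self, max_leaders_per_node):
     # hypothetical check: no node leads more than max_leaders_per_node partitions
     counts = self._get_leaders_by_node()
     self.logger.info(f"leaders per node: {dict(counts)}")
     return not counts or max(counts.values()) <= max_leaders_per_node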