Example #1
class RedpandaTest(Test):
    """
    Base class for tests that use the Redpanda service.
    """

    # List of topics to be created automatically when the cluster starts. Each
    # topic is defined by an instance of a TopicSpec.
    topics = ()

    def __init__(self,
                 test_context,
                 num_brokers=3,
                 extra_rp_conf=dict(),
                 topics=None,
                 log_level='info'):
        super(RedpandaTest, self).__init__(test_context)

        self.redpanda = RedpandaService(test_context,
                                        num_brokers=num_brokers,
                                        extra_rp_conf=extra_rp_conf,
                                        topics=self.topics,
                                        log_level=log_level)

    @property
    def topic(self):
        """
        Return the name of the auto-created initial topic. Accessing this
        property requires exactly one initial topic be configured.
        """
        assert len(self.topics) == 1
        return self.topics[0].name

    def setUp(self):
        self.redpanda.start()
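
A minimal usage sketch for this base class, assuming rptest-style import paths and the ducktape @cluster decorator (the subclass name below is made up): the subclass declares its initial topics as a class attribute and reads the single auto-created topic back through the `topic` property.

# Hypothetical subclass of the RedpandaTest base class shown above.
# The import paths are assumptions about the surrounding test tree.
from rptest.clients.types import TopicSpec
from rptest.services.cluster import cluster
from rptest.tests.redpanda_test import RedpandaTest


class SimpleProduceConsumeTest(RedpandaTest):
    topics = (TopicSpec(partition_count=1, replication_factor=3), )

    def __init__(self, test_context):
        super(SimpleProduceConsumeTest, self).__init__(test_context=test_context,
                                                       num_brokers=3)

    @cluster(num_nodes=3)
    def test_topic_was_created(self):
        # setUp() already started the cluster; the single initial topic is
        # reachable through the `topic` property defined by the base class.
        assert self.topic == self.topics[0].name
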
Example #2
class RedpandaTest(Test):
    """
    Base class for tests that use the Redpanda service.
    """

    # List of topics to be created automatically when the cluster starts. Each
    # topic is defined by an instance of a TopicSpec.
    topics = []

    def __init__(self,
                 test_context,
                 num_brokers=3,
                 extra_rp_conf=dict(),
                 enable_pp=False,
                 enable_sr=False,
                 num_cores=3):
        super(RedpandaTest, self).__init__(test_context)
        self.scale = Scale(test_context)
        self.redpanda = RedpandaService(test_context,
                                        num_brokers,
                                        extra_rp_conf=extra_rp_conf,
                                        enable_pp=enable_pp,
                                        enable_sr=enable_sr,
                                        num_cores=num_cores)
        self._client = DefaultClient(self.redpanda)

    @property
    def topic(self):
        """
        Return the name of the auto-created initial topic. Accessing this
        property requires exactly one initial topic be configured.
        """
        assert len(self.topics) == 1
        return self.topics[0].name

    def setUp(self):
        self.redpanda.start()
        self._create_initial_topics()

    def client(self):
        return self._client

    def _create_initial_topics(self):
        config = self.redpanda.security_config()
        user = config.get("sasl_plain_username")
        passwd = config.get("sasl_plain_password")
        client = KafkaCliTools(self.redpanda, user=user, passwd=passwd)
        for spec in self.topics:
            self.logger.debug(f"Creating initial topic {spec}")
            client.create_topic(spec)
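
A hedged sketch of how a test built on this variant might combine the auto-created initial topics with the `client()` accessor once `setUp()` has run (class and topic names are placeholders; imports as in the previous sketch):

class ExtraTopicTest(RedpandaTest):
    topics = [TopicSpec(name="base-topic", partition_count=3, replication_factor=3)]

    @cluster(num_nodes=3)
    def test_create_additional_topic(self):
        # The topic listed in `topics` was created by _create_initial_topics()
        # during setUp(); create one more through the shared DefaultClient.
        extra = TopicSpec(name="extra-topic", partition_count=1, replication_factor=3)
        self.client().create_topic(extra)
        assert self.topic == "base-topic"
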
Example #3
class RedpandaTest(Test):
    """
    Base class for tests that use the Redpanda service.
    """
    def __init__(self,
                 test_context,
                 num_brokers=3,
                 extra_rp_conf=dict(),
                 topics=None,
                 log_level='info'):
        super(RedpandaTest, self).__init__(test_context)

        self.redpanda = RedpandaService(test_context,
                                        num_brokers=num_brokers,
                                        extra_rp_conf=extra_rp_conf,
                                        topics=topics,
                                        log_level=log_level)

    def setUp(self):
        self.redpanda.start()
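
In this variant the initial topics are passed through the constructor and handed straight to RedpandaService rather than being read from a class attribute; a hedged sketch of a subclass (names are placeholders, imports as above):

class ConstructorTopicsTest(RedpandaTest):
    def __init__(self, test_context):
        super(ConstructorTopicsTest, self).__init__(
            test_context,
            num_brokers=3,
            topics=[TopicSpec(name="ctor-topic", partition_count=1)])

    @cluster(num_nodes=3)
    def test_cluster_started(self):
        # setUp() starts the service, which creates "ctor-topic" on boot.
        assert self.redpanda is not None
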
Example #4
class EndToEndShadowIndexingBase(EndToEndTest):
    segment_size = 1048576  # 1 MiB
    s3_topic_name = "panda-topic"

    num_brokers = 3

    topics = (TopicSpec(
        name=s3_topic_name,
        partition_count=1,
        replication_factor=3,
    ), )

    def __init__(self, test_context, extra_rp_conf=None):
        super(EndToEndShadowIndexingBase,
              self).__init__(test_context=test_context)

        self.test_context = test_context
        self.topic = EndToEndShadowIndexingBase.s3_topic_name

        self.si_settings = SISettings(
            cloud_storage_reconciliation_interval_ms=500,
            cloud_storage_max_connections=5,
            log_segment_size=EndToEndShadowIndexingBase.segment_size,  # 1 MiB
        )
        self.s3_bucket_name = self.si_settings.cloud_storage_bucket
        self.si_settings.load_context(self.logger, test_context)
        self.scale = Scale(test_context)

        self.redpanda = RedpandaService(context=self.test_context,
                                        num_brokers=self.num_brokers,
                                        si_settings=self.si_settings,
                                        extra_rp_conf=extra_rp_conf)
        self.kafka_tools = KafkaCliTools(self.redpanda)

    def setUp(self):
        self.redpanda.start()
        for topic in EndToEndShadowIndexingBase.topics:
            self.kafka_tools.create_topic(topic)

    def tearDown(self):
        self.s3_client.empty_bucket(self.s3_bucket_name)
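
A hedged sketch of a concrete test class built on this base, modelled on the write test shown later in Example #14 (the produce_until_segments and wait_for_segments_removal helpers are assumed to be importable from the shared test utilities):

class EndToEndShadowIndexingTest(EndToEndShadowIndexingBase):
    @cluster(num_nodes=5)
    def test_write(self):
        # Produce enough data for several segments, tighten local retention so
        # older segments are removed, then check that everything acked by the
        # producer is eventually consumed.
        self.start_producer()
        produce_until_segments(redpanda=self.redpanda,
                               topic=self.topic,
                               partition_idx=0,
                               count=10)
        self.kafka_tools.alter_topic_config(
            self.topic,
            {TopicSpec.PROPERTY_RETENTION_BYTES: 5 * self.segment_size})
        wait_for_segments_removal(redpanda=self.redpanda,
                                  topic=self.topic,
                                  partition_idx=0,
                                  count=6)
        self.start_consumer()
        self.run_validation()
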
Example #5
class NodeOperationFuzzyTest(EndToEndTest):
    def generate_random_workload(self, count, skip_nodes):
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = [1, 2, 3, 4, 5]
        decommissioned_nodes = []
        operations = []
        topics = []

        def eligible_active_nodes():
            return list(
                filter(lambda n: not (n == 1 or n in skip_nodes),
                       active_nodes))

        def decommission(id):
            active_nodes.remove(id)
            decommissioned_nodes.append(id)

        def add(id):
            active_nodes.append(id)
            decommissioned_nodes.remove(id)

        for _ in range(0, count):
            if len(decommissioned_nodes) == 2:
                id = random.choice(decommissioned_nodes)
                operations.append((ADD, id))
                add(id)
            elif len(decommissioned_nodes) == 0:
                id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, id))
                decommission(id)
            else:
                op = random.choice(op_types)
                if op == DECOMMISSION:
                    id = random.choice(eligible_active_nodes())
                    operations.append((DECOMMISSION, id))
                    decommission(id)
                elif op == ADD:
                    id = random.choice(decommissioned_nodes)
                    operations.append((ADD, id))
                    add(id)
            # topic operation
            if len(topics) == 0:
                op = ADD_TOPIC
            else:
                op = random.choice(tp_op_types)

            if op == ADD_TOPIC:
                operations.append((
                    ADD_TOPIC,
                    f"test-topic-{random.randint(0,2000)}-{round(time.time()*1000000)}",
                    random.choice(ALLOWED_REPLICATION), 3))
            else:
                operations.append((DELETE_TOPIC, random.choice(topics)))

        return operations

    def _create_random_topics(self, count):
        max_partitions = 10

        topics = []
        for i in range(0, count):
            name = f"topic-{i}"
            spec = TopicSpec(
                name=name,
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))

            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)

        return topics

    """
    Adding nodes to the cluster should result in partition reallocations to new 
    nodes
    """

    @cluster(num_nodes=7)
    @parametrize(enable_failures=True)
    @parametrize(enable_failures=False)
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        self.active_nodes = set([1, 2, 3, 4, 5])

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        NODE_OP_TIMEOUT = 360

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(1, 10)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (so as not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes)) - 1
                    node = self.redpanda.nodes[idx]

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, treat it as a success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

            def node_removed():
                admin = Admin(self.redpanda)
                try:
                    brokers = admin.get_brokers(node=self.redpanda.nodes[0])
                    for b in brokers:
                        if b['node_id'] == node_id:
                            return False
                    return True
                except:
                    return False

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1],
                                         preserve_logs=True)
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                self.redpanda.create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                self.redpanda.delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")

            if op_type == ADD:
                id = op[1]
                self.active_nodes.add(id)
                restart_node(id)
            if op_type == DECOMMISSION:
                id = op[1]
                self.active_nodes.remove(id)
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec) == True,
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]) == True,
                           timeout_sec=180,
                           backoff_sec=2)

        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
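
For reference, generate_random_workload() returns a flat list of tuples that the loop above dispatches on by their first element; an illustrative sketch of the shapes it produces (the string constants stand in for whatever ADD, DECOMMISSION, ADD_TOPIC and DELETE_TOPIC are defined as in the test module, and the values are made up):

# (ADD, node_id) / (DECOMMISSION, node_id) restart or drain a node,
# (ADD_TOPIC, name, replication_factor, partition_count) creates a topic,
# (DELETE_TOPIC, name) removes a topic created earlier in the workload.
example_workload = [
    ("decommission", 4),
    ("add_topic", "test-topic-17-1651234567000000", 3, 3),
    ("add", 4),
    ("delete_topic", "test-topic-17-1651234567000000"),
]
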
Example #6
class NodeOperationFuzzyTest(EndToEndTest):
    def generate_random_workload(self, count, skip_nodes):
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = [1, 2, 3, 4, 5]
        decommissioned_nodes = []
        operations = []
        topics = []

        def eligible_active_nodes():
            return list(
                filter(lambda n: not (n == 1 or n in skip_nodes),
                       active_nodes))

        def decommission(id):
            active_nodes.remove(id)
            decommissioned_nodes.append(id)

        def add(id):
            active_nodes.append(id)
            decommissioned_nodes.remove(id)

        for _ in range(0, count):
            if len(decommissioned_nodes) == 2:
                id = random.choice(decommissioned_nodes)
                operations.append((ADD, id))
                add(id)
            elif len(decommissioned_nodes) == 0:
                id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, id))
                decommission(id)
            else:
                op = random.choice(op_types)
                if op == DECOMMISSION:
                    id = random.choice(eligible_active_nodes())
                    operations.append((DECOMMISSION, id))
                    decommission(id)
                elif op == ADD:
                    id = random.choice(decommissioned_nodes)
                    operations.append((ADD, id))
                    add(id)
            # topic operation
            if len(topics) == 0:
                op = ADD_TOPIC
            else:
                op = random.choice(tp_op_types)

            if op == ADD_TOPIC:
                operations.append((
                    ADD_TOPIC,
                    f"test-topic-{random.randint(0,2000)}-{time.time()*1000.0}",
                    random.choice(ALLOWED_REPLICATION), 3))
            else:
                operations.append((DELETE_TOPIC, random.choice(topics)))

        return operations

    def _create_random_topics(self, count):
        max_partitions = 10

        topics = []
        for i in range(0, count):
            name = f"topic-{i}"
            spec = TopicSpec(
                name=name,
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))

            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)

        return topics

    """
    Adding nodes to the cluster should result in partition reallocations to new 
    nodes
    """

    @cluster(num_nodes=7)
    def test_node_operations(self):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        # start the cluster
        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")
            admin = Admin(self.redpanda)
            admin.decommission_broker(id=node_id)

            def node_removed():
                admin = Admin(self.redpanda)
                brokers = admin.get_brokers()
                for b in brokers:
                    if b['node_id'] == node_id:
                        return False
                return True

            wait_until(node_removed, timeout_sec=240, backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1])
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])
            admin = Admin(self.redpanda)
            admin.set_log_level("cluster", "trace")

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)

        admin = Admin(self.redpanda)
        admin.set_log_level("cluster", "trace")
        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")
            if op_type == ADD:
                id = op[1]
                restart_node(id)
            if op_type == DECOMMISSION:
                id = op[1]
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])

                self.redpanda.create_topic(spec)
            elif op_type == DELETE_TOPIC:
                self.redpanda.delete_topic(op[1])

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
Example #7
class FetchAfterDeleteTest(Test):
    def __init__(self, test_context):
        super(FetchAfterDeleteTest, self).__init__(test_context)
        self.scale = Scale(test_context)

    @cluster(num_nodes=3)
    @parametrize(transactions_enabled=True)
    @parametrize(transactions_enabled=False)
    def test_fetch_after_committed_offset_was_removed(self,
                                                      transactions_enabled):
        """
        Test fetching when consumer offset was deleted by retention
        """
        segment_size = 1048576
        self.redpanda = RedpandaService(self.test_context,
                                        3,
                                        KafkaCliTools,
                                        extra_rp_conf={
                                            "enable_transactions":
                                            transactions_enabled,
                                            "enable_idempotence":
                                            transactions_enabled,
                                            "log_compaction_interval_ms": 5000,
                                            "log_segment_size": segment_size,
                                            "enable_leader_balancer": False,
                                        })
        self.redpanda.start()
        topic = TopicSpec(partition_count=1,
                          replication_factor=3,
                          cleanup_policy=TopicSpec.CLEANUP_DELETE)
        self.redpanda.create_topic(topic)
        self.topic = topic.name

        kafka_tools = KafkaCliTools(self.redpanda)

        # produce until the partition has at least 10 segments
        produce_until_segments(
            self.redpanda,
            topic=self.topic,
            partition_idx=0,
            count=10,
        )
        consumer_group = 'test'
        rpk = RpkTool(self.redpanda)

        def consume(n=1):

            out = rpk.consume(self.topic, group=consumer_group, n=n)
            split = out.split('}')
            split = filter(lambda s: "{" in s, split)

            return map(lambda s: json.loads(s + "}"), split)

        # consume from the beginning
        msgs = consume(10)
        last = list(msgs).pop()
        offset = last['offset']

        # change retention bytes
        kafka_tools.alter_topic_config(
            self.topic, {
                TopicSpec.PROPERTY_RETENTION_BYTES: 2 * segment_size,
            })

        wait_for_segments_removal(self.redpanda,
                                  self.topic,
                                  partition_idx=0,
                                  count=5)

        partitions = list(rpk.describe_topic(self.topic))
        p = partitions[0]
        assert p.start_offset > offset
        # consume from an offset that no longer exists;
        # the previously committed offset has already been removed
        out = list(consume(1))
        assert out[0]['offset'] == p.start_offset
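
The consume() helper above splits rpk's concatenated JSON output on '}' and re-appends the brace before parsing each fragment; a small self-contained sketch of that approach on a synthetic sample (the sample string is made up, and the trick assumes no '}' appears inside record values):

import json

sample = '{"topic": "t", "offset": 1}{"topic": "t", "offset": 2}'
fragments = filter(lambda s: "{" in s, sample.split('}'))
records = [json.loads(s + "}") for s in fragments]
assert [r["offset"] for r in records] == [1, 2]
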
Example #8
class ScalingUpTest(EndToEndTest):
    """
    Adding nodes to the cluster should result in partition reallocations to new
    nodes
    """
    @cluster(num_nodes=5)
    def test_adding_nodes_to_cluster(self):
        self.redpanda = RedpandaService(
            self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1})
        # start single node cluster
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])
        # create some topics
        topics = []
        # include __consumer_offsets topic replica
        total_replicas = 1
        for partition_count in range(1, 5):
            name = f"topic{len(topics)}"
            spec = TopicSpec(name=name,
                             partition_count=partition_count,
                             replication_factor=1)
            total_replicas += partition_count
            topics.append(spec)

        for spec in topics:
            DefaultClient(self.redpanda).create_topic(spec)
            self.topic = spec.name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        # add second node
        self.redpanda.start_node(self.redpanda.nodes[1])
        kafkacat = KafkaCat(self.redpanda)

        def _replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def partitions_rebalanced():
            per_node = _replicas_per_node()
            self.redpanda.logger.info(f"replicas per node: {per_node}")
            if len(per_node) < len(self.redpanda.started_nodes()):
                return False

            replicas = sum(per_node.values())
            if replicas != total_replicas:
                return False

            return all(p[1] > 1 for p in per_node.items())

        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)
        # add third node
        self.redpanda.start_node(self.redpanda.nodes[2])
        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
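
The rebalance check above relies on counting replicas per node from kafkacat metadata; a small self-contained sketch of that counting over a synthetic metadata dict (the shape mirrors what _replicas_per_node() reads, but the values are made up):

md = {
    "topics": [
        {"partitions": [
            {"replicas": [{"id": 1}, {"id": 2}]},
            {"replicas": [{"id": 2}, {"id": 3}]},
        ]},
    ],
}

node_replicas = {}
for topic in md["topics"]:
    for p in topic["partitions"]:
        for r in p["replicas"]:
            node_replicas[r["id"]] = node_replicas.get(r["id"], 0) + 1

assert node_replicas == {1: 1, 2: 2, 3: 1}
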
Example #9
class EndToEndTest(Test):
    """
    Test for common pattern:
      - Produce and consume in the background
      - Perform some action (e.g. partition movement)
      - Run validation
    """
    def __init__(self, test_context, extra_rp_conf=None):
        super(EndToEndTest, self).__init__(test_context=test_context)
        if extra_rp_conf is None:
            self._extra_rp_conf = {}
        else:
            self._extra_rp_conf = extra_rp_conf
        self.records_consumed = []
        self.last_consumed_offsets = {}
        self.redpanda = None
        self.topic = None
        self._client = None

    def start_redpanda(self, num_nodes=1, extra_rp_conf=None):
        if extra_rp_conf is not None:
            # merge both configurations; the extra_rp_conf passed in as a
            # parameter takes precedence
            self._extra_rp_conf = {**self._extra_rp_conf, **extra_rp_conf}
        assert self.redpanda is None
        self.redpanda = RedpandaService(self.test_context,
                                        num_nodes,
                                        extra_rp_conf=self._extra_rp_conf)
        self.redpanda.start()
        self._client = DefaultClient(self.redpanda)

    def client(self):
        assert self._client is not None
        return self._client

    def start_consumer(self, num_nodes=1, group_id="test_group"):
        assert self.redpanda
        assert self.topic
        self.consumer = VerifiableConsumer(
            self.test_context,
            num_nodes=num_nodes,
            redpanda=self.redpanda,
            topic=self.topic,
            group_id=group_id,
            on_record_consumed=self.on_record_consumed)
        self.consumer.start()

    def start_producer(self, num_nodes=1, throughput=1000):
        assert self.redpanda
        assert self.topic
        self.producer = VerifiableProducer(
            self.test_context,
            num_nodes=num_nodes,
            redpanda=self.redpanda,
            topic=self.topic,
            throughput=throughput,
            message_validator=is_int_with_prefix)
        self.producer.start()

    def on_record_consumed(self, record, node):
        partition = TopicPartition(record["topic"], record["partition"])
        record_id = record["value"]
        offset = record["offset"]
        self.last_consumed_offsets[partition] = offset
        self.records_consumed.append(record_id)

    def await_consumed_offsets(self, last_acked_offsets, timeout_sec):
        def has_finished_consuming():
            for partition, offset in last_acked_offsets.items():
                if partition not in self.last_consumed_offsets:
                    return False
                last_commit = self.consumer.last_commit(partition)
                if not last_commit or last_commit <= offset:
                    self.logger.debug(
                        f"waiting for partition {partition} offset {offset} to be committed, last committed offset: {last_commit}"
                    )
                    return False
            return True

        wait_until(has_finished_consuming,
                   timeout_sec=timeout_sec,
                   err_msg="Consumer failed to consume up to offsets %s after waiting %ds." %\
                   (str(last_acked_offsets), timeout_sec))

    def _collect_all_logs(self):
        for s in self.test_context.services:
            self.mark_for_collect(s)

    def await_startup(self, min_records=5, timeout_sec=30):
        try:
            wait_until(lambda: self.consumer.total_consumed() >= min_records,
                       timeout_sec=timeout_sec,
                       err_msg="Timed out after %ds while awaiting initial record delivery of %d records" %\
                       (timeout_sec, min_records))
        except BaseException:
            self._collect_all_logs()
            raise

    def run_validation(self,
                       min_records=5000,
                       producer_timeout_sec=30,
                       consumer_timeout_sec=30,
                       enable_idempotence=False):
        try:
            wait_until(lambda: self.producer.num_acked > min_records,
                       timeout_sec=producer_timeout_sec,
                       err_msg="Producer failed to produce messages for %ds." %\
                       producer_timeout_sec)

            self.logger.info("Stopping producer after writing up to offsets %s" %\
                         str(self.producer.last_acked_offsets))
            self.producer.stop()

            self.await_consumed_offsets(self.producer.last_acked_offsets,
                                        consumer_timeout_sec)
            self.consumer.stop()

            self.validate(enable_idempotence)
        except BaseException:
            self._collect_all_logs()
            raise

    def validate(self, enable_idempotence):
        self.logger.info("Number of acked records: %d" %
                         len(self.producer.acked))
        self.logger.info("Number of consumed records: %d" %
                         len(self.records_consumed))

        success = True
        msg = ""

        # Correctness of the set difference operation depends on using equivalent
        # message_validators in producer and consumer
        missing = set(self.producer.acked) - set(self.records_consumed)

        if len(missing) > 0:
            success = False
            msg = annotate_missing_msgs(missing, self.producer.acked,
                                        self.records_consumed, msg)

        # Are there duplicates?
        if len(set(self.records_consumed)) != len(self.records_consumed):
            num_duplicates = abs(
                len(set(self.records_consumed)) - len(self.records_consumed))

            if enable_idempotence:
                success = False
                msg += "Detected %d duplicates even though idempotence was enabled.\n" % num_duplicates
            else:
                msg += "(There are also %d duplicate messages in the log - but that is an acceptable outcome)\n" % num_duplicates

        # Collect all logs if validation fails
        if not success:
            self._collect_all_logs()

        assert success, msg
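
A hedged sketch of the typical shape of a test built on EndToEndTest, condensed from the ScalingUpTest in Example #8 (class and topic names here are placeholders; the action under test is elided):

class ExampleEndToEndTest(EndToEndTest):
    @cluster(num_nodes=5)
    def test_produce_consume_validate(self):
        # Start the cluster, create a topic, run background produce/consume,
        # perform the action under test, then validate delivery end to end.
        self.start_redpanda(num_nodes=3)
        spec = TopicSpec(partition_count=3, replication_factor=3)
        self.client().create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=1000)
        self.start_consumer(1)
        self.await_startup()

        # ... perform the action under test here (e.g. restart a node) ...

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
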
Example #10
class RedpandaTest(Test):
    """
    Base class for tests that use the Redpanda service.
    """

    # List of topics to be created automatically when the cluster starts. Each
    # topic is defined by an instance of a TopicSpec.
    topics: Sequence[TopicSpec] = []

    def __init__(self,
                 test_context,
                 num_brokers=None,
                 extra_rp_conf=dict(),
                 enable_pp=False,
                 enable_sr=False,
                 si_settings=None,
                 **kwargs):
        """
        Any trailing keyword arguments are passed through to the
        RedpandaService constructor.
        """
        super(RedpandaTest, self).__init__(test_context)
        self.scale = Scale(test_context)
        self.si_settings = si_settings

        if num_brokers is None:
            # Default to a 3 node cluster if sufficient nodes are available, else
            # a single node cluster.  This is just a default: tests are welcome
            # to override the constructor to pass an explicit size.  This logic makes
            # it convenient to mix 3 node and 1 node cases in the same class, by
            # just modifying the @cluster node count per test.
            if test_context.cluster.available().size() >= 3:
                num_brokers = 3
            else:
                num_brokers = 1

        if self.si_settings:
            self.si_settings.load_context(self.logger, test_context)

        self.redpanda = RedpandaService(test_context,
                                        num_brokers,
                                        extra_rp_conf=extra_rp_conf,
                                        enable_pp=enable_pp,
                                        enable_sr=enable_sr,
                                        si_settings=self.si_settings,
                                        **kwargs)
        self._client = DefaultClient(self.redpanda)

    @property
    def topic(self):
        """
        Return the name of the auto-created initial topic. Accessing this
        property requires exactly one initial topic be configured.
        """
        assert len(self.topics) == 1
        return self.topics[0].name

    @property
    def debug_mode(self):
        """
        Useful for tests that want to change behaviour when running on
        the much slower debug builds of redpanda, which generally cannot
        keep up with significant quantities of data or partition counts.
        """
        return os.environ.get('BUILD_TYPE', None) == 'debug'

    @property
    def ci_mode(self):
        """
        Useful for tests that want to dynamically degrade/disable on low-resource
        developer environments (e.g. laptops) but apply stricter checks in CI.
        """
        return os.environ.get('CI', None) != 'false'

    @property
    def s3_client(self):
        return self.redpanda.s3_client

    def setUp(self):
        self.redpanda.start()
        self._create_initial_topics()

    def client(self):
        return self._client

    def _create_initial_topics(self):
        config = self.redpanda.security_config()
        user = config.get("sasl_plain_username")
        passwd = config.get("sasl_plain_password")
        client = KafkaCliTools(self.redpanda, user=user, passwd=passwd)
        for spec in self.topics:
            self.logger.debug(f"Creating initial topic {spec}")
            client.create_topic(spec)
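
A hedged sketch of a subclass that opts in to tiered storage through the si_settings hook added in this version of the base class (the SISettings arguments are copied from Example #4; the class name is a placeholder):

class TieredStorageSmokeTest(RedpandaTest):
    topics = (TopicSpec(partition_count=1, replication_factor=3), )

    def __init__(self, test_context):
        super(TieredStorageSmokeTest, self).__init__(
            test_context,
            si_settings=SISettings(
                cloud_storage_reconciliation_interval_ms=500,
                cloud_storage_max_connections=5,
                log_segment_size=1048576))

    @cluster(num_nodes=3)
    def test_starts_with_tiered_storage(self):
        # __init__ loaded the SI context; setUp() started redpanda and created
        # the topics listed above. The bucket name comes from the SI settings.
        assert self.si_settings.cloud_storage_bucket is not None
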
Example #11
class TestMirrorMakerService(EndToEndTest):
    kafka_source = "kafka"
    redpanda_source = "redpanda"

    def __init__(self, test_context):
        super(TestMirrorMakerService, self).__init__(test_context)

        self.topic = TopicSpec(replication_factor=3)
        # create single zookeeper node for Kafka
        self.zk = ZookeeperService(self.test_context,
                                   num_nodes=1,
                                   version=V_3_0_0)
        self.source_broker = None

    def setUp(self):
        self.zk.start()

    def tearDown(self):
        # ducktape handles service teardown automatically, but it is hard
        # to tell what went wrong if one of the services hangs.  Do it
        # explicitly here with some logging, to enable debugging issues
        # like https://github.com/redpanda-data/redpanda/issues/4270

        if self.source_broker is not None:
            self.logger.info(
                f"Stopping source broker ({self.source_broker.__class__.__name__})..."
            )
            self.source_broker.stop()
            self.logger.info(
                f"Awaiting source broker ({self.source_broker.__class__.__name__})..."
            )

        self.logger.info("Stopping zookeeper...")
        self.zk.stop()
        self.logger.info("Awaiting zookeeper...")

    def start_brokers(self, source_type=kafka_source):
        if source_type == TestMirrorMakerService.redpanda_source:
            self.source_broker = RedpandaService(self.test_context,
                                                 num_brokers=3)
        else:
            self.source_broker = KafkaServiceAdapter(
                self.test_context,
                KafkaService(self.test_context,
                             num_nodes=3,
                             zk=self.zk,
                             version=V_3_0_0))

        self.redpanda = RedpandaService(self.test_context, num_brokers=3)
        self.source_broker.start()
        self.redpanda.start()

        self.source_client = DefaultClient(self.source_broker)

        self.topic.partition_count = 1000 if self.redpanda.dedicated_nodes else 1
        self.source_client.create_topic(self.topic)

    def start_workload(self):

        self.consumer = VerifiableConsumer(
            self.test_context,
            num_nodes=1,
            redpanda=self.redpanda,
            topic=self.topic.name,
            group_id='consumer_test_group',
            on_record_consumed=self.on_record_consumed)
        self.consumer.start()

        self.producer = VerifiableProducer(
            self.test_context,
            num_nodes=1,
            redpanda=self.source_broker,
            topic=self.topic.name,
            throughput=1000,
            message_validator=is_int_with_prefix)
        self.producer.start()

    def wait_for_n_messages(self, n_messages=100):
        """Wait for a minimum number of messages to be successfully produced."""
        wait_until(
            lambda: self.producer.num_acked > n_messages,
            timeout_sec=10,
            err_msg=
            "Producer failed to produce %d messages in a reasonable amount of time."
            % n_messages)

    @cluster(num_nodes=10)
    @parametrize(source_type=kafka_source)
    @parametrize(source_type=redpanda_source)
    def test_simple_end_to_end(self, source_type):
        # start brokers
        self.start_brokers(source_type=source_type)
        # start mirror maker
        self.mirror_maker = MirrorMaker2(self.test_context,
                                         num_nodes=1,
                                         source_cluster=self.source_broker,
                                         target_cluster=self.redpanda)
        topics = []
        for i in range(0, 10):
            topics.append(
                TopicSpec(partition_count=random.randint(1, 10),
                          retention_bytes=random.randint(100000000, 300000000),
                          retention_ms=random.randint(1 * 3600000,
                                                      10 * 3600000)))
        self.source_client.create_topic(topics)
        self.mirror_maker.start()
        # start source producer & target consumer
        self.start_workload()

        self.run_validation(consumer_timeout_sec=120)
        self.mirror_maker.stop()
        target_client = DefaultClient(self.redpanda)
        for t in topics:
            desc = target_client.describe_topic(t.name)
            self.logger.debug(f'source topic: {t}, target topic: {desc}')
            assert len(desc.partitions) == t.partition_count

    @cluster(num_nodes=9)
    @parametrize(source_type=kafka_source)
    @parametrize(source_type=redpanda_source)
    def test_consumer_group_mirroring(self, source_type):
        # start redpanda
        self.start_brokers(source_type=source_type)
        consumer_group = "test-group-1"
        # start mirror maker
        self.mirror_maker = MirrorMaker2(self.test_context,
                                         num_nodes=1,
                                         source_cluster=self.source_broker,
                                         target_cluster=self.redpanda,
                                         consumer_group_pattern=consumer_group,
                                         log_level="TRACE")
        self.mirror_maker.start()

        msg_size = 512
        msg_cnt = 1000000 if self.redpanda.dedicated_nodes else 100

        # produce some messages to source redpanda
        producer = RpkProducer(self.test_context,
                               self.source_broker,
                               self.topic.name,
                               msg_size,
                               msg_cnt,
                               acks=-1)

        producer.start()
        producer.wait()
        producer.free()

        # consume some messages from source redpanda
        consumer = RpkConsumer(self.test_context,
                               self.source_broker,
                               self.topic.name,
                               ignore_errors=False,
                               retries=3,
                               group=consumer_group,
                               save_msgs=False,
                               num_msgs=int(msg_cnt / 5))

        consumer.start()
        consumer.wait()
        consumer.stop()
        source_messages = consumer.messages
        self.logger.info(f"source message count: {len(source_messages)}")
        consumer.free()

        src_rpk = RpkTool(self.source_broker)
        source_group = src_rpk.group_describe(consumer_group)
        target_rpk = RpkTool(self.redpanda)

        def target_group_equal():
            try:
                target_group = target_rpk.group_describe(consumer_group)
            except RpkException as e:
                # e.g. COORDINATOR_NOT_AVAILABLE
                self.logger.info(f"Error describing target cluster group: {e}")
                return False

            self.logger.info(
                f"source {source_group}, target_group: {target_group}")
            return target_group.partitions == source_group.partitions and target_group.name == source_group.name

        # wait for consumer group sync
        timeout = 600 if self.redpanda.dedicated_nodes else 60
        wait_until(target_group_equal, timeout_sec=timeout, backoff_sec=5)

        self.mirror_maker.stop()
Example #12
class LibrdkafkaTest(Test):
    """
    Execute the librdkafka test suite against redpanda.
    """
    TESTS_DIR = "/opt/librdkafka/tests"
    CONF_FILE = os.path.join(TESTS_DIR, "test.conf")

    def __init__(self, context):
        super(LibrdkafkaTest, self).__init__(context)
        self._context = context
        self._extra_rp_conf = dict(
            auto_create_topics_enabled=True,
            default_topic_partitions=4,
        )
        self._redpanda = None

    def _start_redpanda(self, num_brokers):
        self._redpanda = RedpandaService(self._context,
                                         num_brokers,
                                         extra_rp_conf=self._extra_rp_conf)
        self._redpanda.start()

        with open(LibrdkafkaTest.CONF_FILE, "w") as f:
            brokers = self._redpanda.brokers()
            f.write("metadata.broker.list={}".format(brokers))

    def teardown(self):
        self._redpanda.stop()
        os.remove(LibrdkafkaTest.CONF_FILE)

    # yapf: disable
    @cluster(num_nodes=3)
    @ignore(test_num=19, num_brokers=1) # segfault in group membership https://app.clubhouse.io/vectorized/story/963/dereferencing-null-pointer-in-kafka-group-membership
    @ignore(test_num=52, num_brokers=1) # https://app.clubhouse.io/vectorized/story/997/librdkafka-tests-failing-due-to-consumer-out-of-range-timestamps
    @ignore(test_num=54, num_brokers=1) # timequery issues: https://app.clubhouse.io/vectorized/story/995/librdkafka-offset-time-query
    @ignore(test_num=63, num_brokers=1) # cluster-id: https://app.clubhouse.io/vectorized/story/939/generate-cluster-id-uuid-on-bootstrap-and-expose-through-metadata-request
    @ignore(test_num=67, num_brokers=1) # empty topic offset edge case: https://app.clubhouse.io/vectorized/story/940/consuming-from-empty-topic-should-return-eof
    @ignore(test_num=77, num_brokers=1) # topic compaction settings: https://app.clubhouse.io/vectorized/story/999/support-create-topic-configurations-for-compaction-retention-policies
    @ignore(test_num=92, num_brokers=1) # no support for v2 -> v1 message version conversion in the broker
    @ignore(test_num=44, num_brokers=1) # we do not support runtime changes to topic partition count
    @ignore(test_num=69, num_brokers=1) # we do not support runtime changes to topic partition count
    @ignore(test_num=81, num_brokers=1) # we do not support replica assignment
    @ignore(test_num=61, num_brokers=1) # transactions
    @ignore(test_num=76, num_brokers=1) # idempotent producer
    @ignore(test_num=86, num_brokers=1) # idempotent producer
    @ignore(test_num=90, num_brokers=1) # idempotent producer
    @ignore(test_num=94, num_brokers=1) # idempotent producer
    @ignore(test_num=98, num_brokers=1) # transactions
    # @ignore appears to not be quite smart enough to handle the partial
    # parameterization so we repeat them here with num_brokers=3. this would be
    # a nice enhancement that we could upstream.
    @ignore(test_num=19, num_brokers=3)
    @ignore(test_num=52, num_brokers=3)
    @ignore(test_num=54, num_brokers=3)
    @ignore(test_num=63, num_brokers=3)
    @ignore(test_num=67, num_brokers=3)
    @ignore(test_num=77, num_brokers=3)
    @ignore(test_num=92, num_brokers=3)
    @ignore(test_num=44, num_brokers=3)
    @ignore(test_num=69, num_brokers=3)
    @ignore(test_num=81, num_brokers=3)
    @ignore(test_num=61, num_brokers=3)
    @ignore(test_num=76, num_brokers=3)
    @ignore(test_num=86, num_brokers=3)
    @ignore(test_num=90, num_brokers=3)
    @ignore(test_num=94, num_brokers=3)
    @ignore(test_num=98, num_brokers=3)
    @matrix(test_num=range(101), num_brokers=[1, 3])
    # yapf: enable
    def test_librdkafka(self, test_num, num_brokers):
        self._start_redpanda(num_brokers)
        p = subprocess.Popen(["make"],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=True,
                             cwd=LibrdkafkaTest.TESTS_DIR,
                             env=dict(TESTS="%04d" % test_num, **os.environ))
        for line in iter(p.stdout.readline, b''):
            self.logger.debug(line.rstrip())
        p.wait()
        if p.returncode != 0:
            raise RuntimeError("librdkafka test failed {}".format(
                p.returncode))
Example #13
class ClusterViewTest(EndToEndTest):
    @cluster(num_nodes=3)
    def test_view_changes_on_add(self):
        self.redpanda = RedpandaService(self.test_context, 3)
        # start single node cluster
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])

        admin = Admin(self.redpanda)

        seed = None

        def rp1_started():
            nonlocal seed
            try:
                #{"version": 0, "brokers": [{"node_id": 1, "num_cores": 3, "membership_status": "active", "is_alive": true}]}
                seed = admin.get_cluster_view(self.redpanda.nodes[0])
                self.redpanda.logger.info(
                    f"view from {self.redpanda.nodes[0]}: {json.dumps(seed)}")
                return len(seed["brokers"]) == 1
            except requests.exceptions.RequestException as e:
                self.redpanda.logger.debug(f"admin API isn't available ({e})")
                return False

        wait_until(
            rp1_started,
            timeout_sec=30,
            backoff_sec=1,
            err_msg="Cant get cluster view from {self.redpanda.nodes[0]}")

        self.redpanda.start_node(self.redpanda.nodes[1])
        self.redpanda.start_node(self.redpanda.nodes[2])

        def rest_started():
            try:
                last = None
                ids = None
                for i in range(0, 3):
                    view = admin.get_cluster_view(self.redpanda.nodes[i])
                    self.redpanda.logger.info(
                        f"view from {self.redpanda.nodes[i]}: {json.dumps(view)}"
                    )
                    if view["version"] <= seed["version"]:
                        return False
                    if len(view["brokers"]) != 3:
                        return False
                    if last is None:
                        last = view
                        ids = set(
                            map(lambda broker: broker["node_id"],
                                view["brokers"]))
                    if last["version"] != view["version"]:
                        return False
                    if not ids.issubset(
                            map(lambda broker: broker["node_id"],
                                view["brokers"])):
                        return False
                return True
            except requests.exceptions.RequestException as e:
                self.redpanda.logger.debug(f"admin API isn't available ({e})")
                return False

        wait_until(rest_started,
                   timeout_sec=30,
                   backoff_sec=1,
                   err_msg="Cant get cluster view from {self.redpanda.nodes}")
Example #14
class EndToEndShadowIndexingTest(EndToEndTest):
    segment_size = 1048576  # 1 MiB
    s3_host_name = "minio-s3"
    s3_access_key = "panda-user"
    s3_secret_key = "panda-secret"
    s3_region = "panda-region"
    s3_topic_name = "panda-topic"
    topics = (TopicSpec(
        name=s3_topic_name,
        partition_count=1,
        replication_factor=3,
    ), )

    def __init__(self, test_context):
        super(EndToEndShadowIndexingTest,
              self).__init__(test_context=test_context)

        self.s3_bucket_name = f"panda-bucket-{uuid.uuid1()}"
        self.topic = EndToEndShadowIndexingTest.s3_topic_name
        self._extra_rp_conf = dict(
            cloud_storage_enabled=True,
            cloud_storage_enable_remote_read=True,
            cloud_storage_enable_remote_write=True,
            cloud_storage_access_key=EndToEndShadowIndexingTest.s3_access_key,
            cloud_storage_secret_key=EndToEndShadowIndexingTest.s3_secret_key,
            cloud_storage_region=EndToEndShadowIndexingTest.s3_region,
            cloud_storage_bucket=self.s3_bucket_name,
            cloud_storage_disable_tls=True,
            cloud_storage_api_endpoint=EndToEndShadowIndexingTest.s3_host_name,
            cloud_storage_api_endpoint_port=9000,
            cloud_storage_reconciliation_interval_ms=500,
            cloud_storage_max_connections=5,
            log_segment_size=EndToEndShadowIndexingTest.segment_size,  # 1MB
        )

        self.scale = Scale(test_context)
        self.redpanda = RedpandaService(
            context=test_context,
            num_brokers=3,
            extra_rp_conf=self._extra_rp_conf,
        )

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.s3_client = S3Client(
            region=EndToEndShadowIndexingTest.s3_region,
            access_key=EndToEndShadowIndexingTest.s3_access_key,
            secret_key=EndToEndShadowIndexingTest.s3_secret_key,
            endpoint=f"http://{EndToEndShadowIndexingTest.s3_host_name}:9000",
            logger=self.logger,
        )

    def setUp(self):
        self.s3_client.empty_bucket(self.s3_bucket_name)
        self.s3_client.create_bucket(self.s3_bucket_name)
        self.redpanda.start()
        for topic in EndToEndShadowIndexingTest.topics:
            self.kafka_tools.create_topic(topic)

    def tearDown(self):
        self.s3_client.empty_bucket(self.s3_bucket_name)

    @cluster(num_nodes=5)
    def test_write(self):
        """Write at least 10 segments, set retention policy to leave only 5
        segments, wait for segments removal, consume data and run validation,
        that everything that is acked is consumed."""
        self.start_producer()
        produce_until_segments(
            redpanda=self.redpanda,
            topic=self.topic,
            partition_idx=0,
            count=10,
        )

        self.kafka_tools.alter_topic_config(
            self.topic,
            {
                TopicSpec.PROPERTY_RETENTION_BYTES:
                5 * EndToEndShadowIndexingTest.segment_size,
            },
        )
        wait_for_segments_removal(redpanda=self.redpanda,
                                  topic=self.topic,
                                  partition_idx=0,
                                  count=6)

        self.start_consumer()
        self.run_validation()
Example #15
class ConsumerOffsetsMigrationTest(EndToEndTest):
    max_suspend_duration_sec = 3
    min_inter_failure_time_sec = 30
    max_inter_failure_time_sec = 60

    @cluster(num_nodes=7, log_allow_list=CHAOS_LOG_ALLOW_LIST)
    @matrix(failures=[True, False], cpus=[1, 3])
    def test_migrating_consume_offsets(self, failures, cpus):
        '''
        Validates correctness while executing consumer offsets migration
        '''

        # set the redpanda logical version to a value without __consumer_offsets support
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            resource_settings=ResourceSettings(num_cpus=cpus),
            extra_rp_conf={
                "group_topic_partitions": 16,
                "default_topic_replications": 3,
            },
            environment={"__REDPANDA_LOGICAL_VERSION": 1})

        self.redpanda.start()
        self._client = DefaultClient(self.redpanda)
        # set of failure suppressed nodes - required to make restarts deterministic
        suppressed = set()

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                node = random.choice(self.redpanda.nodes)
                while self.redpanda.idx(node) in suppressed:
                    node = random.choice(self.redpanda.nodes)

                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(
                        1,
                        ConsumerOffsetsMigrationTest.max_suspend_duration_sec)

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(
                    ConsumerOffsetsMigrationTest.min_inter_failure_time_sec,
                    ConsumerOffsetsMigrationTest.max_inter_failure_time_sec)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()
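        # create the topic used by the verifiable producer and consumer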
        spec = TopicSpec(partition_count=6, replication_factor=3)
        self.client().create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=5000)
        self.start_consumer(1)
        self.await_startup()

        def cluster_is_stable():
            admin = Admin(self.redpanda)
            brokers = admin.get_brokers()
            if len(brokers) < 3:
                return False

            for b in brokers:
                self.logger.debug(f"broker:  {b}")
                if not (b['is_alive'] and 'disk_space' in b):
                    return False

            return True

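        # use KCL to verify that a consumer group exists and that the
        # __consumer_offsets topic has not been created yet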
        kcl = KCL(self.redpanda)

        def _group_present():
            return len(kcl.list_groups().splitlines()) > 1

        # make sure that group is there
        wait_until(_group_present, 10, 1)

        # check that consumer offsets topic is not present
        topics = set(kcl.list_topics())

        assert "__consumer_offsets" not in topics

        # enable consumer offsets support
        self.redpanda.set_environment({"__REDPANDA_LOGICAL_VERSION": 2})
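        # roll the nodes one at a time; failure injection is suppressed on the
        # node being restarted so the rolling restart stays deterministic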
        for n in self.redpanda.nodes:
            id = self.redpanda.idx(n)
            suppressed.add(id)
            self.redpanda.restart_nodes(n, stop_timeout=60)
            suppressed.remove(id)
            # wait for the cluster to become stable before restarting the next node
            wait_until(cluster_is_stable, 90, backoff_sec=2)

        def _consumer_offsets_present():
            try:
                partitions = list(
                    self.client().describe_topic("__consumer_offsets"))
                return len(partitions) > 0
            except Exception:
                return False

        wait_until(_consumer_offsets_present, timeout_sec=90, backoff_sec=3)

        self.run_validation(min_records=100000,
                            producer_timeout_sec=300,
                            consumer_timeout_sec=180)

    @cluster(num_nodes=5, log_allow_list=RESTART_LOG_ALLOW_LIST)
    def test_cluster_is_available_during_upgrade_without_group_topic(self):
        '''
        Validates that the cluster is available and healthy during an
        upgrade when the `kafka_internal::group` topic is not present
        '''

        # set redpanda logical version to value without __consumer_offsets support
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            extra_rp_conf={
                "group_topic_partitions": 16,
                "default_topic_replications": 3,
            },
            environment={"__REDPANDA_LOGICAL_VERSION": 1})

        self.redpanda.start()
        self._client = DefaultClient(self.redpanda)

        spec = TopicSpec(partition_count=6, replication_factor=3)
        self.client().create_topic(spec)
        self.topic = spec.name

        def cluster_is_stable():
            admin = Admin(self.redpanda)
            brokers = admin.get_brokers()
            if len(brokers) < 3:
                return False

            for b in brokers:
                self.logger.debug(f"broker:  {b}")
                if not (b['is_alive'] and 'disk_space' in b):
                    return False

            return True

        def node_stopped(node_id):
            admin = Admin(self.redpanda)
            brokers = admin.get_brokers()

            for b in brokers:
                self.logger.debug(f"broker:  {b}")
                if b['node_id'] == node_id:
                    return not b['is_alive']

            return False

        kcl = KCL(self.redpanda)

        # check that consumer offsets topic is not present
        topics = set(kcl.list_topics())

        assert "__consumer_offsets" not in topics

        # enable consumer offsets support
        self.redpanda.set_environment({"__REDPANDA_LOGICAL_VERSION": 2})

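        # pick a node that is not the current controller (raft group 0 leader)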
        def get_raft0_follower():
            ctrl = self.redpanda.controller
            node = random.choice(self.redpanda.nodes)
            while self.redpanda.idx(node) == self.redpanda.idx(ctrl):
                node = random.choice(self.redpanda.nodes)

            return node

        # restart node that is not controller
        n = get_raft0_follower()
        self.logger.info(f"restarting node {n.account.hostname}")
        self.redpanda.stop_node(n, timeout=60)
        # wait until the stopped node is reported as not alive
        wait_until(lambda: node_stopped(self.redpanda.idx(n)),
                   90,
                   backoff_sec=2)
        self.redpanda.start_node(n)
        wait_until(cluster_is_stable, 90, backoff_sec=2)
Exemplo n.º 16
0
class AvailabilityTests(EndToEndFinjectorTest):
    def validate_records(self):
        min_records = 40000
        producer_timeout_sec = 60
        consumer_timeout_sec = 60

        if self.scale.ci or self.scale.release:
            min_records = 100000
            producer_timeout_sec = 180
            consumer_timeout_sec = 180

        self.run_validation(min_records=min_records,
                            enable_idempotence=False,
                            producer_timeout_sec=producer_timeout_sec,
                            consumer_timeout_sec=consumer_timeout_sec)

    @cluster(num_nodes=5)
    def test_availability_when_one_node_failed(self):
        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 1,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        spec = TopicSpec(name="test-topic",
                         partition_count=6,
                         replication_factor=3)

        self.redpanda.create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=10000)
        self.start_consumer(1)
        self.await_startup()
        # start failure injector with default parameters
        self.start_finjector()

        self.validate_records()

    @cluster(num_nodes=5)
    def test_recovery_after_catastrophic_failure(self):

        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 1,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        spec = TopicSpec(name="test-topic",
                         partition_count=6,
                         replication_factor=3)

        self.redpanda.create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=10000)
        self.start_consumer(1)
        self.await_startup()

        # inject permanent random failure
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             random.choice(self.redpanda.nodes[0:1]))

        self.inject_failure(f_spec)

        # inject transient failure on other node
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             self.redpanda.nodes[2],
                             length=2.0 if self.scale.local else 15.0)

        self.inject_failure(f_spec)

        self.validate_records()
Exemplo n.º 17
0
class NodeOperationFuzzyTest(EndToEndTest):
    max_suspend_duration_seconds = 10
    min_inter_failure_time = 30
    max_inter_failure_time = 60

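    # Build a random sequence of node ADD/DECOMMISSION operations interleaved
    # with topic create/delete operations; at most two nodes are decommissioned
    # at a time and node 1 (and any node in skip_nodes) is never decommissioned.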
    def generate_random_workload(self, count, skip_nodes, available_nodes):
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = list(available_nodes)
        decommissioned_nodes = []
        operations = []
        topics = []

        def eligible_active_nodes():
            return list(
                filter(lambda n: not (n == 1 or n in skip_nodes),
                       active_nodes))

        def decommission(id):
            active_nodes.remove(id)
            decommissioned_nodes.append(id)

        def add(id):
            active_nodes.append(id)
            decommissioned_nodes.remove(id)

        for _ in range(0, count):
            if len(decommissioned_nodes) == 2:
                id = random.choice(decommissioned_nodes)
                operations.append((ADD, id))
                add(id)
            elif len(decommissioned_nodes) == 0:
                id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, id))
                decommission(id)
            else:
                op = random.choice(op_types)
                if op == DECOMMISSION:
                    id = random.choice(eligible_active_nodes())
                    operations.append((DECOMMISSION, id))
                    decommission(id)
                elif op == ADD:
                    id = random.choice(decommissioned_nodes)
                    operations.append((ADD, id))
                    add(id)
            # topic operation
            if len(topics) == 0:
                op = ADD_TOPIC
            else:
                op = random.choice(tp_op_types)

            if op == ADD_TOPIC:
                operations.append((
                    ADD_TOPIC,
                    f"test-topic-{random.randint(0,2000)}-{round(time.time()*1000000)}",
                    random.choice(ALLOWED_REPLICATION), 3))
            else:
                operations.append((DELETE_TOPIC, random.choice(topics)))

        return operations

    def _create_random_topics(self, count):
        max_partitions = 10

        topics = []
        for i in range(0, count):
            name = f"topic-{i}"
            spec = TopicSpec(
                name=name,
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))

            topics.append(spec)

        for spec in topics:
            DefaultClient(self.redpanda).create_topic(spec)

        return topics

    """
    Adding nodes to the cluster should result in partition reallocations to new
    nodes
    """

    @cluster(num_nodes=7, log_allow_list=CHAOS_LOG_ALLOW_LIST)
    @parametrize(enable_failures=True)
    @parametrize(enable_failures=False)
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        self.active_nodes = set(
            [self.redpanda.idx(n) for n in self.redpanda.nodes])
        # collect current mapping
        self.ids_mapping = {}
        for n in self.redpanda.nodes:
            self.ids_mapping[self.redpanda.idx(n)] = self.redpanda.idx(n)
        self.next_id = sorted(list(self.ids_mapping.keys()))[-1] + 1
        self.redpanda.logger.info(f"Initial ids mapping: {self.ids_mapping}")
        NODE_OP_TIMEOUT = 360

        def get_next_id():
            id = self.next_id
            self.next_id += 1
            return id

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(
                        1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (so as not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes))
                    node = self.redpanda.get_node(idx)

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(
                    NodeOperationFuzzyTest.min_inter_failure_time,
                    NodeOperationFuzzyTest.max_inter_failure_time)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

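        # decommission a broker via the admin API, wait until a majority of the
        # nodes no longer list it, then stop the underlying node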
        def decommission(idx):
            node_id = self.ids_mapping[idx]
            self.logger.info(f"decommissioning node: {idx} with id: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, treat it as success

                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            admin = Admin(self.redpanda)

            def is_node_removed(idx_to_query, node_id):
                try:
                    brokers = admin.get_brokers(
                        self.redpanda.get_node(idx_to_query))
                    ids = map(lambda broker: broker['node_id'], brokers)
                    return node_id not in ids
                except Exception:
                    return False

            def node_removed():
                node_removed_cnt = 0
                for idx in self.active_nodes:
                    if is_node_removed(idx, node_id):
                        node_removed_cnt += 1

                node_count = len(self.redpanda.nodes)
                majority = int(node_count / 2) + 1
                self.redpanda.logger.debug(
                    f"node {node_id} removed on {node_removed_cnt} nodes, majority: {majority}"
                )
                return node_removed_cnt >= majority

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)
            self.redpanda.stop_node(self.redpanda.get_node(idx))

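        # count replicas per node from kafkacat metadata; used later to detect
        # when a re-added node starts receiving partition replicas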
        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

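        # build a seed server list containing every node except the one being
        # (re)started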
        def seed_servers_for(idx):
            seeds = map(
                lambda n: {
                    "address": n.account.hostname,
                    "port": 33145
                }, self.redpanda.nodes)

            return list(
                filter(
                    lambda n: n['address'] != self.redpanda.get_node(idx).
                    account.hostname, seeds))

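        # re-add a node under a fresh node id: stop it, optionally wipe its
        # data directory, restart it with an overridden node_id and seed server
        # list, and wait until it hosts at least one replica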
        def add_node(idx, cleanup=True):
            id = get_next_id()
            self.logger.info(f"adding node: {idx} back with new id: {id}")
            self.ids_mapping[idx] = id
            self.redpanda.stop_node(self.redpanda.get_node(idx))
            if cleanup:
                self.redpanda.clean_node(self.redpanda.get_node(idx),
                                         preserve_logs=True)
            # do not reuse the previous node id and override the seed server list
            self.redpanda.start_node(
                self.redpanda.get_node(idx),
                timeout=NodeOperationFuzzyTest.min_inter_failure_time +
                NodeOperationFuzzyTest.max_suspend_duration_seconds + 30,
                override_cfg_params={
                    "node_id": id,
                    "seed_servers": seed_servers_for(idx)
                })

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

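        # topic helpers; creation and deletion are retried by the caller until
        # the topic listing reflects the change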
        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                DefaultClient(self.redpanda).create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                DefaultClient(self.redpanda).delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

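        # generate and execute the random node and topic operations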
        work = self.generate_random_workload(10,
                                             skip_nodes=set(),
                                             available_nodes=self.active_nodes)
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(
                f"executing - {op} - current ids: {self.ids_mapping}")
            if op_type == ADD:
                idx = op[1]
                self.active_nodes.add(idx)
                add_node(idx)
            elif op_type == DECOMMISSION:
                idx = op[1]
                self.active_nodes.remove(idx)
                decommission(idx)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Exemplo n.º 18
0
class MetricsReporterTest(Test):
    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        super(MetricsReporterTest, self).__init__(test_context=test_ctx)

    """
    Validates that the metrics reporter periodically sends cluster metadata
    reports to an external HTTP endpoint.
    """

    @cluster(num_nodes=4)
    def test_redpanda_metrics_reporting(self):
        """
        Creates topics, waits for metrics reports to arrive at a test HTTP
        server, and validates the topic, partition, and node metadata in the
        last report
        """
        # setup http server
        http = HttpServer(self._ctx)
        http.start()
        # report every two seconds
        extra_conf = {
            "health_monitor_tick_interval": 1000,
            "metrics_reporter_tick_interval": 2000,
            "metrics_reporter_report_interval": 1000,
            "enable_metrics_reporter": True,
            "metrics_reporter_url": f"{http.url}/metrics",
        }
        self.redpanda = RedpandaService(self.test_context,
                                        3,
                                        KafkaCliTools,
                                        extra_rp_conf=extra_conf)

        self.redpanda.start()

        total_topics = 5
        total_partitions = 0
        for _ in range(0, total_topics):
            partitions = random.randint(1, 8)
            total_partitions += partitions
            self.redpanda.create_topic(
                [TopicSpec(partition_count=partitions, replication_factor=3)])

        # log a summary of the created topics
        self.redpanda.logger.info(
            f"created {total_topics} topics with {total_partitions} partitions"
        )

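        # the reporter is caught up once the most recent report received by the
        # HTTP server lists all of the created topics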
        def _state_up_to_date():
            if http.requests:
                r = json.loads(http.requests[-1]['body'])
                return r['topic_count'] == total_topics
            return False

        wait_until(_state_up_to_date, 20, backoff_sec=1)
        http.stop()
        metadata = [json.loads(r['body']) for r in http.requests]
        for m in metadata:
            self.redpanda.logger.info(m)

        def assert_fields_are_the_same(metadata, field):
            assert all(m[field] == metadata[0][field] for m in metadata)

        # cluster uuid and create timestamp should stay the same across requests
        assert_fields_are_the_same(metadata, 'cluster_uuid')
        assert_fields_are_the_same(metadata, 'cluster_created_ts')
        # get the last report
        last = metadata.pop()
        assert last['topic_count'] == total_topics
        assert last['partition_count'] == total_partitions
        nodes_meta = last['nodes']

        assert len(last['nodes']) == 3

        assert all('node_id' in n for n in nodes_meta)
        assert all('cpu_count' in n for n in nodes_meta)
        assert all('version' in n for n in nodes_meta)
        assert all('uptime_ms' in n for n in nodes_meta)
        assert all('is_alive' in n for n in nodes_meta)
        assert all('disks' in n for n in nodes_meta)