Example #1
class AvailabilityTests(EndToEndFinjectorTest):
    def validate_records(self):
        min_records = 40000
        producer_timeout_sec = 60
        consumer_timeout_sec = 60

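        # larger scale runs (CI / release) push more data through the system,
        # so raise the minimum record count and the timeouts accordingly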
        if self.scale.ci or self.scale.release:
            min_records = 100000
            producer_timeout_sec = 180
            consumer_timeout_sec = 180

        self.run_validation(min_records=min_records,
                            enable_idempotence=False,
                            producer_timeout_sec=producer_timeout_sec,
                            consumer_timeout_sec=consumer_timeout_sec)

    @cluster(num_nodes=5)
    def test_availability_when_one_node_failed(self):
        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 1,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        spec = TopicSpec(name="test-topic",
                         partition_count=6,
                         replication_factor=3)

        self.redpanda.create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=10000)
        self.start_consumer(1)
        self.await_startup()
        # start failure injector with default parameters
        self.start_finjector()

        self.validate_records()

    @cluster(num_nodes=5)
    def test_recovery_after_catastrophic_failure(self):

        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 1,
                "default_topic_replications": 3,
            })

        self.redpanda.start()
        spec = TopicSpec(name="test-topic",
                         partition_count=6,
                         replication_factor=3)

        self.redpanda.create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=10000)
        self.start_consumer(1)
        self.await_startup()

        # inject a permanent failure of a random type on the first node
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             random.choice(self.redpanda.nodes[0:1]))

        self.inject_failure(f_spec)

        # inject a transient failure of a random type on another node
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             self.redpanda.nodes[2],
                             length=2.0 if self.scale.local else 15.0)

        self.inject_failure(f_spec)

        self.validate_records()
Example #2
class NodeOperationFuzzyTest(EndToEndTest):
    def generate_random_workload(self, count, skip_nodes):
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = [1, 2, 3, 4, 5]
        decommissioned_nodes = []
        operations = []
        topics = []

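        # node 1 and any node listed in skip_nodes are never eligible for
        # decommissioning, so the test always keeps at least one stable node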
        def eligible_active_nodes():
            return list(
                filter(lambda n: not (n == 1 or n in skip_nodes),
                       active_nodes))

        def decommission(id):
            active_nodes.remove(id)
            decommissioned_nodes.append(id)

        def add(id):
            active_nodes.append(id)
            decommissioned_nodes.remove(id)

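        # keep the number of decommissioned nodes between 0 and 2: re-add a
        # node when two are already out, decommission one when all are active,
        # otherwise pick the next operation at random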
        for _ in range(0, count):
            if len(decommissioned_nodes) == 2:
                id = random.choice(decommissioned_nodes)
                operations.append((ADD, id))
                add(id)
            elif len(decommissioned_nodes) == 0:
                id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, id))
                decommission(id)
            else:
                op = random.choice(op_types)
                if op == DECOMMISSION:
                    id = random.choice(eligible_active_nodes())
                    operations.append((DECOMMISSION, id))
                    decommission(id)
                elif op == ADD:
                    id = random.choice(decommissioned_nodes)
                    operations.append((ADD, id))
                    add(id)
            # topic operation
            if len(topics) == 0:
                op = ADD_TOPIC
            else:
                op = random.choice(tp_op_types)

            if op == ADD_TOPIC:
                operations.append((
                    ADD_TOPIC,
                    f"test-topic-{random.randint(0,2000)}-{round(time.time()*1000000)}",
                    random.choice(ALLOWED_REPLICATION), 3))
            else:
                operations.append((DELETE_TOPIC, random.choice(topics)))

        return operations

    def _create_random_topics(self, count):
        max_partitions = 10

        topics = []
        for i in range(0, count):
            name = f"topic-{i}"
            spec = TopicSpec(
                name=name,
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))

            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)

        return topics

    """
    Adding nodes to the cluster should result in partition reallocations to new 
    nodes
    """

    @cluster(num_nodes=7)
    @parametrize(enable_failures=True)
    @parametrize(enable_failures=False)
    def test_node_operations(self, enable_failures):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        self.active_nodes = set([1, 2, 3, 4, 5])

        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        NODE_OP_TIMEOUT = 360

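        # background loop that keeps injecting random failures until the
        # enable_failures flag is cleared once the workload has been executed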
        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(1, 10)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (so as not to influence the test outcome)
                    idx = random.choice(list(self.active_nodes)) - 1
                    node = self.redpanda.nodes[idx]

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, treat it as a success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except requests.exceptions.RetryError:
                    return False
                except requests.exceptions.ConnectionError:
                    return False
                except requests.exceptions.HTTPError:
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

            def node_removed():
                admin = Admin(self.redpanda)
                try:
                    brokers = admin.get_brokers(node=self.redpanda.nodes[0])
                    for b in brokers:
                        if b['node_id'] == node_id:
                            return False
                    return True
                except Exception:
                    return False

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

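        # count how many partition replicas each broker currently hosts,
        # based on metadata reported by kafkacat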
        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1],
                                         preserve_logs=True)
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            for l in lines:
                if l.startswith(name):
                    return True
            return False

        def create_topic(spec):
            try:
                self.redpanda.create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warn(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            try:
                self.redpanda.delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warn(f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")

            if op_type == ADD:
                id = op[1]
                self.active_nodes.add(id)
                restart_node(id)
            elif op_type == DECOMMISSION:
                id = op[1]
                self.active_nodes.remove(id)
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                wait_until(lambda: create_topic(spec),
                           timeout_sec=180,
                           backoff_sec=2)
            elif op_type == DELETE_TOPIC:
                wait_until(lambda: delete_topic(op[1]),
                           timeout_sec=180,
                           backoff_sec=2)

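        # clearing this flag stops the failure injector loop, which re-reads
        # it on every iteration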
        enable_failures = False
        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
Example #3
class NodeOperationFuzzyTest(EndToEndTest):
    def generate_random_workload(self, count, skip_nodes):
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = [1, 2, 3, 4, 5]
        decommissioned_nodes = []
        operations = []
        topics = []

        def eligible_active_nodes():
            return list(
                filter(lambda n: not (n == 1 or n in skip_nodes),
                       active_nodes))

        def decommission(id):
            active_nodes.remove(id)
            decommissioned_nodes.append(id)

        def add(id):
            active_nodes.append(id)
            decommissioned_nodes.remove(id)

        for _ in range(0, count):
            if len(decommissioned_nodes) == 2:
                id = random.choice(decommissioned_nodes)
                operations.append((ADD, id))
                add(id)
            elif len(decommissioned_nodes) == 0:
                id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, id))
                decommission(id)
            else:
                op = random.choice(op_types)
                if op == DECOMMISSION:
                    id = random.choice(eligible_active_nodes())
                    operations.append((DECOMMISSION, id))
                    decommission(id)
                elif op == ADD:
                    id = random.choice(decommissioned_nodes)
                    operations.append((ADD, id))
                    add(id)
            # topic operation
            if len(topics) == 0:
                op = ADD_TOPIC
            else:
                op = random.choice(tp_op_types)

            if op == ADD_TOPIC:
                operations.append((
                    ADD_TOPIC,
                    f"test-topic-{random.randint(0,2000)}-{time.time()*1000.0}",
                    random.choice(ALLOWED_REPLICATION), 3))
            else:
                operations.append((DELETE_TOPIC, random.choice(topics)))

        return operations

    def _create_random_topics(self, count):
        max_partitions = 10

        topics = []
        for i in range(0, count):
            name = f"topic-{i}"
            spec = TopicSpec(
                name=name,
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))

            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)

        return topics

    """
    Adding nodes to the cluster should result in partition reallocations to new 
    nodes
    """

    @cluster(num_nodes=7)
    def test_node_operations(self):
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        # start all 5 nodes
        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()

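        # decommission a broker via the admin API and wait until it is no
        # longer reported in the broker list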
        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")
            admin = Admin(self.redpanda)
            admin.decommission_broker(id=node_id)

            def node_removed():
                admin = Admin(self.redpanda)
                brokers = admin.get_brokers()
                for b in brokers:
                    if b['node_id'] == node_id:
                        return False
                return True

            wait_until(node_removed, timeout_sec=240, backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
            if cleanup:
                self.redpanda.clean_node(self.redpanda.nodes[node_id - 1])
            self.redpanda.start_node(self.redpanda.nodes[node_id - 1])
            admin = Admin(self.redpanda)
            admin.set_log_level("cluster", "trace")

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)

        admin = Admin(self.redpanda)
        admin.set_log_level("cluster", "trace")
        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")
            if op_type == ADD:
                id = op[1]
                restart_node(id)
            elif op_type == DECOMMISSION:
                id = op[1]
                decommission(id)
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])

                self.redpanda.create_topic(spec)
            elif op_type == DELETE_TOPIC:
                self.redpanda.delete_topic(op[1])

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
Example #4
class FetchAfterDeleteTest(Test):
    def __init__(self, test_context):
        super(FetchAfterDeleteTest, self).__init__(test_context)
        self.scale = Scale(test_context)

    @cluster(num_nodes=3)
    @parametrize(transactions_enabled=True)
    @parametrize(transactions_enabled=False)
    def test_fetch_after_committed_offset_was_removed(self,
                                                      transactions_enabled):
        """
        Test fetching when consumer offset was deleted by retention
        """
        segment_size = 1048576
        self.redpanda = RedpandaService(self.test_context,
                                        3,
                                        KafkaCliTools,
                                        extra_rp_conf={
                                            "enable_transactions":
                                            transactions_enabled,
                                            "enable_idempotence":
                                            transactions_enabled,
                                            "log_compaction_interval_ms": 5000,
                                            "log_segment_size": segment_size,
                                            "enable_leader_balancer": False,
                                        })
        self.redpanda.start()
        topic = TopicSpec(partition_count=1,
                          replication_factor=3,
                          cleanup_policy=TopicSpec.CLEANUP_DELETE)
        self.redpanda.create_topic(topic)
        self.topic = topic.name

        kafka_tools = KafkaCliTools(self.redpanda)

        # produce until the partition has accumulated at least 10 log segments
        produce_until_segments(
            self.redpanda,
            topic=self.topic,
            partition_idx=0,
            count=10,
        )
        consumer_group = 'test'
        rpk = RpkTool(self.redpanda)

        def consume(n=1):

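            # rpk prints one JSON object per record; split on '}' and restore
            # the brace so each record can be parsed individually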
            out = rpk.consume(self.topic, group=consumer_group, n=n)
            split = out.split('}')
            split = filter(lambda s: "{" in s, split)

            return map(lambda s: json.loads(s + "}"), split)

        # consume from the beginning
        msgs = consume(10)
        last = list(msgs).pop()
        offset = last['offset']

        # shrink the retention size so that older segments are removed
        kafka_tools.alter_topic_config(
            self.topic, {
                TopicSpec.PROPERTY_RETENTION_BYTES: 2 * segment_size,
            })

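        # wait until retention removes enough segments that the previously
        # committed offset falls before the new start offset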
        wait_for_segments_removal(self.redpanda,
                                  self.topic,
                                  partition_idx=0,
                                  count=5)

        partitions = list(rpk.describe_topic(self.topic))
        p = partitions[0]
        assert p.start_offset > offset
        # consume from an offset that no longer exists;
        # the previously committed offset has already been removed by retention
        out = list(consume(1))
        assert out[0]['offset'] == p.start_offset
Example #5
class ScalingUpTest(EndToEndTest):
    """
    Adding nodes to the cluster should result in partition reallocations to new 
    nodes
    """
    @cluster(num_nodes=5)
    def test_adding_nodes_to_cluster(self):
        self.redpanda = RedpandaService(self.test_context, 3, KafkaCliTools)
        # start single node cluster
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])
        # create some topics
        topics = []
        total_replicas = 0
        for partition_count in range(1, 5):
            name = f"topic{len(topics)}"
            spec = TopicSpec(name=name,
                             partition_count=partition_count,
                             replication_factor=1)
            total_replicas += partition_count
            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)
            self.topic = spec.name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        # add second node
        self.redpanda.start_node(self.redpanda.nodes[1])
        kafkacat = KafkaCat(self.redpanda)

        def _replicas_per_node():
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        id = r['id']
                        if id not in node_replicas:
                            node_replicas[id] = 0
                        node_replicas[id] += 1

            return node_replicas

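        # the cluster is considered rebalanced once every started node hosts
        # replicas, the total replica count matches the expectation and every
        # node holds more than one replica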
        def partitions_rebalanced():
            per_node = _replicas_per_node()
            self.redpanda.logger.info(f"replicas per node: {per_node}")
            if len(per_node) < len(self.redpanda.started_nodes()):
                return False

            replicas = sum(per_node.values())
            if replicas != total_replicas:
                return False

            return all(p[1] > 1 for p in per_node.items())

        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)
        # add third node
        self.redpanda.start_node(self.redpanda.nodes[2])
        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
Example #6
class MetricsReporterTest(Test):
    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        super(MetricsReporterTest, self).__init__(test_context=test_ctx)

    """
    Validates key availability properties of the system using a single
    partition.
    """

    @cluster(num_nodes=4)
    def test_redpanda_metrics_reporting(self):
        """
        Testing if when fetching from single node all partitions are 
        returned in round robin fashion
        """
        # setup http server
        http = HttpServer(self._ctx)
        http.start()
        # report every two seconds
        extra_conf = {
            "health_monitor_tick_interval": 1000,
            "metrics_reporter_tick_interval": 2000,
            "metrics_reporter_report_interval": 1000,
            "enable_metrics_reporter": True,
            "metrics_reporter_url": f"{http.url}/metrics",
        }
        self.redpanda = RedpandaService(self.test_context,
                                        3,
                                        KafkaCliTools,
                                        extra_rp_conf=extra_conf)

        self.redpanda.start()

        total_topics = 5
        total_partitions = 0
        # create topics with a random number of partitions each
        for _ in range(0, total_topics):
            partitions = random.randint(1, 8)
            total_partitions += partitions
            self.redpanda.create_topic(
                [TopicSpec(partition_count=partitions, replication_factor=3)])

        self.redpanda.logger.info(
            f"created {total_topics} topics with {total_partitions} partitions"
        )

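        # wait until the reporter has posted a payload that accounts for all
        # of the created topics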
        def _state_up_to_date():
            if http.requests:
                r = json.loads(http.requests[-1]['body'])
                return r['topic_count'] == total_topics
            return False

        wait_until(_state_up_to_date, 20, backoff_sec=1)
        http.stop()
        metadata = [json.loads(r['body']) for r in http.requests]
        for m in metadata:
            self.redpanda.logger.info(m)

        def assert_fields_are_the_same(metadata, field):
            assert all(m[field] == metadata[0][field] for m in metadata)

        # cluster uuid and create timestamp should stay the same across requests
        assert_fields_are_the_same(metadata, 'cluster_uuid')
        assert_fields_are_the_same(metadata, 'cluster_created_ts')
        # get the last report
        last = metadata.pop()
        assert last['topic_count'] == total_topics
        assert last['partition_count'] == total_partitions
        nodes_meta = last['nodes']

        assert len(last['nodes']) == 3

        assert all('node_id' in n for n in nodes_meta)
        assert all('cpu_count' in n for n in nodes_meta)
        assert all('version' in n for n in nodes_meta)
        assert all('uptime_ms' in n for n in nodes_meta)
        assert all('is_alive' in n for n in nodes_meta)
        assert all('disks' in n for n in nodes_meta)