Example #1
# excerpt: a readiness predicate for wait_until; `self` and `node_id` come
# from the enclosing test
def node_removed():
    admin = Admin(self.redpanda)
    brokers = admin.get_brokers()
    for b in brokers:
        if b['node_id'] == node_id:
            return False
    return True
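The same check appears below as a nested predicate inside a full decommissioning test: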
    def test_decommissioning_working_node(self):
        self.start_redpanda(num_nodes=4)
        topics = []
        for partition_count in range(1, 5):
            for replication_factor in (1, 3):
                name = f"topic{len(topics)}"
                spec = TopicSpec(name=name,
                                 partition_count=partition_count,
                                 replication_factor=replication_factor)
                topics.append(spec)

        for spec in topics:
            self.client().create_topic(spec)
            self.topic = spec.name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        admin = Admin(self.redpanda)

        brokers = admin.get_brokers()
        to_decommission = random.choice(brokers)
        self.logger.info(f"decommissioning node: {to_decommission}", )
        admin.decommission_broker(to_decommission['node_id'])

        def node_removed():
            brokers = admin.get_brokers()
            for b in brokers:
                if b['node_id'] == to_decommission['node_id']:
                    return False
            return True

        wait_until(node_removed, timeout_sec=120, backoff_sec=2)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
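Note that Example #8 below refines this test with a survivor_node, so that the get_brokers poll is never routed to the broker that is being removed.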
Example #3
    def test_moving_not_fully_initialized_partition(self):
        """
        Move a partition before the first leader is elected.
        """
        self.start_redpanda(num_nodes=3)

        hb = HoneyBadger()
        # if failure injector is not enabled simply skip this test
        if not hb.is_enabled(self.redpanda.nodes[0]):
            return

        for n in self.redpanda.nodes:
            hb.set_exception(n, 'raftgen_service::failure_probes', 'vote')
        topic = "topic-1"
        partition = 0
        spec = TopicSpec(name=topic, partition_count=1, replication_factor=3)
        self.redpanda.create_topic(spec)
        admin = Admin(self.redpanda)

        # the topic-partition under test
        self.logger.info(f"selected topic-partition: {topic}-{partition}")

        # get the partition's replica set, including core assignments. the kafka
        # api doesn't expose core information, so we use the redpanda admin api.
        assignments = self._get_assignments(admin, topic, partition)
        self.logger.info(f"assignments for {topic}-{partition}: {assignments}")

        brokers = admin.get_brokers()
        # randomize the core for every replica in the assignment
        for assignment in assignments:
            for broker in brokers:
                if broker['node_id'] == assignment['node_id']:
                    assignment['core'] = random.randint(
                        0, broker["num_cores"] - 1)
        self.logger.info(
            f"new assignments for {topic}-{partition}: {assignments}")

        admin.set_partition_replicas(topic, partition, assignments)

        def status_done():
            info = admin.get_partitions(topic, partition)
            self.logger.info(
                f"current assignments for {topic}-{partition}: {info}")
            converged = self._equal_assignments(info["replicas"], assignments)
            return converged and info["status"] == "done"

        # unset failures
        for n in self.redpanda.nodes:
            hb.unset_failures(n, 'raftgen_service::failure_probes', 'vote')
        # wait until redpanda reports complete
        wait_until(status_done, timeout_sec=30, backoff_sec=1)

        def derived_done():
            info = self._get_current_partitions(admin, topic, partition)
            self.logger.info(
                f"derived assignments for {topic}-{partition}: {info}")
            return self._equal_assignments(info, assignments)

        wait_until(derived_done, timeout_sec=30, backoff_sec=1)
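The test above calls _get_assignments and _equal_assignments, helpers defined elsewhere in the test class. A hedged reconstruction, inferred only from the call sites above (the names exist in the source; the bodies here are assumptions):

    # hypothetical reconstructions, inferred from the call sites above
    def _get_assignments(self, admin, topic, partition):
        # assumes admin.get_partitions returns a dict whose 'replicas' list
        # holds {'node_id': ..., 'core': ...} entries, as used elsewhere here
        info = admin.get_partitions(topic, partition)
        return [dict(node_id=r['node_id'], core=r['core'])
                for r in info['replicas']]

    def _equal_assignments(self, r0, r1):
        # compare replica sets as unordered (node_id, core) pairs
        def to_set(xs):
            return set((x['node_id'], x['core']) for x in xs)
        return to_set(r0) == to_set(r1)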
Example #4
# excerpt: as in Example #1, but directing the query at a fixed node and
# tolerating transient admin API failures; `node_id` comes from the
# enclosing test
def node_removed():
    admin = Admin(self.redpanda)
    try:
        brokers = admin.get_brokers(node=self.redpanda.nodes[0])
        for b in brokers:
            if b['node_id'] == node_id:
                return False
        return True
    except Exception:
        # the admin API may be briefly unavailable mid-decommission;
        # report "not removed yet" rather than failing the poll
        return False
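A minimal sketch of how such a predicate is driven; the timeout values are assumptions mirroring the wait_until calls in Examples #1 and #8:

# hypothetical driver; wait_until is ducktape's polling helper
wait_until(node_removed, timeout_sec=120, backoff_sec=2)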
Example #5
        def node_stopped(node_id):
            admin = Admin(self.redpanda)
            brokers = admin.get_brokers()

            for b in brokers:
                self.logger.debug(f"broker:  {b}")
                if b['node_id'] == node_id:
                    return not b['is_alive']

            return False
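Unlike the predicates above, node_stopped takes an argument, so a caller binds it first; a hedged usage sketch (timeouts are assumptions):

        # hypothetical usage: bind node_id, then poll until the broker is
        # reported as not alive
        wait_until(lambda: node_stopped(node_id), timeout_sec=30, backoff_sec=2)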
Example #6
    def registered(self, node):
        """
        Check if a newly added node is fully registered with the cluster, such
        that a kafka metadata request to any node in the cluster will include it.

        We first check the admin API to do a kafka-independent check, and then verify
        that kafka clients see the same thing.
        """
        idx = self.idx(node)
        self.logger.debug(
            f"registered: checking if broker {idx} ({node.name} is registered..."
        )

        # Query all nodes' admin APIs, so that we don't advance during setup until
        # the node is stored in raft0 AND has been replayed on all nodes.  Otherwise
        # a kafka metadata request to the last node to join could return incomplete
        # metadata and cause strange issues within a test.
        admin = Admin(self)
        for peer in self._started:
            try:
                admin_brokers = admin.get_brokers(node=peer)
            except requests.exceptions.RequestException as e:
                # We run during startup, when admin API may not even be listening yet: tolerate
                # API errors but presume that if some APIs are not up yet, then node registration
                # is also not complete.
                self.logger.debug(
                    f"registered: peer {peer.name} admin API not yet available ({e})"
                )
                return False

            found = idx in [b['node_id'] for b in admin_brokers]
            if not found:
                self.logger.info(
                    f"registered: node {node.name} not yet found in peer {peer.name}'s broker list ({admin_brokers})"
                )
                return False
            else:
                self.logger.debug(
                    f"registered: node {node.name} now visible in peer {peer.name}'s broker list ({admin_brokers})"
                )

        client = PythonLibrdkafka(self)
        brokers = client.brokers()
        broker = brokers.get(idx, None)
        if broker is None:
            # This should never happen, because we already checked via the admin API
            # that the node of interest had become visible to all peers.
            self.logger.error(
                f"registered: node {node.name} not found in kafka metadata!")
            assert broker is not None

        self.logger.debug(f"registered: found broker info: {broker}")
        return True
Example #7
        def cluster_is_stable():
            admin = Admin(self.redpanda)
            brokers = admin.get_brokers()
            if len(brokers) < 3:
                return False

            for b in brokers:
                self.logger.debug(f"broker:  {b}")
                if not (b['is_alive'] and 'disk_space' in b):
                    return False

            return True
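A hedged sketch of gating a test on this predicate before proceeding (the timeout is an assumption):

        # hypothetical: block until all three brokers are alive and report
        # disk_space before the test continues
        wait_until(cluster_is_stable, timeout_sec=120, backoff_sec=2)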
Example #8
    def test_decommissioning_working_node(self):
        self.start_redpanda(num_nodes=4)
        topics = []
        for partition_count in range(1, 5):
            for replication_factor in (1, 3):
                name = f"topic{len(topics)}"
                spec = TopicSpec(name=name,
                                 partition_count=partition_count,
                                 replication_factor=replication_factor)
                topics.append(spec)

        for spec in topics:
            self.client().create_topic(spec)
            self.topic = spec.name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()
        admin = Admin(self.redpanda)

        brokers = admin.get_brokers()
        to_decommission = random.choice(brokers)
        self.logger.info(f"decommissioning node: {to_decommission}", )
        admin.decommission_broker(to_decommission['node_id'])

        # A node which isn't being decommed, to use when calling into
        # the admin API from this point onwards.
        survivor_node = [
            n for n in self.redpanda.nodes
            if self.redpanda.idx(n) != to_decommission['node_id']
        ][0]
        self.logger.info(
            f"Using survivor node {survivor_node.name} {self.redpanda.idx(survivor_node)}"
        )

        def node_removed():
            brokers = admin.get_brokers(node=survivor_node)
            for b in brokers:
                if b['node_id'] == to_decommission['node_id']:
                    return False
            return True

        wait_until(node_removed, timeout_sec=120, backoff_sec=2)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
Example #9
            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if the broker is already draining, treat it as success
                    brokers = admin.get_brokers()
                    for b in brokers:
                        if b['node_id'] == node_id and b[
                                'membership_status'] == 'draining':
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except (requests.exceptions.RetryError,
                        requests.exceptions.ConnectionError,
                        requests.exceptions.HTTPError):
                    return False
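Because a broker that is already draining returns True before the decommission request is re-issued, the predicate is safe to poll repeatedly; a hedged driver sketch (timeouts are assumptions):

            # hypothetical driver: each retry either observes 'draining' or
            # re-issues the decommission request
            wait_until(decommissioned, timeout_sec=120, backoff_sec=2)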
Example #10
    def test_invalid_destination(self):
        """
        Check that requests to move to non-existent locations are properly rejected.
        """

        self.start_redpanda(num_nodes=3)
        spec = TopicSpec(name="topic", partition_count=1, replication_factor=1)
        self.client().create_topic(spec)
        topic = spec.name
        partition = 0

        admin = Admin(self.redpanda)
        brokers = admin.get_brokers()
        assignments = self._get_assignments(admin, topic, partition)

        # Pick a node id where the topic currently isn't allocated
        valid_dest = list(
            set(b['node_id'] for b in brokers) -
            set(a['node_id'] for a in assignments))[0]

        # This test will need updating on far-future hardware when core counts go higher
        invalid_shard = 1000
        invalid_dest = 30

        # A valid node but an invalid core
        assignments = [{"node_id": valid_dest, "core": invalid_shard}]
        try:
            r = admin.set_partition_replicas(topic, partition, assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 400
        else:
            raise RuntimeError(f"Expected 400 but got {r.status_code}")

        # An invalid node but a valid core
        assignments = [{"node_id": invalid_dest, "core": 0}]
        try:
            r = admin.set_partition_replicas(topic, partition, assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 400
        else:
            raise RuntimeError(f"Expected 400 but got {r.status_code}")

        # A syntactically invalid destination (float instead of int)
        # Reproducer for https://github.com/vectorizedio/redpanda/issues/2286
        assignments = [{"node_id": valid_dest, "core": 3.14}]
        try:
            r = admin.set_partition_replicas(topic, partition, assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 400
        else:
            raise RuntimeError(f"Expected 400 but got {r.status_code}")

        assignments = [{"node_id": 3.14, "core": 0}]
        try:
            r = admin.set_partition_replicas(topic, partition, assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 400
        else:
            raise RuntimeError(f"Expected 400 but got {r.status_code}")

        # Finally a valid move
        assignments = [{"node_id": valid_dest, "core": 0}]
        r = admin.set_partition_replicas(topic, partition, assignments)
        assert r.status_code == 200
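The four rejection checks above repeat one try/except shape; a hedged refactor sketch that factors it into a helper (the helper name is an assumption, not part of the original test):

    # hypothetical helper consolidating the repeated expect-400 pattern
    def _expect_bad_request(self, admin, topic, partition, assignments):
        try:
            r = admin.set_partition_replicas(topic, partition, assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 400
        else:
            raise RuntimeError(f"Expected 400 but got {r.status_code}")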
Example #11
class MaintenanceTest(RedpandaTest):
    topics = (TopicSpec(partition_count=10, replication_factor=3),
              TopicSpec(partition_count=20, replication_factor=3))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.admin = Admin(self.redpanda)
        self.rpk = RpkTool(self.redpanda)
        self._use_rpk = True

    def _has_leadership_role(self, node):
        """
        Returns true if node is leader for some partition, and false otherwise.
        """
        node_id = self.redpanda.idx(node)
        partitions = self.admin.get_partitions(node=node)
        has_leadership = False
        for p in partitions:
            if p["leader"] == node_id:
                self.logger.debug(f"{node.name} has leadership for {p}")
                has_leadership = True
        return has_leadership

    def _in_maintenance_mode(self, node):
        status = self.admin.maintenance_status(node)
        return status["draining"]

    def _in_maintenance_mode_fully(self, node):
        status = self.admin.maintenance_status(node)
        return status["finished"] and not status["errors"] and \
                status["partitions"] > 0

    def _verify_broker_metadata(self, maintenance_enabled, node):
        """
        check if both brokers interfaces in the admin server return
        the same status for maintenance mode. further, check if the
        mode is returning that draining has been enabled/disabled
        """
        node_id = self.redpanda.idx(node)
        broker_target = self.admin.get_broker(node_id)
        broker_filtered = None
        for broker in self.admin.get_brokers():
            if broker['node_id'] == node_id:
                broker_filtered = broker
                break
        # both apis should return the same info
        if broker_filtered is None:
            return False
        status = broker_target['maintenance_status']
        if status != broker_filtered['maintenance_status']:
            return False
        # check status wanted
        if maintenance_enabled:
            return status['draining'] and status['finished']
        else:
            return not status['draining']

    def _verify_maintenance_status(self, node, draining):
        """
        Check that cluster reports maintenance status as expected through
        both rpk status tooling as well as raw admin interface.
        """
        # get status for this node via rpk
        node_id = self.redpanda.idx(node)
        statuses = self.rpk.cluster_maintenance_status()
        self.logger.debug(f"finding node_id {node_id} in rpk "
                          "maintenance status: {statuses}")
        rpk_status = None
        for status in statuses:
            if status.node_id == node_id:
                rpk_status = status
                break
        if rpk_status is None:
            return False

        # get status for this node via admin interface
        admin_status = self.admin.maintenance_status(node)
        self.logger.debug(f"maintenance status from admin for "
                          "{node.name}: {admin_status}")

        # ensure that both agree on expected outcome
        return admin_status["draining"] == rpk_status.draining == draining

    def _enable_maintenance(self, node):
        """
        1. Verifies that node is leader for some partitions
        2. Verifies node is not already in maintenance mode
        3. Requests that node enter maintenance mode (persistent interface)
        4. Verifies node enters maintenance mode
        5. Verifies that node has no leadership role
        6. Verifies that maintenance mode completes

        Note that there is a terminology issue that we need to work on. When we
        say that 'maintenance mode completes' it doesn't mean that the node
        leaves maintenance mode. What we mean is that it has entered maintenance
        mode and all of the work associated with that has completed.
        """
        self.logger.debug(
            f"Checking that node {node.name} has a leadership role")
        wait_until(lambda: self._has_leadership_role(node),
                   timeout_sec=60,
                   backoff_sec=10)

        self.logger.debug(
            f"Checking that node {node.name} is not in maintenance mode")
        wait_until(lambda: self._verify_maintenance_status(node, False),
                   timeout_sec=30,
                   backoff_sec=5)

        self.logger.debug(
            f"Waiting for node {node.name} to enter maintenance mode")
        if self._use_rpk:
            self.rpk.cluster_maintenance_enable(node, wait=True)
            # the node should now report itself in maintenance mode
            assert self._in_maintenance_mode(node), \
                    f"{node.name} not in expected maintenance mode"
        else:
            # when using the low-level admin interface, the barrier is
            # implemented with wait_until, querying the node directly
            self.admin.maintenance_start(node)
            wait_until(lambda: self._in_maintenance_mode(node),
                       timeout_sec=30,
                       backoff_sec=5)

        def has_drained():
            """
            as we wait for leadership to drain, also print out maintenance mode
            status. this is useful for debugging to detect if maintenance mode
            has been lost or disabled for some unexpected reason.
            """
            status = self.admin.maintenance_status(node)
            self.logger.debug(f"Maintenance status for {node.name}: {status}")
            return not self._has_leadership_role(node)

        self.logger.debug(f"Waiting for node {node.name} leadership to drain")
        wait_until(has_drained, timeout_sec=60, backoff_sec=10)

        self.logger.debug(
            f"Waiting for node {node.name} maintenance mode to complete")
        wait_until(lambda: self._in_maintenance_mode_fully(node),
                   timeout_sec=60,
                   backoff_sec=10)

        self.logger.debug("Verifying expected broker metadata reported "
                          f"for enabled maintenance mode on node {node.name}")
        wait_until(lambda: self._verify_broker_metadata(True, node),
                   timeout_sec=60,
                   backoff_sec=10)

    def _verify_cluster(self, target, target_expect):
        for node in self.redpanda.nodes:
            expect = False if node != target else target_expect
            wait_until(
                lambda: self._verify_maintenance_status(node, expect),
                timeout_sec=30,
                backoff_sec=5,
                err_msg=f"expected {node.name} maintenance mode: {expect}")

    def _maintenance_disable(self, node):
        if self._use_rpk:
            self.rpk.cluster_maintenance_disable(node)
        else:
            self.admin.maintenance_stop(node)

        wait_until(lambda: not self._in_maintenance_mode(node),
                   timeout_sec=30,
                   backoff_sec=5)

        wait_until(lambda: self._has_leadership_role(node),
                   timeout_sec=120,
                   backoff_sec=10)

        self.logger.debug("Verifying expected broker metadata reported "
                          f"for disabled maintenance mode on node {node.name}")
        wait_until(lambda: self._verify_broker_metadata(False, node),
                   timeout_sec=60,
                   backoff_sec=10)

    @cluster(num_nodes=3)
    @matrix(use_rpk=[True, False])
    def test_maintenance(self, use_rpk):
        self._use_rpk = use_rpk
        target = random.choice(self.redpanda.nodes)
        self._enable_maintenance(target)
        self._maintenance_disable(target)

    @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST)
    @matrix(use_rpk=[True, False])
    def test_maintenance_sticky(self, use_rpk):
        self._use_rpk = use_rpk
        nodes = random.sample(self.redpanda.nodes, len(self.redpanda.nodes))
        for node in nodes:
            self._enable_maintenance(node)
            self._verify_cluster(node, True)

            self.redpanda.restart_nodes(node)
            self._verify_cluster(node, True)

            self._maintenance_disable(node)
            self._verify_cluster(node, False)

        self.redpanda.restart_nodes(self.redpanda.nodes)
        self._verify_cluster(None, False)

    @cluster(num_nodes=3)
    @matrix(use_rpk=[True, False])
    def test_exclusive_maintenance(self, use_rpk):
        self._use_rpk = use_rpk
        target, other = random.sample(self.redpanda.nodes, k=2)
        assert target is not other
        self._enable_maintenance(target)
        try:
            self._enable_maintenance(other)
        except RpkException as e:
            assert self._use_rpk
            if "invalid state transition" in e.msg and "400" in e.msg:
                return
            raise
        except requests.exceptions.HTTPError as e:
            assert not self._use_rpk
            if "invalid state transition" in e.response.text and e.response.status_code == 400:
                return
            raise
        else:
            raise Exception("Expected maintenance enable to fail")
Example #12
    def test_deletion_stops_move(self):
        """
        Delete a topic whose partitions are being moved and check the status
        after the topic is created again; old move operations should not
        influence the newly created topic.
        """
        self.start_redpanda(num_nodes=3)

        # create a single topic with replication factor of 1
        topic = 'test-topic'
        rpk = RpkTool(self.redpanda)
        rpk.create_topic(topic, 1, 1)
        partition = 0
        num_records = 1000

        self.logger.info(f"Producing to {topic}")
        producer = KafProducer(self.test_context, self.redpanda, topic,
                               num_records)
        producer.start()
        self.logger.info(
            f"Finished producing to {topic}, waiting for producer...")
        producer.wait()
        producer.free()
        self.logger.info(f"Producer stop complete.")

        admin = Admin(self.redpanda)
        # get current assignments
        assignments = self._get_assignments(admin, topic, partition)
        assert len(assignments) == 1
        self.logger.info(f"assignments for {topic}-{partition}: {assignments}")
        brokers = admin.get_brokers()
        self.logger.info(f"available brokers: {brokers}")
        candidates = list(
            filter(lambda b: b['node_id'] != assignments[0]['node_id'],
                   brokers))
        replacement = random.choice(candidates)
        target_assignment = [{'node_id': replacement['node_id'], 'core': 0}]
        self.logger.info(
            f"target assignments for {topic}-{partition}: {target_assignment}")
        # shutdown target node to make sure that move will never complete
        node = self.redpanda.get_node(replacement['node_id'])
        self.redpanda.stop_node(node)
        admin.set_partition_replicas(topic, partition, target_assignment)

        # check that the status is in progress

        def get_status():
            partition_info = admin.get_partitions(topic, partition)
            self.logger.info(
                f"current assignments for {topic}-{partition}: {partition_info}"
            )
            return partition_info["status"]

        wait_until(lambda: get_status() == 'in_progress', 10, 1)
        # delete the topic
        rpk.delete_topic(topic)
        # start the node back up
        self.redpanda.start_node(node)
        # create topic again
        rpk.create_topic(topic, 1, 1)
        wait_until(lambda: get_status() == 'done', 10, 1)