    def test_self_transfer(self):
        admin = Admin(self.redpanda)
        for topic in self.topics:
            for partition in range(topic.partition_count):
                leader = admin.get_partitions(topic, partition)['leader_id']
                admin.partition_transfer_leadership("kafka", topic, partition,
                                                    leader)
    def test_overlapping_changes(self):
        """
        Check that while a movement is in flight, rules about overlapping
        operations are properly enforced.
        """
        self.start_redpanda(num_nodes=4)
        node_ids = {1, 2, 3, 4}

        # Create topic with enough data that inter-node movement
        # will take a while.
        name = "movetest"
        spec = TopicSpec(name=name, partition_count=1, replication_factor=3)
        self.client().create_topic(spec)

        # Wait for the partition to have a leader (`rpk produce` errors
        # out if it tries to write data before this)
        def partition_ready():
            return KafkaCat(self.redpanda).get_partition_leader(
                name, 0)[0] is not None

        wait_until(partition_ready, timeout_sec=10, backoff_sec=0.5)

        # Write a substantial amount of data to the topic
        msg_size = 512 * 1024
        write_bytes = 512 * 1024 * 1024
        producer = RpkProducer(self._ctx,
                               self.redpanda,
                               name,
                               msg_size=msg_size,
                               msg_count=int(write_bytes / msg_size))
        t1 = time.time()
        producer.start()

        # This is an absurdly low expected throughput, but necessarily
        # so to run reliably on current test runners, which share an EBS
        # backend among many parallel tests.  10MB/s has been empirically
        # shown to be too high an expectation.
        expect_bps = 1 * 1024 * 1024
        expect_runtime = write_bytes / expect_bps
        producer.wait(timeout_sec=expect_runtime)

        self.logger.info(
            f"Write complete {write_bytes} in {time.time() - t1} seconds")

        # - Admin API redirects writes but not reads.  Because we want
        #   synchronous status after submitting operations, send all operations
        #   to the controller leader.  This is not necessary for operations to
        #   work, just to simplify this test by letting it see synchronous
        #   status updates.
        # - Because we will later verify that a 503 is sent in response to
        #   a move request to an in_progress topic, set retry_codes=[] to
        #   disable default retries on 503.
        admin_node = self.redpanda.controller()
        admin = Admin(self.redpanda, default_node=admin_node, retry_codes=[])

        # Start an inter-node move, which should take some time
        # to complete because of recovery network traffic
        assignments = self._get_assignments(admin, name, 0)
        new_node = list(node_ids - set([a['node_id'] for a in assignments]))[0]
        self.logger.info(f"old assignments: {assignments}")
        old_assignments = assignments
        assignments = assignments[1:] + [{'node_id': new_node, 'core': 0}]
        self.logger.info(f"new assignments: {assignments}")
        r = admin.set_partition_replicas(name, 0, assignments)
        r.raise_for_status()
        assert admin.get_partitions(name, 0)['status'] == "in_progress"

        # Another move should fail
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        try:
            r = admin.set_partition_replicas(name, 0, old_assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 503
        else:
            raise RuntimeError(f"Expected 503 but got {r.status_code}")

        # An update to partition properties should succeed
        # (issue https://github.com/vectorizedio/redpanda/issues/2300)
        rpk = RpkTool(self.redpanda)
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        rpk.alter_topic_config(name, "retention.ms", "3600000")

        # A deletion should succeed
        assert name in rpk.list_topics()
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        rpk.delete_topic(name)
        assert name not in rpk.list_topics()
class MaintenanceTest(RedpandaTest):
    topics = (TopicSpec(partition_count=10, replication_factor=3),
              TopicSpec(partition_count=20, replication_factor=3))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.admin = Admin(self.redpanda)
        self.rpk = RpkTool(self.redpanda)
        self._use_rpk = True

    def _has_leadership_role(self, node):
        """
        Returns true if node is leader for some partition, and false otherwise.
        """
        node_id = self.redpanda.idx(node)
        partitions = self.admin.get_partitions(node=node)
        has_leadership = False
        for p in partitions:
            if p["leader"] == node_id:
                self.logger.debug(f"{node.name} has leadership for {p}")
                has_leadership = True
        return has_leadership

    def _in_maintenance_mode(self, node):
        status = self.admin.maintenance_status(node)
        return status["draining"]

    def _in_maintenance_mode_fully(self, node):
        status = self.admin.maintenance_status(node)
        return status["finished"] and not status["errors"] and \
                status["partitions"] > 0

    def _verify_broker_metadata(self, maintenance_enabled, node):
        """
        Check that both broker endpoints on the admin server report the same
        maintenance status for the node, and that the reported status matches
        whether draining is expected to be enabled or disabled.
        """
        node_id = self.redpanda.idx(node)
        broker_target = self.admin.get_broker(node_id)
        broker_filtered = None
        for broker in self.admin.get_brokers():
            if broker['node_id'] == node_id:
                broker_filtered = broker
                break
        # both apis should return the same info
        if broker_filtered is None:
            return False
        status = broker_target['maintenance_status']
        if status != broker_filtered['maintenance_status']:
            return False
        # check status wanted
        if maintenance_enabled:
            return status['draining'] and status['finished']
        else:
            return not status['draining']

    def _verify_maintenance_status(self, node, draining):
        """
        Check that the cluster reports maintenance status as expected, through
        both the rpk status tooling and the raw admin interface.
        """
        # get status for this node via rpk
        node_id = self.redpanda.idx(node)
        statuses = self.rpk.cluster_maintenance_status()
        self.logger.debug(f"finding node_id {node_id} in rpk "
                          f"maintenance status: {statuses}")
        rpk_status = None
        for status in statuses:
            if status.node_id == node_id:
                rpk_status = status
                break
        if rpk_status is None:
            return False

        # get status for this node via admin interface
        admin_status = self.admin.maintenance_status(node)
        self.logger.debug(f"maintenance status from admin for "
                          f"{node.name}: {admin_status}")

        # ensure that both agree on expected outcome
        return admin_status["draining"] == rpk_status.draining == draining

    def _enable_maintenance(self, node):
        """
        1. Verifies that node is leader for some partitions
        2. Verifies node is not already in maintenance mode
        3. Requests that node enter maintenance mode (persistent interface)
        4. Verifies node enters maintenance mode
        5. Verifies that node has no leadership role
        6. Verifies that maintenance mode completes

        Note on terminology: when we say that 'maintenance mode completes' we
        do not mean that the node leaves maintenance mode. We mean that the
        node has entered maintenance mode and all of the work associated with
        that has completed.
""" self.logger.debug( f"Checking that node {node.name} has a leadership role") wait_until(lambda: self._has_leadership_role(node), timeout_sec=60, backoff_sec=10) self.logger.debug( f"Checking that node {node.name} is not in maintenance mode") wait_until(lambda: self._verify_maintenance_status(node, False), timeout_sec=30, backoff_sec=5) self.logger.debug( f"Waiting for node {node.name} to enter maintenance mode") if self._use_rpk: self.rpk.cluster_maintenance_enable(node, wait=True) # the node should now report itself in maintenance mode assert self._in_maintenance_mode(node), \ f"{node.name} not in expected maintenance mode" else: # when using the low-level admin interface the barrier is # implemented using wait_until and query the node directly self.admin.maintenance_start(node) wait_until(lambda: self._in_maintenance_mode(node), timeout_sec=30, backoff_sec=5) def has_drained(): """ as we wait for leadership to drain, also print out maintenance mode status. this is useful for debugging to detect if maintenance mode has been lost or disabled for some unexpected reason. """ status = self.admin.maintenance_status(node) self.logger.debug(f"Maintenance status for {node.name}: {status}") return not self._has_leadership_role(node), self.logger.debug(f"Waiting for node {node.name} leadership to drain") wait_until(has_drained, timeout_sec=60, backoff_sec=10) self.logger.debug( f"Waiting for node {node.name} maintenance mode to complete") wait_until(lambda: self._in_maintenance_mode_fully(node), timeout_sec=60, backoff_sec=10) self.logger.debug("Verifying expected broker metadata reported " f"for enabled maintenance mode on node {node.name}") wait_until(lambda: self._verify_broker_metadata(True, node), timeout_sec=60, backoff_sec=10) def _verify_cluster(self, target, target_expect): for node in self.redpanda.nodes: expect = False if node != target else target_expect wait_until( lambda: self._verify_maintenance_status(node, expect), timeout_sec=30, backoff_sec=5, err_msg=f"expected {node.name} maintenance mode: {expect}") def _maintenance_disable(self, node): if self._use_rpk: self.rpk.cluster_maintenance_disable(node) else: self.admin.maintenance_stop(node) wait_until(lambda: not self._in_maintenance_mode(node), timeout_sec=30, backoff_sec=5) wait_until(lambda: self._has_leadership_role(node), timeout_sec=120, backoff_sec=10) self.logger.debug("Verifying expected broker metadata reported " f"for disabled maintenance mode on node {node.name}") wait_until(lambda: self._verify_broker_metadata(False, node), timeout_sec=60, backoff_sec=10) @cluster(num_nodes=3) @matrix(use_rpk=[True, False]) def test_maintenance(self, use_rpk): self._use_rpk = use_rpk target = random.choice(self.redpanda.nodes) self._enable_maintenance(target) self._maintenance_disable(target) @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST) @matrix(use_rpk=[True, False]) def test_maintenance_sticky(self, use_rpk): self._use_rpk = use_rpk nodes = random.sample(self.redpanda.nodes, len(self.redpanda.nodes)) for node in nodes: self._enable_maintenance(node) self._verify_cluster(node, True) self.redpanda.restart_nodes(node) self._verify_cluster(node, True) self._maintenance_disable(node) self._verify_cluster(node, False) self.redpanda.restart_nodes(self.redpanda.nodes) self._verify_cluster(None, False) @cluster(num_nodes=3) @matrix(use_rpk=[True, False]) def test_exclusive_maintenance(self, use_rpk): self._use_rpk = use_rpk target, other = random.sample(self.redpanda.nodes, k=2) assert target is not other 
        self._enable_maintenance(target)

        try:
            self._enable_maintenance(other)
        except RpkException as e:
            assert self._use_rpk
            if "invalid state transition" in e.msg and "400" in e.msg:
                return
            # unexpected rpk failure: re-raise rather than swallow it
            raise
        except requests.exceptions.HTTPError as e:
            assert not self._use_rpk
            if "invalid state transition" in e.response.text \
                    and e.response.status_code == 400:
                return
            # unexpected HTTP error: re-raise rather than swallow it
            raise
        else:
            raise Exception("Expected maintenance enable to fail")