def test_self_transfer(self):
    """Ask every partition to transfer leadership to its current leader.

    A self-transfer is a no-op request; it should be accepted without
    error for every partition of every test topic.
    """
    admin = Admin(self.redpanda)
    for topic in self.topics:
        for p_idx in range(topic.partition_count):
            current_leader = admin.get_partitions(topic, p_idx)['leader_id']
            admin.partition_transfer_leadership("kafka", topic, p_idx,
                                                current_leader)
def test_controller_recovery(self):
    """Transfer leadership of one partition to a different replica and
    wait until cluster metadata reports the new leader.

    The admin request is sent to an arbitrary host; the cluster is
    expected to redirect it to the partition's leader internally.
    """
    kc = KafkaCat(self.redpanda)

    # choose a partition and a target node: any replica that is not
    # the current leader
    partition = self._get_partition(kc)
    target_node_id = next(
        filter(lambda r: r["id"] != partition["leader"],
               partition["replicas"]))["id"]

    # Fixed typo in log message ("Transfering" -> "Transferring")
    self.logger.debug(
        f"Transferring leader from {partition['leader']} to {target_node_id}"
    )

    # Log source/target broker metadata for debuggability.
    meta = kc.metadata()
    brokers = meta["brokers"]
    source_broker = next(
        filter(lambda b: b["id"] == partition["leader"], brokers))
    target_broker = next(
        filter(lambda b: b["id"] == target_node_id, brokers))

    self.logger.debug(f"Source broker {source_broker}")
    self.logger.debug(f"Target broker {target_broker}")

    # Send the request to any host, they should redirect to
    # the leader of the partition.
    partition_id = partition['partition']
    admin = Admin(self.redpanda)
    admin.partition_transfer_leadership("kafka", self.topic, partition_id,
                                        target_node_id)

    def transfer_complete():
        # Poll metadata a few times per wait_until attempt; returns True
        # as soon as the target node shows up as leader.
        for _ in range(3):
            # just give it a moment
            time.sleep(1)
            meta = kc.metadata()
            partition = next(
                filter(lambda p: p["partition"] == partition_id,
                       meta["topics"][0]["partitions"]))
            if partition["leader"] == target_node_id:
                return True
        return False

    # Pass the predicate directly instead of wrapping it in a lambda.
    wait_until(transfer_complete,
               timeout_sec=30,
               backoff_sec=5,
               err_msg="Transfer did not complete")
def _transfer_leadership(self, admin: Admin, namespace: str, topic: str,
                         target_node_id: int) -> None:
    """Transfer leadership of partition 0 of *topic* to *target_node_id*.

    Issues the admin transfer request and waits for the target to become
    leader.  If the wait times out, the request is retried exactly once
    before the timeout error is propagated.
    """
    prev_msg = ""  # remember last emitted line to avoid spamming the log

    def leader_predicate(l: Optional[int]) -> bool:
        nonlocal prev_msg, target_node_id
        if not l:
            return False
        if l == target_node_id:  # type: ignore
            return True
        msg = f'Still waiting for leader {target_node_id}, got {l}'
        if msg != prev_msg:  # type: ignore # "unbound"
            self.logger.info(msg)
            prev_msg = msg
        return False

    attempts_left = 2  # the initial attempt plus one retry
    while True:
        self.logger.info(f"Starting transfer to {target_node_id}")
        admin.partition_transfer_leadership("kafka", topic, 0,
                                            target_node_id)
        try:
            self._wait_for_leader(leader_predicate,
                                  timeout=ELECTION_TIMEOUT * 2)
            break  # no exception -> success, stop retrying
        except ducktape.errors.TimeoutError as e:
            attempts_left -= 1
            if attempts_left <= 0:
                raise e
            self.logger.info(f'Failed to get desired leader, retrying once.')

    self.logger.info(f"Completed transfer to {target_node_id}")
def test_leader_transfers_recovery(self, acks):
    """
    Validate that leadership transfers complete successfully
    under acks=1 writes that prompt the leader to frequently
    activate recovery_stm.

    When acks=1, this is a reproducer for
    https://github.com/vectorizedio/redpanda/issues/2580

    When acks=-1, this is a reproducer for
    https://github.com/vectorizedio/redpanda/issues/2606
    """
    leader_node_id, replicas = self._wait_for_leader()

    if acks == -1:
        producer = RpkProducer(self._ctx,
                               self.redpanda,
                               self.topic,
                               16384,
                               sys.maxsize,
                               acks=acks)
    else:
        # To reproduce acks=1 issue, we need an intermittent producer that
        # waits long enough between messages to let recovery_stm go to sleep
        # waiting for follower_state_change

        # KafProducer is intermittent because it starts a fresh process for
        # each message, whereas RpkProducer writes a continuous stream.

        # TODO: create a test traffic generator that has inter-message
        # delay as an explicit parameter, rather than relying on implementation
        # details of the producer helpers.
        producer = KafProducer(self._ctx, self.redpanda, self.topic)

    producer.start()

    # Pass leadership around in a ring
    self.logger.info(f"Initial leader of {self.topic} is {leader_node_id}")

    transfer_count = 50

    # FIXME: with a transfer count >100, we tend to see
    # reactor stalls and corresponding nondeterministic behaviour/failures.
    # This appears unrelated to the functionality under test, something else
    # is tripping up the cluster when we have so many leadership transfers.
    # https://github.com/vectorizedio/redpanda/issues/2623
    admin = Admin(self.redpanda)

    initial_leader_id = leader_node_id
    for n in range(0, transfer_count):
        # Node ids are 1-based; walk the ring of nodes starting from the
        # initial leader.
        target_idx = (initial_leader_id + n) % len(self.redpanda.nodes)
        target_node_id = target_idx + 1

        self.logger.info(f"Starting transfer to {target_node_id}")
        admin.partition_transfer_leadership("kafka", self.topic, 0,
                                            target_node_id)

        # Bind target_node_id as a default argument so the predicate does
        # not rely on late-binding closure capture of the loop variable.
        self._wait_for_leader(
            lambda l, target=target_node_id: l is not None and l == target,
            timeout=ELECTION_TIMEOUT * 2)
        self.logger.info(f"Completed transfer to {target_node_id}")

    self.logger.info(f"Completed {transfer_count} transfers successfully")

    # Explicit stop of producer so that we see any errors
    producer.stop()
    producer.wait()
    producer.free()