def _get_partition_leaders(self):
    kcat = KafkaCat(self.redpanda)
    m = kcat.metadata()
    self.logger.info(f"kcat.metadata() == {m}")
    brokers = {}
    for b in m['brokers']:
        id = b['id']
        ip = b['name']
        ip = ip[:ip.index(':')]
        for n in self.redpanda.nodes:
            n_ip = n.account.hostname
            self.logger.debug(f"matching {n_ip} over {ip}")
            if n_ip == ip:
                brokers[id] = n
                break
    self.logger.debug(f"found brokers {brokers}")
    assert len(brokers) == 3
    leaders = {}
    for topic in m['topics']:
        if topic['topic'] == ArchivalTest.s3_topic_name:
            for part in topic['partitions']:
                leader_id = part['leader']
                partition_id = part['partition']
                leader = brokers[leader_id]
                leaders[partition_id] = leader
    return leaders
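# Illustrative sketch (not part of the test): the shape of the kcat metadata
# dict that _get_partition_leaders() expects. Only keys referenced above
# ('brokers', 'id', 'name', 'topics', 'partitions', 'leader', 'partition')
# are shown; broker names and ids are made-up placeholders.
example_metadata = {
    'brokers': [
        {'id': 1, 'name': 'node-1:9092'},
        {'id': 2, 'name': 'node-2:9092'},
        {'id': 3, 'name': 'node-3:9092'},
    ],
    'topics': [{
        'topic': 'panda-topic',
        'partitions': [
            {'partition': 0, 'leader': 1},
            {'partition': 1, 'leader': 3},
        ],
    }],
}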
def controller(self):
    kc = KafkaCat(self)
    cid = kc.metadata()["controllerid"]
    self.logger.debug("Controller reported with id: {}".format(cid))
    if cid != -1:
        node = self.get_node(cid)
        self.logger.debug("Controller node found: {}".format(node))
        return node
def registered(self, node):
    idx = self.idx(node)
    self.logger.debug("Checking if broker %d/%s is registered", idx, node)
    kc = KafkaCat(self)
    brokers = kc.metadata()["brokers"]
    brokers = {b["id"]: b for b in brokers}
    broker = brokers.get(idx, None)
    self.logger.debug("Found broker info: %s", broker)
    return broker is not None
def test_produce_topic(self):
    """
    Create a topic and verify that pandaproxy can produce to it.
    """
    name = create_topic_names(1)[0]

    data = '''
    {
        "records": [
            {"value": "dmVjdG9yaXplZA==", "partition": 0},
            {"value": "cGFuZGFwcm94eQ==", "partition": 1},
            {"value": "bXVsdGlicm9rZXI=", "partition": 2}
        ]
    }'''

    self.logger.info(f"Producing to non-existent topic: {name}")
    produce_result_raw = self._produce_topic(name, data)
    assert produce_result_raw.status_code == requests.codes.ok
    produce_result = produce_result_raw.json()
    for o in produce_result["offsets"]:
        assert o["error_code"] == 3
        assert o["offset"] == -1

    self.logger.info(f"Creating test topic: {name}")
    self._create_topics([name], partitions=3)

    self.logger.info(f"Producing to topic: {name}")
    produce_result_raw = self._produce_topic(name, data)
    assert produce_result_raw.status_code == requests.codes.ok
    assert produce_result_raw.headers[
        "Content-Type"] == "application/vnd.kafka.v2+json"

    produce_result = produce_result_raw.json()
    for o in produce_result["offsets"]:
        assert o["offset"] == 0, f'error_code {o["error_code"]}'

    self.logger.info(f"Consuming from topic: {name}")
    kc = KafkaCat(self.redpanda)
    assert kc.consume_one(name, 0, 0)["payload"] == "vectorized"
    assert kc.consume_one(name, 1, 0)["payload"] == "pandaproxy"
    assert kc.consume_one(name, 2, 0)["payload"] == "multibroker"

    self.logger.info(f"Producing to topic without partition: {name}")
    produce_result_raw = self._produce_topic(
        name, '''
    {
        "records": [
            {"value": "dmVjdG9yaXplZA=="},
            {"value": "cGFuZGFwcm94eQ=="},
            {"value": "bXVsdGlicm9rZXI="}
        ]
    }''')
    assert produce_result_raw.status_code == requests.codes.ok

    produce_result = produce_result_raw.json()
    for o in produce_result["offsets"]:
        assert o["offset"] == 1, f'error_code {o["error_code"]}'
def registered(self, node):
    idx = self.idx(node)
    self.logger.debug(
        f"Checking if broker {idx} ({node.name}) is registered")
    kc = KafkaCat(self)
    brokers = kc.metadata()["brokers"]
    brokers = {b["id"]: b for b in brokers}
    broker = brokers.get(idx, None)
    self.logger.debug(f"Found broker info: {broker}")
    return broker is not None
def _registered(self, service, node):
    idx = service.idx(node)
    service.logger.debug("Checking if broker %d/%s is registered", idx, node)
    kc = KafkaCat(RedpandaMuServiceServiceProxy(service, self))
    brokers = kc.metadata()["brokers"]
    brokers = {b["id"]: b for b in brokers}
    broker = brokers.get(idx, None)
    service.logger.debug("Found broker info: %s", broker)
    return broker is not None
def test_produce_topic(self):
    """
    Create a topic and verify that pandaproxy can produce to it.
    """
    name = "pandaproxy-topic-{}".format(uuid.uuid4())
    self.logger.debug("Topic name %s", name)

    prev = set(self._get_topics())
    self.logger.debug("Existing topics %s", prev)
    assert name not in prev

    data = '{"records": [{"value": "dmVjdG9yaXplZA==", "partition": 0},{"value": "cGFuZGFwcm94eQ==", "partition": 1},{"value": "bXVsdGlicm9rZXI=", "partition": 2}]}'

    self.logger.debug("Producing to non-existent topic")
    produce_result = self._produce_topic(name, data)
    for o in produce_result["offsets"]:
        assert o["error_code"] == 3
        assert o["offset"] == -1

    kc = KafkaCat(self.redpanda)

    self.logger.debug("Creating test topic")
    kafka_tools = KafkaCliTools(self.redpanda)
    kafka_tools.create_topic(
        TopicSpec(name=name, replication_factor=1, partition_count=3))

    self.logger.debug("Waiting for leaders to settle")
    has_leaders = False
    while not has_leaders:
        topics = kc.metadata()["topics"]
        maybe_leaders = True
        for t in topics:
            if t["topic"] == name:
                for p in t["partitions"]:
                    if p["leader"] == -1:
                        maybe_leaders = False
        has_leaders = maybe_leaders
    # TODO:
    # Despite the above test, Pandaproxy can still get back no leaders
    # Query Pandaproxy metadata to see when leaders have settled
    # The retry logic for produce should have sufficient time for this
    # additional settle time.

    self.logger.debug("Producing to topic")
    produce_result = self._produce_topic(name, data)
    self.logger.debug("Producing to topic: %s", produce_result)
    for o in produce_result["offsets"]:
        assert o["offset"] == 1, f'error_code {o["error_code"]}'

    self.logger.debug(f"Consuming topic: {name}")
    assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
    assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
    assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
def done():
    kcat = KafkaCat(self.redpanda)
    ts = 1638748800  # 2021-12-06, an old timestamp: query the first offset
    offset = kcat.query_offset(self.topic, 0, ts)
    # assert that offset is valid
    assert offset >= 0
    topic_partitions = segments_count(self.redpanda, self.topic, 0)
    return all([p <= 5 for p in topic_partitions])
def _wait_for_topic(self, name):
    kc = KafkaCat(self.redpanda)
    has_leaders = False
    while not has_leaders:
        topics = kc.metadata()["topics"]
        maybe_leaders = True
        for t in topics:
            if t["topic"] == name:
                for p in t["partitions"]:
                    if p["leader"] == -1:
                        maybe_leaders = False
        has_leaders = maybe_leaders
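# Sketch only (not part of the original): the loop above polls metadata with
# no backoff. An equivalent using ducktape's wait_until helper (already used
# by other tests in this section) would poll with an explicit timeout. The
# helper name _topic_has_leaders and the timeout values are illustrative
# assumptions.
from ducktape.utils.util import wait_until


def _topic_has_leaders(kc, name):
    # every partition of `name` must report a leader other than -1
    for t in kc.metadata()["topics"]:
        if t["topic"] == name:
            return all(p["leader"] != -1 for p in t["partitions"])
    return False


# usage inside the test class (sketch):
#   kc = KafkaCat(self.redpanda)
#   wait_until(lambda: _topic_has_leaders(kc, name),
#              timeout_sec=30,
#              backoff_sec=1,
#              err_msg=f"topic {name} did not elect leaders")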
def test_produce_topic(self):
    """
    Create a topic and verify that pandaproxy can produce to it.
    """
    name = create_topic_names(1)[0]

    data = '''
    {
        "records": [
            {"value": "dmVjdG9yaXplZA==", "partition": 0},
            {"value": "cGFuZGFwcm94eQ==", "partition": 1},
            {"value": "bXVsdGlicm9rZXI=", "partition": 2}
        ]
    }'''

    self.logger.info(f"Producing to non-existent topic: {name}")
    produce_result = self._produce_topic(name, data)
    for o in produce_result["offsets"]:
        assert o["error_code"] == 3
        assert o["offset"] == -1

    kc = KafkaCat(self.redpanda)

    self.logger.info(f"Creating test topic: {name}")
    self._create_topics([name], partitions=3)

    self.logger.debug("Waiting for leaders to settle")
    has_leaders = False
    while not has_leaders:
        topics = kc.metadata()["topics"]
        maybe_leaders = True
        for t in topics:
            if t["topic"] == name:
                for p in t["partitions"]:
                    if p["leader"] == -1:
                        maybe_leaders = False
        has_leaders = maybe_leaders
    # TODO:
    # Despite the above test, Pandaproxy can still get back no leaders
    # Query Pandaproxy metadata to see when leaders have settled
    # The retry logic for produce should have sufficient time for this
    # additional settle time.

    self.logger.info(f"Producing to topic: {name}")
    produce_result = self._produce_topic(name, data)
    for o in produce_result["offsets"]:
        assert o["offset"] == 1, f'error_code {o["error_code"]}'

    self.logger.info(f"Consuming from topic: {name}")
    assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
    assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
    assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
def _ping_pong(self):
    kc = KafkaCat(self.redpanda)
    rpk = RpkTool(self.redpanda)

    payload = str(random.randint(0, 1000))
    start = time.time()
    offset = rpk.produce(self.topic, "tkey", payload, timeout=5)
    consumed = kc.consume_one(self.topic, 0, offset)
    latency = time.time() - start
    self.logger.info(
        f"_ping_pong produced '{payload}' consumed '{consumed}' in {(latency)*1000.0:.2f} ms"
    )
    if consumed['payload'] != payload:
        raise RuntimeError(f"expected '{payload}' got '{consumed}'")
def __init__(self, test_context):
    extra_rp_conf = dict(
        log_segment_size=1048576,
        retention_bytes=3145728,
        log_compaction_interval_ms=1000,
        enable_leader_balancer=False,
    )

    super(PrefixTruncateRecoveryTest,
          self).__init__(test_context=test_context,
                         num_brokers=3,
                         extra_rp_conf=extra_rp_conf)

    self.kafka_tools = KafkaCliTools(self.redpanda)
    self.kafka_cat = KafkaCat(self.redpanda)
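# Illustrative note (not from the original test): with this configuration a
# log segment is 1 MiB and retention is 3 MiB, so roughly three closed
# segments are kept per partition before prefix truncation reclaims the
# oldest one.
SEGMENT_SIZE = 1048576       # 1 MiB
RETENTION_BYTES = 3145728    # 3 MiB
assert RETENTION_BYTES // SEGMENT_SIZE == 3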
def test_adding_nodes_to_cluster(self):
    self.redpanda = RedpandaService(
        self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1})
    # start single node cluster
    self.redpanda.start(nodes=[self.redpanda.nodes[0]])
    # create some topics
    topics = []
    # include __consumer_offsets topic replica
    total_replicas = 1
    for partition_count in range(1, 5):
        name = f"topic{len(topics)}"
        spec = TopicSpec(name=name,
                         partition_count=partition_count,
                         replication_factor=1)
        total_replicas += partition_count
        topics.append(spec)

    for spec in topics:
        DefaultClient(self.redpanda).create_topic(spec)
        self.topic = spec.name

    self.start_producer(1)
    self.start_consumer(1)
    self.await_startup()
    # add second node
    self.redpanda.start_node(self.redpanda.nodes[1])
    kafkacat = KafkaCat(self.redpanda)

    def _replicas_per_node():
        node_replicas = {}
        md = kafkacat.metadata()
        self.redpanda.logger.info(f"metadata: {md}")
        for topic in md['topics']:
            for p in topic['partitions']:
                for r in p['replicas']:
                    id = r['id']
                    if id not in node_replicas:
                        node_replicas[id] = 0
                    node_replicas[id] += 1

        return node_replicas

    def partitions_rebalanced():
        per_node = _replicas_per_node()
        self.redpanda.logger.info(f"replicas per node: {per_node}")
        if len(per_node) < len(self.redpanda.started_nodes()):
            return False

        replicas = sum(per_node.values())
        if replicas != total_replicas:
            return False

        return all(p[1] > 1 for p in per_node.items())

    wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)
    # add third node
    self.redpanda.start_node(self.redpanda.nodes[2])
    wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

    self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
def partitions(self, topic):
    """
    Return partition metadata for the topic.
    """
    kc = KafkaCat(self)
    md = kc.metadata()
    topic = next(filter(lambda t: t["topic"] == topic, md["topics"]))

    def make_partition(p):
        index = p["partition"]
        leader_id = p["leader"]
        leader = None if leader_id == -1 else self.get_node(leader_id)
        replicas = [self.get_node(r["id"]) for r in p["replicas"]]
        return Partition(index, leader, replicas)

    return [make_partition(p) for p in topic["partitions"]]
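# Hypothetical usage sketch (not from the original): inspect partition 0 of a
# topic via partitions() above. Assumes Partition is a simple container
# exposing .index, .leader and .replicas, matching how make_partition()
# constructs it, and that `redpanda` is a started service instance.
def describe_partition_zero(redpanda, topic):
    p0 = next(p for p in redpanda.partitions(topic) if p.index == 0)
    leader_host = None if p0.leader is None else p0.leader.account.hostname
    replica_hosts = [n.account.hostname for n in p0.replicas]
    return leader_host, replica_hosts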
def test_controller_recovery(self):
    kc = KafkaCat(self.redpanda)

    # choose a partition and a target node
    partition = self._get_partition(kc)
    target_node_id = next(
        filter(lambda r: r["id"] != partition["leader"],
               partition["replicas"]))["id"]
    self.logger.debug(
        f"Transferring leader from {partition['leader']} to {target_node_id}")

    # build the transfer url
    meta = kc.metadata()
    brokers = meta["brokers"]
    source_broker = next(
        filter(lambda b: b["id"] == partition["leader"], brokers))
    target_broker = next(
        filter(lambda b: b["id"] == target_node_id, brokers))
    self.logger.debug(f"Source broker {source_broker}")
    self.logger.debug(f"Target broker {target_broker}")

    host = source_broker["name"]
    host = host.split(":")[0]
    partition_id = partition["partition"]
    url = "http://{}:9644/v1/kafka/{}/{}/transfer_leadership?target={}".format(
        host, self.topic, partition["partition"], target_node_id)

    def try_transfer():
        self.logger.debug(url)
        res = requests.post(url)
        self.logger.debug(res.text)
        for _ in range(3):
            # just give it a moment
            time.sleep(1)
            meta = kc.metadata()
            partition = next(
                filter(lambda p: p["partition"] == partition_id,
                       meta["topics"][0]["partitions"]))
            if partition["leader"] == target_node_id:
                return True
        return False

    wait_until(lambda: try_transfer(),
               timeout_sec=30,
               backoff_sec=5,
               err_msg="Transfer did not complete")
def test_controller_recovery(self):
    kc = KafkaCat(self.redpanda)

    # choose a partition and a target node
    partition = self._get_partition(kc)
    target_node_id = next(
        filter(lambda r: r["id"] != partition["leader"],
               partition["replicas"]))["id"]
    self.logger.debug(
        f"Transferring leader from {partition['leader']} to {target_node_id}")

    # build the transfer url
    meta = kc.metadata()
    brokers = meta["brokers"]
    source_broker = next(
        filter(lambda b: b["id"] == partition["leader"], brokers))
    target_broker = next(
        filter(lambda b: b["id"] == target_node_id, brokers))
    self.logger.debug(f"Source broker {source_broker}")
    self.logger.debug(f"Target broker {target_broker}")

    # Send the request to any host, they should redirect to
    # the leader of the partition.
    partition_id = partition['partition']
    admin = Admin(self.redpanda)
    admin.partition_transfer_leadership("kafka", self.topic, partition_id,
                                        target_node_id)

    def transfer_complete():
        for _ in range(3):
            # just give it a moment
            time.sleep(1)
            meta = kc.metadata()
            partition = next(
                filter(lambda p: p["partition"] == partition_id,
                       meta["topics"][0]["partitions"]))
            if partition["leader"] == target_node_id:
                return True
        return False

    wait_until(lambda: transfer_complete(),
               timeout_sec=30,
               backoff_sec=5,
               err_msg="Transfer did not complete")
def test_produce_topic(self):
    """
    Create a topic and verify that pandaproxy can produce to it.
    """
    name = create_topic_names(1)[0]

    data = '''
    {
        "records": [
            {"value": "dmVjdG9yaXplZA==", "partition": 0},
            {"value": "cGFuZGFwcm94eQ==", "partition": 1},
            {"value": "bXVsdGlicm9rZXI=", "partition": 2}
        ]
    }'''

    self.logger.info(f"Producing to non-existent topic: {name}")
    produce_result = self._produce_topic(name, data)
    for o in produce_result["offsets"]:
        assert o["error_code"] == 3
        assert o["offset"] == -1

    self.logger.info(f"Creating test topic: {name}")
    self._create_topics([name], partitions=3)

    self.logger.debug("Waiting for leaders to settle")
    self._wait_for_topic(name)

    self.logger.info(f"Producing to topic: {name}")
    produce_result = self._produce_topic(name, data)
    for o in produce_result["offsets"]:
        assert o["offset"] == 1, f'error_code {o["error_code"]}'

    self.logger.info(f"Consuming from topic: {name}")
    kc = KafkaCat(self.redpanda)
    assert kc.consume_one(name, 0, 1)["payload"] == "vectorized"
    assert kc.consume_one(name, 1, 1)["payload"] == "pandaproxy"
    assert kc.consume_one(name, 2, 1)["payload"] == "multibroker"
class PrefixTruncateRecoveryTest(RedpandaTest):
    """
    The purpose of this test is to exercise recovery of partitions which have
    had data reclaimed based on retention policy. The testing strategy is:

       1. Stop 1 out of 3 nodes
       2. Produce until retention policy reclaims data
       3. Restart the stopped node
       4. Verify that the stopped node recovers

    Leadership balancing is disabled in this test because the final
    verification step tries to force leadership so that verification may query
    metadata from specific nodes where the kafka protocol only returns state
    from leaders.
    """
    topics = (TopicSpec(cleanup_policy=TopicSpec.CLEANUP_DELETE), )

    def __init__(self, test_context):
        extra_rp_conf = dict(
            log_segment_size=1048576,
            retention_bytes=3145728,
            log_compaction_interval_ms=1000,
            enable_leader_balancer=False,
        )

        super(PrefixTruncateRecoveryTest,
              self).__init__(test_context=test_context,
                             num_brokers=3,
                             extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.kafka_cat = KafkaCat(self.redpanda)

    def fully_replicated(self, nodes):
        """
        Test that for each specified node there are no reported under
        replicated partitions corresponding to the test topic.
        """
        metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                              nodes)
        metric = metric.label_filter(dict(namespace="kafka",
                                          topic=self.topic))
        assert len(metric.samples) == len(nodes)
        return all(map(lambda s: s.value == 0, metric.samples))

    def get_segments_deleted(self, nodes):
        """
        Return the values of the log segments removed metric.
        """
        metric = self.redpanda.metrics_sample("log_segments_removed", nodes)
        metric = metric.label_filter(dict(namespace="kafka",
                                          topic=self.topic))
        assert len(metric.samples) == len(nodes)
        return [s.value for s in metric.samples]

    def produce_until_reclaim(self, initial_deleted, acks):
        """
        Produce data until we observe that segments have been deleted. The
        initial_deleted parameter is the max number of segments deleted across
        nodes, and we wait for all partitions to report at least initial + 2
        deletions so that all nodes have experienced some deletion.
        """
        deleted = self.get_segments_deleted(self.redpanda.nodes[1:])
        if all(map(lambda d: d >= initial_deleted + 2, deleted)):
            return True
        self.kafka_tools.produce(self.topic, 1024, 1024, acks=acks)
        return False

    @cluster(num_nodes=3, log_allow_list=LOG_ALLOW_LIST)
    @matrix(acks=[-1, 1], start_empty=[True, False])
    def test_prefix_truncate_recovery(self, acks, start_empty):
        # cover boundary conditions of partition being empty/non-empty
        if not start_empty:
            self.kafka_tools.produce(self.topic, 2048, 1024, acks=acks)
            wait_until(lambda: self.fully_replicated(self.redpanda.nodes),
                       timeout_sec=90,
                       backoff_sec=5)

        # stop this unfortunate node
        stopped_node = self.redpanda.nodes[0]
        self.redpanda.stop_node(stopped_node)

        # produce data into the topic until segments are reclaimed
        # by the configured retention policy
        deleted = max(self.get_segments_deleted(self.redpanda.nodes[1:]))
        wait_until(lambda: self.produce_until_reclaim(deleted, acks),
                   timeout_sec=90,
                   backoff_sec=5)

        # we should now observe an under replicated state
        wait_until(lambda: not self.fully_replicated(self.redpanda.nodes[1:]),
                   timeout_sec=90,
                   backoff_sec=5)

        # finally restart the node and wait until fully replicated
        self.redpanda.start_node(stopped_node)
        wait_until(lambda: self.fully_replicated(self.redpanda.nodes),
                   timeout_sec=90,
                   backoff_sec=5)

        self.verify_offsets()

    def verify_offsets(self):
        """
        Test that the ending offset for the partition as seen on each node are
        identical. Since we can only query this from the leader, we disable
        auto leadership balancing, and manually transfer leadership before
        querying.

        Note that because each node applies retention policy independently to
        a prefix of the log we can't reliably compare the starting offsets.
        """
        admin = Admin(self.redpanda)
        offsets = []
        for node in self.redpanda.nodes:
            admin.transfer_leadership_to(namespace="kafka",
                                         topic=self.topic,
                                         partition=0,
                                         target=node)
            # % ERROR: offsets_for_times failed: Local: Unknown partition
            # may occur here presumably because there is an interaction
            # with leadership transfer. the built-in retries in list_offsets
            # appear to deal with this gracefully and we still pass.
            offsets.append(self.kafka_cat.list_offsets(self.topic, 0))
        assert all(map(lambda o: o[1] == offsets[0][1], offsets))
def test_disabling_transactions_after_they_being_used(self):
    '''
    Validate that transactions can be safely disabled after the feature
    has been used
    '''
    # start redpanda with transactions enabled, we use
    # replication factor 1 for group topic to make
    # it unavailable when one of the nodes is down
    self.start_redpanda(num_nodes=3,
                        extra_rp_conf={
                            "transaction_coordinator_replication": 3,
                            "id_allocator_replication": 3,
                            "enable_idempotence": True,
                            "enable_transactions": True,
                            "default_topic_replications": 1,
                            "default_topic_partitions": 1,
                            "health_manager_tick_interval": 3600000
                        })

    tx_topic = TopicSpec(name="tx-topic",
                         partition_count=1,
                         replication_factor=3)
    self.client().create_topic(tx_topic)

    # produce some messages to tx_topic
    kcat = KafkaCat(self.redpanda)
    kcat.produce_one(tx_topic.name, msg='test-msg', tx_id='test-tx-id')

    # disable transactions
    self.redpanda.stop()
    for n in self.redpanda.nodes:
        self.redpanda.start_node(n,
                                 override_cfg_params={
                                     "transaction_coordinator_replication": 3,
                                     "id_allocator_replication": 3,
                                     "enable_idempotence": False,
                                     "enable_transactions": False,
                                     "transactional_id_expiration_ms": 1000,
                                     "default_topic_replications": 3,
                                     "default_topic_partitions": 1
                                 })

    # create topic for test
    tester = TopicSpec(name="tester",
                       partition_count=1,
                       replication_factor=3)
    self.client().create_topic(tester)
    self.topic = tester

    self.start_producer(2, throughput=10000)
    self.start_consumer(1)
    self.await_startup()

    self.run_validation(min_records=100000,
                        producer_timeout_sec=300,
                        consumer_timeout_sec=300)

    # make sure that all redpanda nodes are up and running
    for n in self.redpanda.nodes:
        assert self.redpanda.redpanda_pid(n) is not None
def _get_leader(self):
    """
    :returns: 2 tuple of (leader, [replica ids])
    """
    return KafkaCat(self.redpanda).get_partition_leader(self.topic, 0)
def partition_ready():
    return KafkaCat(self.redpanda).get_partition_leader(
        name, 0)[0] is not None
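# Sketch only: partition_ready() is meant to be polled until the partition
# elects a leader. Assuming ducktape's wait_until (used throughout these
# tests), the surrounding code would look roughly like this; the timeout,
# backoff and error message are illustrative values.
wait_until(partition_ready,
           timeout_sec=30,
           backoff_sec=2,
           err_msg=f"partition 0 of {name} never elected a leader")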
def test_node_recovery(self, recovery_type):
    self.start_redpanda(num_nodes=3)
    kafka_tools = KafkaCliTools(self.redpanda)
    kafka_cat = KafkaCat(self.redpanda)
    # create topics
    topics = []
    for _ in range(0, 6):
        topics.append(TopicSpec(partition_count=random.randint(1, 10)))

    # choose one topic to run the main workload
    DefaultClient(self.redpanda).create_topic(topics)
    self.topic = random.choice(topics).name

    self.start_producer(1)
    self.start_consumer(2)
    self.await_startup()
    # choose another topic and populate it with data
    prepopulated_topic = random.choice(topics)
    while self.topic == prepopulated_topic.name:
        prepopulated_topic = random.choice(topics)

    # populate topic with data
    kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

    def list_offsets():
        offsets = {}
        for p in range(0, prepopulated_topic.partition_count):
            offsets[p] = kafka_cat.list_offsets(prepopulated_topic.name, p)
        return offsets

    # store offsets
    offsets = list_offsets()

    self.redpanda.logger.info(f"Topic offsets: {offsets}")

    # stop one of the nodes and remove its data
    stopped = random.choice(self.redpanda.nodes)
    # prepare seed servers list
    seeds = map(lambda n: {
        "address": n.account.hostname,
        "port": 33145
    }, self.redpanda.nodes)
    seeds = list(
        filter(lambda n: n['address'] != stopped.account.hostname, seeds))

    self.redpanda.stop_node(stopped)
    if recovery_type == FullNodeRecoveryTest.FULL_RECOVERY:
        self.redpanda.clean_node(stopped, preserve_logs=True)

    # produce some more data to make sure that stopped node is behind
    kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

    # start node with the same node id, and not empty seed server list to
    # give node more time to start as it has to recover
    self.redpanda.start_node(stopped,
                             override_cfg_params={'seed_servers': seeds},
                             timeout=90)

    def all_topics_recovered():
        metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                              self.redpanda.nodes)
        under_replicated = filter(lambda s: s.value == 1, metric.samples)
        under_replicated = list(
            map(
                lambda s: (s.labels['namespace'], s.labels['topic'],
                           s.labels['partition']), under_replicated))
        self.redpanda.logger.info(
            f"under replicated partitions: {list(under_replicated)}")
        return len(under_replicated) == 0

    # wait for prepopulated topic to recover
    wait_until(all_topics_recovered, 60, 1)

    self.run_validation(min_records=20000,
                        enable_idempotence=False,
                        producer_timeout_sec=60,
                        consumer_timeout_sec=180)

    # validate prepopulated topic offsets
    assert offsets == list_offsets()
def test_recreated_topic_metadata_are_valid(self, replication_factor):
    """
    Test recreated topic metadata are valid across all the nodes
    """
    topic = 'tp-test'
    partition_count = 5
    rpk = RpkTool(self.redpanda)
    kcat = KafkaCat(self.redpanda)
    admin = Admin(self.redpanda)
    # create topic with replication factor of 3
    rpk.create_topic(topic='tp-test',
                     partitions=partition_count,
                     replicas=replication_factor)

    def wait_for_leader(partition, expected_leader):
        leader, _ = kcat.get_partition_leader(topic, partition)
        return leader == expected_leader

    def transfer_all_leaders():
        partitions = rpk.describe_topic(topic)
        for p in partitions:
            replicas = set(p.replicas)
            replicas.remove(p.leader)
            target = random.choice(list(replicas))
            admin.partition_transfer_leadership("kafka", topic, p.id, target)
            wait_until(lambda: wait_for_leader(p.id, target),
                       timeout_sec=30,
                       backoff_sec=1)

    # produce some data to the topic
    msg_cnt = 100
    producer = RpkProducer(self.test_context,
                           self.redpanda,
                           topic,
                           16384,
                           msg_cnt,
                           acks=-1)
    producer.start()
    producer.wait()
    producer.free()

    # transfer leadership to grow the term
    for i in range(0, 10):
        transfer_all_leaders()

    # recreate the topic
    rpk.delete_topic(topic)
    rpk.create_topic(topic='tp-test',
                     partitions=partition_count,
                     replicas=3)

    def metadata_consistent():
        # validate leadership information on each node
        for p in range(0, partition_count):
            leaders = set()
            for n in self.redpanda.nodes:
                admin_partition = admin.get_partitions(topic=topic,
                                                       partition=p,
                                                       namespace="kafka",
                                                       node=n)
                self.logger.info(
                    f"node: {n.account.hostname} partition: {admin_partition}")
                leaders.add(admin_partition['leader_id'])
            self.logger.info(f"{topic}/{p} leaders: {leaders}")
            if len(leaders) != 1:
                return False
        return True

    wait_until(metadata_consistent, 45, backoff_sec=2)
def test_node_operations(self):
    # allocate 5 nodes for the cluster
    self.redpanda = RedpandaService(
        self.test_context,
        5,
        KafkaCliTools,
        extra_rp_conf={
            "enable_auto_rebalance_on_node_add": True,
            "group_topic_partitions": 3,
            "default_topic_replications": 3,
        })
    # start the cluster
    self.redpanda.start()
    # create some topics
    topics = self._create_random_topics(10)
    self.redpanda.logger.info(f"using topics: {topics}")
    # select one of the topics to use in consumer/producer
    self.topic = random.choice(topics).name

    self.start_producer(1, throughput=100)
    self.start_consumer(1)
    self.await_startup()

    def decommission(node_id):
        self.logger.info(f"decommissioning node: {node_id}")
        admin = Admin(self.redpanda)
        admin.decommission_broker(id=node_id)

        def node_removed():
            admin = Admin(self.redpanda)
            brokers = admin.get_brokers()
            for b in brokers:
                if b['node_id'] == node_id:
                    return False
            return True

        wait_until(node_removed, timeout_sec=240, backoff_sec=2)

    kafkacat = KafkaCat(self.redpanda)

    def replicas_per_node():
        node_replicas = {}
        md = kafkacat.metadata()
        self.redpanda.logger.info(f"metadata: {md}")
        for topic in md['topics']:
            for p in topic['partitions']:
                for r in p['replicas']:
                    id = r['id']
                    if id not in node_replicas:
                        node_replicas[id] = 0
                    node_replicas[id] += 1

        return node_replicas

    def restart_node(node_id, cleanup=True):
        self.logger.info(f"restarting node: {node_id}")
        self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
        if cleanup:
            self.redpanda.clean_node(self.redpanda.nodes[node_id - 1])
        self.redpanda.start_node(self.redpanda.nodes[node_id - 1])
        admin = Admin(self.redpanda)
        admin.set_log_level("cluster", "trace")

        def has_new_replicas():
            per_node = replicas_per_node()
            self.logger.info(f"replicas per node: {per_node}")
            return node_id in per_node

        wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)

    admin = Admin(self.redpanda)
    admin.set_log_level("cluster", "trace")
    work = self.generate_random_workload(10, skip_nodes=set())
    self.redpanda.logger.info(f"node operations to execute: {work}")
    for op in work:
        op_type = op[0]
        self.logger.info(f"executing - {op}")
        if op_type == ADD:
            id = op[1]
            restart_node(id)
        if op_type == DECOMMISSION:
            id = op[1]
            decommission(id)
        elif op_type == ADD_TOPIC:
            spec = TopicSpec(name=op[1],
                             replication_factor=op[2],
                             partition_count=op[3])
            self.redpanda.create_topic(spec)
        elif op_type == DELETE_TOPIC:
            self.redpanda.delete_topic(op[1])

    self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
def test_node_operations(self, enable_failures):
    # allocate 5 nodes for the cluster
    self.redpanda = RedpandaService(
        self.test_context,
        5,
        extra_rp_conf={
            "enable_auto_rebalance_on_node_add": True,
            "group_topic_partitions": 3,
            "default_topic_replications": 3,
        })

    self.redpanda.start()
    # create some topics
    topics = self._create_random_topics(10)
    self.redpanda.logger.info(f"using topics: {topics}")
    # select one of the topics to use in consumer/producer
    self.topic = random.choice(topics).name

    self.start_producer(1, throughput=100)
    self.start_consumer(1)
    self.await_startup()
    self.active_nodes = set(
        [self.redpanda.idx(n) for n in self.redpanda.nodes])
    # collect current mapping
    self.ids_mapping = {}
    for n in self.redpanda.nodes:
        self.ids_mapping[self.redpanda.idx(n)] = self.redpanda.idx(n)
    self.next_id = sorted(list(self.ids_mapping.keys()))[-1] + 1
    self.redpanda.logger.info(f"Initial ids mapping: {self.ids_mapping}")
    NODE_OP_TIMEOUT = 360

    def get_next_id():
        id = self.next_id
        self.next_id += 1
        return id

    def failure_injector_loop():
        f_injector = FailureInjector(self.redpanda)
        while enable_failures:
            f_type = random.choice(FailureSpec.FAILURE_TYPES)
            length = 0
            # allow suspending any node
            if f_type == FailureSpec.FAILURE_SUSPEND:
                length = random.randint(
                    1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                node = random.choice(self.redpanda.nodes)
            else:
                # kill/terminate only active nodes (not to influence the test outcome)
                idx = random.choice(list(self.active_nodes))
                node = self.redpanda.get_node(idx)

            f_injector.inject_failure(
                FailureSpec(node=node, type=f_type, length=length))

            delay = random.randint(
                NodeOperationFuzzyTest.min_inter_failure_time,
                NodeOperationFuzzyTest.max_inter_failure_time)
            self.redpanda.logger.info(
                f"waiting {delay} seconds before next failure")
            time.sleep(delay)

    if enable_failures:
        finjector_thread = threading.Thread(target=failure_injector_loop,
                                            args=())
        finjector_thread.daemon = True
        finjector_thread.start()

    def decommission(idx):
        node_id = self.ids_mapping[idx]
        self.logger.info(f"decommissioning node: {idx} with id: {node_id}")

        def decommissioned():
            try:
                admin = Admin(self.redpanda)
                # if broker is already draining, it is a success
                brokers = admin.get_brokers()
                for b in brokers:
                    if b['node_id'] == node_id and b[
                            'membership_status'] == 'draining':
                        return True

                r = admin.decommission_broker(id=node_id)
                return r.status_code == 200
            except requests.exceptions.RetryError:
                return False
            except requests.exceptions.ConnectionError:
                return False
            except requests.exceptions.HTTPError:
                return False

        wait_until(decommissioned,
                   timeout_sec=NODE_OP_TIMEOUT,
                   backoff_sec=2)

        admin = Admin(self.redpanda)

        def is_node_removed(idx_to_query, node_id):
            try:
                brokers = admin.get_brokers(
                    self.redpanda.get_node(idx_to_query))
                ids = map(lambda broker: broker['node_id'], brokers)
                return not node_id in ids
            except:
                return False

        def node_removed():
            node_removed_cnt = 0
            for idx in self.active_nodes:
                if is_node_removed(idx, node_id):
                    node_removed_cnt += 1

            node_count = len(self.redpanda.nodes)
            majority = int(node_count / 2) + 1
            self.redpanda.logger.debug(
                f"node {node_id} removed on {node_removed_cnt} nodes, majority: {majority}"
            )
            return node_removed_cnt >= majority

        wait_until(node_removed,
                   timeout_sec=NODE_OP_TIMEOUT,
                   backoff_sec=2)
        self.redpanda.stop_node(self.redpanda.get_node(idx))

    kafkacat = KafkaCat(self.redpanda)

    def replicas_per_node():
        node_replicas = {}
        md = kafkacat.metadata()
        self.redpanda.logger.info(f"metadata: {md}")
        for topic in md['topics']:
            for p in topic['partitions']:
                for r in p['replicas']:
                    id = r['id']
                    if id not in node_replicas:
                        node_replicas[id] = 0
                    node_replicas[id] += 1

        return node_replicas

    def seed_servers_for(idx):
        seeds = map(
            lambda n: {
                "address": n.account.hostname,
                "port": 33145
            }, self.redpanda.nodes)

        return list(
            filter(
                lambda n: n['address'] != self.redpanda.get_node(
                    idx).account.hostname, seeds))

    def add_node(idx, cleanup=True):
        id = get_next_id()
        self.logger.info(f"adding node: {idx} back with new id: {id}")
        self.ids_mapping[idx] = id
        self.redpanda.stop_node(self.redpanda.get_node(idx))
        if cleanup:
            self.redpanda.clean_node(self.redpanda.get_node(idx),
                                     preserve_logs=True)
        # we do not reuse previous node ids and override seed server list
        self.redpanda.start_node(
            self.redpanda.get_node(idx),
            timeout=NodeOperationFuzzyTest.min_inter_failure_time +
            NodeOperationFuzzyTest.max_suspend_duration_seconds + 30,
            override_cfg_params={
                "node_id": id,
                "seed_servers": seed_servers_for(idx)
            })

        def has_new_replicas():
            per_node = replicas_per_node()
            self.logger.info(f"replicas per node: {per_node}")
            return id in per_node

        wait_until(has_new_replicas,
                   timeout_sec=NODE_OP_TIMEOUT,
                   backoff_sec=2)

    def is_topic_present(name):
        kcl = KCL(self.redpanda)
        lines = kcl.list_topics().splitlines()
        self.redpanda.logger.debug(
            f"checking if topic {name} is present in {lines}")
        for l in lines:
            if l.startswith(name):
                return True
        return False

    def create_topic(spec):
        try:
            DefaultClient(self.redpanda).create_topic(spec)
        except Exception as e:
            self.redpanda.logger.warn(
                f"error creating topic {spec.name} - {e}")
        try:
            return is_topic_present(spec.name)
        except Exception as e:
            self.redpanda.logger.warn(f"error while listing topics - {e}")
            return False

    def delete_topic(name):
        try:
            DefaultClient(self.redpanda).delete_topic(name)
        except Exception as e:
            self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
        try:
            return not is_topic_present(name)
        except Exception as e:
            self.redpanda.logger.warn(f"error while listing topics - {e}")
            return False

    work = self.generate_random_workload(10,
                                         skip_nodes=set(),
                                         available_nodes=self.active_nodes)
    self.redpanda.logger.info(f"node operations to execute: {work}")
    for op in work:
        op_type = op[0]
        self.logger.info(
            f"executing - {op} - current ids: {self.ids_mapping}")
        if op_type == ADD:
            idx = op[1]
            self.active_nodes.add(idx)
            add_node(idx)
        if op_type == DECOMMISSION:
            idx = op[1]
            self.active_nodes.remove(idx)
            decommission(idx)
        elif op_type == ADD_TOPIC:
            spec = TopicSpec(name=op[1],
                             replication_factor=op[2],
                             partition_count=op[3])
            wait_until(lambda: create_topic(spec) == True,
                       timeout_sec=180,
                       backoff_sec=2)
        elif op_type == DELETE_TOPIC:
            wait_until(lambda: delete_topic(op[1]) == True,
                       timeout_sec=180,
                       backoff_sec=2)

    enable_failures = False
    self.run_validation(enable_idempotence=False,
                        producer_timeout_sec=60,
                        consumer_timeout_sec=180)
def test_node_operations(self, enable_failures):
    # allocate 5 nodes for the cluster
    self.redpanda = RedpandaService(
        self.test_context,
        5,
        KafkaCliTools,
        extra_rp_conf={
            "enable_auto_rebalance_on_node_add": True,
            "group_topic_partitions": 3,
            "default_topic_replications": 3,
        })
    self.active_nodes = set([1, 2, 3, 4, 5])
    self.redpanda.start()
    # create some topics
    topics = self._create_random_topics(10)
    self.redpanda.logger.info(f"using topics: {topics}")
    # select one of the topics to use in consumer/producer
    self.topic = random.choice(topics).name

    self.start_producer(1, throughput=100)
    self.start_consumer(1)
    self.await_startup()
    NODE_OP_TIMEOUT = 360

    def failure_injector_loop():
        f_injector = FailureInjector(self.redpanda)
        while enable_failures:
            f_type = random.choice(FailureSpec.FAILURE_TYPES)
            length = 0
            # allow suspending any node
            if f_type == FailureSpec.FAILURE_SUSPEND:
                length = random.randint(1, 10)
                node = random.choice(self.redpanda.nodes)
            else:
                # kill/terminate only active nodes (not to influence the test outcome)
                idx = random.choice(list(self.active_nodes)) - 1
                node = self.redpanda.nodes[idx]

            f_injector.inject_failure(
                FailureSpec(node=node, type=f_type, length=length))

            delay = random.randint(20, 45)
            self.redpanda.logger.info(
                f"waiting {delay} seconds before next failure")
            time.sleep(delay)

    if enable_failures:
        finjector_thread = threading.Thread(target=failure_injector_loop,
                                            args=())
        finjector_thread.daemon = True
        finjector_thread.start()

    def decommission(node_id):
        self.logger.info(f"decommissioning node: {node_id}")

        def decommissioned():
            try:
                admin = Admin(self.redpanda)
                # if broker is already draining, it is a success
                brokers = admin.get_brokers()
                for b in brokers:
                    if b['node_id'] == node_id and b[
                            'membership_status'] == 'draining':
                        return True

                r = admin.decommission_broker(id=node_id)
                return r.status_code == 200
            except requests.exceptions.RetryError:
                return False
            except requests.exceptions.ConnectionError:
                return False
            except requests.exceptions.HTTPError:
                return False

        wait_until(decommissioned,
                   timeout_sec=NODE_OP_TIMEOUT,
                   backoff_sec=2)

        def node_removed():
            admin = Admin(self.redpanda)
            try:
                brokers = admin.get_brokers(node=self.redpanda.nodes[0])
                for b in brokers:
                    if b['node_id'] == node_id:
                        return False
                return True
            except:
                return False

        wait_until(node_removed,
                   timeout_sec=NODE_OP_TIMEOUT,
                   backoff_sec=2)

    kafkacat = KafkaCat(self.redpanda)

    def replicas_per_node():
        node_replicas = {}
        md = kafkacat.metadata()
        self.redpanda.logger.info(f"metadata: {md}")
        for topic in md['topics']:
            for p in topic['partitions']:
                for r in p['replicas']:
                    id = r['id']
                    if id not in node_replicas:
                        node_replicas[id] = 0
                    node_replicas[id] += 1

        return node_replicas

    def restart_node(node_id, cleanup=True):
        self.logger.info(f"restarting node: {node_id}")
        self.redpanda.stop_node(self.redpanda.nodes[node_id - 1])
        if cleanup:
            self.redpanda.clean_node(self.redpanda.nodes[node_id - 1],
                                     preserve_logs=True)
        self.redpanda.start_node(self.redpanda.nodes[node_id - 1])

        def has_new_replicas():
            per_node = replicas_per_node()
            self.logger.info(f"replicas per node: {per_node}")
            return node_id in per_node

        wait_until(has_new_replicas,
                   timeout_sec=NODE_OP_TIMEOUT,
                   backoff_sec=2)

    def is_topic_present(name):
        kcl = KCL(self.redpanda)
        lines = kcl.list_topics().splitlines()
        self.redpanda.logger.debug(
            f"checking if topic {name} is present in {lines}")
        for l in lines:
            if l.startswith(name):
                return True
        return False

    def create_topic(spec):
        try:
            self.redpanda.create_topic(spec)
        except Exception as e:
            self.redpanda.logger.warn(
                f"error creating topic {spec.name} - {e}")
        try:
            return is_topic_present(spec.name)
        except Exception as e:
            self.redpanda.logger.warn(f"error while listing topics - {e}")
            return False

    def delete_topic(name):
        try:
            self.redpanda.delete_topic(name)
        except Exception as e:
            self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
        try:
            return not is_topic_present(name)
        except Exception as e:
            self.redpanda.logger.warn(f"error while listing topics - {e}")
            return False

    work = self.generate_random_workload(10, skip_nodes=set())
    self.redpanda.logger.info(f"node operations to execute: {work}")
    for op in work:
        op_type = op[0]
        self.logger.info(f"executing - {op}")
        if op_type == ADD:
            id = op[1]
            self.active_nodes.add(id)
            restart_node(id)
        if op_type == DECOMMISSION:
            id = op[1]
            self.active_nodes.remove(id)
            decommission(id)
        elif op_type == ADD_TOPIC:
            spec = TopicSpec(name=op[1],
                             replication_factor=op[2],
                             partition_count=op[3])
            wait_until(lambda: create_topic(spec) == True,
                       timeout_sec=180,
                       backoff_sec=2)
        elif op_type == DELETE_TOPIC:
            wait_until(lambda: delete_topic(op[1]) == True,
                       timeout_sec=180,
                       backoff_sec=2)

    enable_failures = False
    self.run_validation(enable_idempotence=False,
                        producer_timeout_sec=60,
                        consumer_timeout_sec=180)
def _get_leaders_by_node(self):
    kc = KafkaCat(self.redpanda)
    md = kc.metadata()
    topic = next(filter(lambda t: t["topic"] == self.topic, md["topics"]))
    leaders = (p["leader"] for p in topic["partitions"])
    return collections.Counter(leaders)
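# Hypothetical usage sketch (not from the original): with the Counter returned
# above, leadership balance can be checked by comparing the most and least
# loaded nodes. The tolerance value is an illustrative choice.
def leaders_roughly_balanced(leaders_by_node, tolerance=1):
    counts = list(leaders_by_node.values())
    return max(counts) - min(counts) <= tolerance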