class RedpandaTest(Test): """ Base class for tests that use the Redpanda service. """ # List of topics to be created automatically when the cluster starts. Each # topic is defined by an instance of a TopicSpec. topics = () def __init__(self, test_context, num_brokers=3, extra_rp_conf=dict(), topics=None, log_level='info'): super(RedpandaTest, self).__init__(test_context) self.redpanda = RedpandaService(test_context, num_brokers=num_brokers, extra_rp_conf=extra_rp_conf, topics=self.topics, log_level=log_level) @property def topic(self): """ Return the name of the auto-created initial topic. Accessing this property requires exactly one initial topic be configured. """ assert len(self.topics) == 1 return self.topics[0].name def setUp(self): self.redpanda.start()
class RedpandaTest(Test): """ Base class for tests that use the Redpanda service. """ # List of topics to be created automatically when the cluster starts. Each # topic is defined by an instance of a TopicSpec. topics = [] def __init__(self, test_context, num_brokers=3, extra_rp_conf=dict(), enable_pp=False, enable_sr=False, num_cores=3): super(RedpandaTest, self).__init__(test_context) self.scale = Scale(test_context) self.redpanda = RedpandaService(test_context, num_brokers, extra_rp_conf=extra_rp_conf, enable_pp=enable_pp, enable_sr=enable_sr, num_cores=num_cores) self._client = DefaultClient(self.redpanda) @property def topic(self): """ Return the name of the auto-created initial topic. Accessing this property requires exactly one initial topic be configured. """ assert len(self.topics) == 1 return self.topics[0].name def setUp(self): self.redpanda.start() self._create_initial_topics() def client(self): return self._client def _create_initial_topics(self): config = self.redpanda.security_config() user = config.get("sasl_plain_username") passwd = config.get("sasl_plain_password") client = KafkaCliTools(self.redpanda, user=user, passwd=passwd) for spec in self.topics: self.logger.debug(f"Creating initial topic {spec}") client.create_topic(spec)
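# A minimal sketch of how a concrete test might build on the RedpandaTest base
# class above: declare `topics` so setUp() auto-creates them, then use the
# `topic` property. The import paths and the KafkaCliTools.produce() call are
# assumptions based on the snippets in this file, not a verbatim example.
from rptest.clients.types import TopicSpec                 # assumed path
from rptest.clients.kafka_cli_tools import KafkaCliTools   # assumed path
from rptest.tests.redpanda_test import RedpandaTest        # assumed path


class ExampleTopicTest(RedpandaTest):
    # created automatically by RedpandaTest.setUp() via _create_initial_topics()
    topics = (TopicSpec(partition_count=3, replication_factor=3), )

    def test_produce_to_initial_topic(self):
        # `self.topic` asserts there is exactly one initial topic
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_tools.produce(self.topic, 1000, 1024)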
class RedpandaTest(Test):
    """
    Base class for tests that use the Redpanda service.
    """
    def __init__(self,
                 test_context,
                 num_brokers=3,
                 extra_rp_conf=dict(),
                 topics=None,
                 log_level='info'):
        super(RedpandaTest, self).__init__(test_context)

        self.redpanda = RedpandaService(test_context,
                                        num_brokers=num_brokers,
                                        extra_rp_conf=extra_rp_conf,
                                        topics=topics,
                                        log_level=log_level)

    def setUp(self):
        self.redpanda.start()
class EndToEndShadowIndexingBase(EndToEndTest):
    segment_size = 1048576  # 1 MiB
    s3_topic_name = "panda-topic"
    num_brokers = 3

    topics = (TopicSpec(
        name=s3_topic_name,
        partition_count=1,
        replication_factor=3,
    ), )

    def __init__(self, test_context, extra_rp_conf=None):
        super(EndToEndShadowIndexingBase,
              self).__init__(test_context=test_context)

        self.test_context = test_context
        self.topic = EndToEndShadowIndexingTest.s3_topic_name

        self.si_settings = SISettings(
            cloud_storage_reconciliation_interval_ms=500,
            cloud_storage_max_connections=5,
            log_segment_size=EndToEndShadowIndexingTest.segment_size,
        )
        self.s3_bucket_name = self.si_settings.cloud_storage_bucket
        self.si_settings.load_context(self.logger, test_context)
        self.scale = Scale(test_context)

        self.redpanda = RedpandaService(context=self.test_context,
                                        num_brokers=self.num_brokers,
                                        si_settings=self.si_settings,
                                        extra_rp_conf=extra_rp_conf)
        self.kafka_tools = KafkaCliTools(self.redpanda)

    def setUp(self):
        self.redpanda.start()
        for topic in EndToEndShadowIndexingBase.topics:
            self.kafka_tools.create_topic(topic)

    def tearDown(self):
        self.s3_client.empty_bucket(self.s3_bucket_name)
class NodeOperationFuzzyTest(EndToEndTest): def generate_random_workload(self, count, skip_nodes): op_types = [ADD, DECOMMISSION] tp_op_types = [ADD_TOPIC, DELETE_TOPIC] # current state active_nodes = [1, 2, 3, 4, 5] decommissioned_nodes = [] operations = [] topics = [] def eligible_active_nodes(): return list( filter(lambda n: not (n == 1 or n in skip_nodes), active_nodes)) def decommission(id): active_nodes.remove(id) decommissioned_nodes.append(id) def add(id): active_nodes.append(id) decommissioned_nodes.remove(id) for _ in range(0, count): if len(decommissioned_nodes) == 2: id = random.choice(decommissioned_nodes) operations.append((ADD, id)) add(id) elif len(decommissioned_nodes) == 0: id = random.choice(eligible_active_nodes()) operations.append((DECOMMISSION, id)) decommission(id) else: op = random.choice(op_types) if op == DECOMMISSION: id = random.choice(eligible_active_nodes()) operations.append((DECOMMISSION, id)) decommission(id) elif op == ADD: id = random.choice(decommissioned_nodes) operations.append((ADD, id)) add(id) # topic operation if len(topics) == 0: op = ADD_TOPIC else: op = random.choice(tp_op_types) if op == ADD_TOPIC: operations.append(( ADD_TOPIC, f"test-topic-{random.randint(0,2000)}-{round(time.time()*1000000)}", random.choice(ALLOWED_REPLICATION), 3)) else: operations.append((DELETE_TOPIC, random.choice(topics))) return operations def _create_random_topics(self, count): max_partitions = 10 topics = [] for i in range(0, count): name = f"topic-{i}" spec = TopicSpec( name=name, partition_count=random.randint(1, max_partitions), replication_factor=random.choice(ALLOWED_REPLICATION)) topics.append(spec) for spec in topics: self.redpanda.create_topic(spec) return topics """ Adding nodes to the cluster should result in partition reallocations to new nodes """ @cluster(num_nodes=7) @parametrize(enable_failures=True) @parametrize(enable_failures=False) def test_node_opeartions(self, enable_failures): # allocate 5 nodes for the cluster self.redpanda = RedpandaService( self.test_context, 5, KafkaCliTools, extra_rp_conf={ "enable_auto_rebalance_on_node_add": True, "group_topic_partitions": 3, "default_topic_replications": 3, }) self.active_nodes = set([1, 2, 3, 4, 5]) self.redpanda.start() # create some topics topics = self._create_random_topics(10) self.redpanda.logger.info(f"using topics: {topics}") # select one of the topics to use in consumer/producer self.topic = random.choice(topics).name self.start_producer(1, throughput=100) self.start_consumer(1) self.await_startup() NODE_OP_TIMEOUT = 360 def failure_injector_loop(): f_injector = FailureInjector(self.redpanda) while enable_failures: f_type = random.choice(FailureSpec.FAILURE_TYPES) length = 0 # allow suspending any node if f_type == FailureSpec.FAILURE_SUSPEND: length = random.randint(1, 10) node = random.choice(self.redpanda.nodes) else: #kill/termianate only active nodes (not to influence the test outcome) idx = random.choice(list(self.active_nodes)) - 1 node = self.redpanda.nodes[idx] f_injector.inject_failure( FailureSpec(node=node, type=f_type, length=length)) delay = random.randint(20, 45) self.redpanda.logger.info( f"waiting {delay} seconds before next failure") time.sleep(delay) if enable_failures: finjector_thread = threading.Thread(target=failure_injector_loop, args=()) finjector_thread.daemon = True finjector_thread.start() def decommission(node_id): self.logger.info(f"decommissioning node: {node_id}") def decommissioned(): try: admin = Admin(self.redpanda) # if broker is already draining, it is 
suceess brokers = admin.get_brokers() for b in brokers: if b['node_id'] == node_id and b[ 'membership_status'] == 'draining': return True r = admin.decommission_broker(id=node_id) return r.status_code == 200 except requests.exceptions.RetryError: return False except requests.exceptions.ConnectionError: return False except requests.exceptions.HTTPError: return False wait_until(decommissioned, timeout_sec=NODE_OP_TIMEOUT, backoff_sec=2) def node_removed(): admin = Admin(self.redpanda) try: brokers = admin.get_brokers(node=self.redpanda.nodes[0]) for b in brokers: if b['node_id'] == node_id: return False return True except: return False wait_until(node_removed, timeout_sec=NODE_OP_TIMEOUT, backoff_sec=2) kafkacat = KafkaCat(self.redpanda) def replicas_per_node(): node_replicas = {} md = kafkacat.metadata() self.redpanda.logger.info(f"metadata: {md}") for topic in md['topics']: for p in topic['partitions']: for r in p['replicas']: id = r['id'] if id not in node_replicas: node_replicas[id] = 0 node_replicas[id] += 1 return node_replicas def restart_node(node_id, cleanup=True): self.logger.info(f"restarting node: {node_id}") self.redpanda.stop_node(self.redpanda.nodes[node_id - 1]) if cleanup: self.redpanda.clean_node(self.redpanda.nodes[node_id - 1], preserve_logs=True) self.redpanda.start_node(self.redpanda.nodes[node_id - 1]) def has_new_replicas(): per_node = replicas_per_node() self.logger.info(f"replicas per node: {per_node}") return node_id in per_node wait_until(has_new_replicas, timeout_sec=NODE_OP_TIMEOUT, backoff_sec=2) def is_topic_present(name): kcl = KCL(self.redpanda) lines = kcl.list_topics().splitlines() self.redpanda.logger.debug( f"checking if topic {name} is present in {lines}") for l in lines: if l.startswith(name): return True return False def create_topic(spec): try: self.redpanda.create_topic(spec) except Exception as e: self.redpanda.logger.warn( f"error creating topic {spec.name} - {e}") try: return is_topic_present(spec.name) except Exception as e: self.redpanda.logger.warn(f"error while listing topics - {e}") return False def delete_topic(name): try: self.redpanda.delete_topic(name) except Exception as e: self.redpanda.logger.warn(f"error deleting topic {name} - {e}") try: return not is_topic_present(name) except Exception as e: self.redpanda.logger.warn(f"error while listing topics - {e}") return False work = self.generate_random_workload(10, skip_nodes=set()) self.redpanda.logger.info(f"node operations to execute: {work}") for op in work: op_type = op[0] self.logger.info(f"executing - {op}") if op_type == ADD: id = op[1] self.active_nodes.add(id) restart_node(id) if op_type == DECOMMISSION: id = op[1] self.active_nodes.remove(id) decommission(id) elif op_type == ADD_TOPIC: spec = TopicSpec(name=op[1], replication_factor=op[2], partition_count=op[3]) wait_until(lambda: create_topic(spec) == True, timeout_sec=180, backoff_sec=2) elif op_type == DELETE_TOPIC: wait_until(lambda: delete_topic(op[1]) == True, timeout_sec=180, backoff_sec=2) enable_failures = False self.run_validation(enable_idempotence=False, producer_timeout_sec=60, consumer_timeout_sec=180)
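# The workload generator above interleaves node and topic operations while
# keeping at most two nodes decommissioned at a time and never decommissioning
# node 1. A standalone, simplified sketch of that invariant (the names and
# probabilities here are illustrative, not part of the test suite):
import random

ADD, DECOMMISSION = "add", "decommission"


def generate_node_ops(count, nodes=(1, 2, 3, 4, 5), seed=None):
    rnd = random.Random(seed)
    active = list(nodes)
    decommissioned = []
    ops = []
    for _ in range(count):
        eligible = [n for n in active if n != 1]
        if len(decommissioned) == 2 or (decommissioned and rnd.random() < 0.5):
            # must (or may) add a previously decommissioned node back
            node = rnd.choice(decommissioned)
            decommissioned.remove(node)
            active.append(node)
            ops.append((ADD, node))
        else:
            # decommission one of the eligible active nodes
            node = rnd.choice(eligible)
            active.remove(node)
            decommissioned.append(node)
            ops.append((DECOMMISSION, node))
    return ops


if __name__ == "__main__":
    print(generate_node_ops(10, seed=42))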
class NodeOperationFuzzyTest(EndToEndTest): def generate_random_workload(self, count, skip_nodes): op_types = [ADD, DECOMMISSION] tp_op_types = [ADD_TOPIC, DELETE_TOPIC] # current state active_nodes = [1, 2, 3, 4, 5] decommissioned_nodes = [] operations = [] topics = [] def eligible_active_nodes(): return list( filter(lambda n: not (n == 1 or n in skip_nodes), active_nodes)) def decommission(id): active_nodes.remove(id) decommissioned_nodes.append(id) def add(id): active_nodes.append(id) decommissioned_nodes.remove(id) for _ in range(0, count): if len(decommissioned_nodes) == 2: id = random.choice(decommissioned_nodes) operations.append((ADD, id)) add(id) elif len(decommissioned_nodes) == 0: id = random.choice(eligible_active_nodes()) operations.append((DECOMMISSION, id)) decommission(id) else: op = random.choice(op_types) if op == DECOMMISSION: id = random.choice(eligible_active_nodes()) operations.append((DECOMMISSION, id)) decommission(id) elif op == ADD: id = random.choice(decommissioned_nodes) operations.append((ADD, id)) add(id) # topic operation if len(topics) == 0: op = ADD_TOPIC else: op = random.choice(tp_op_types) if op == ADD_TOPIC: operations.append(( ADD_TOPIC, f"test-topic-{random.randint(0,2000)}-{time.time()*1000.0}", random.choice(ALLOWED_REPLICATION), 3)) else: operations.append((DELETE_TOPIC, random.choice(topics))) return operations def _create_random_topics(self, count): max_partitions = 10 topics = [] for i in range(0, count): name = f"topic-{i}" spec = TopicSpec( name=name, partition_count=random.randint(1, max_partitions), replication_factor=random.choice(ALLOWED_REPLICATION)) topics.append(spec) for spec in topics: self.redpanda.create_topic(spec) return topics """ Adding nodes to the cluster should result in partition reallocations to new nodes """ @cluster(num_nodes=7) def test_node_opeartions(self): # allocate 5 nodes for the cluster self.redpanda = RedpandaService( self.test_context, 5, KafkaCliTools, extra_rp_conf={ "enable_auto_rebalance_on_node_add": True, "group_topic_partitions": 3, "default_topic_replications": 3, }) # start 3 nodes self.redpanda.start() # create some topics topics = self._create_random_topics(10) self.redpanda.logger.info(f"using topics: {topics}") # select one of the topics to use in consumer/producer self.topic = random.choice(topics).name self.start_producer(1, throughput=100) self.start_consumer(1) self.await_startup() def decommission(node_id): self.logger.info(f"decommissioning node: {node_id}") admin = Admin(self.redpanda) admin.decommission_broker(id=node_id) def node_removed(): admin = Admin(self.redpanda) brokers = admin.get_brokers() for b in brokers: if b['node_id'] == node_id: return False return True wait_until(node_removed, timeout_sec=240, backoff_sec=2) kafkacat = KafkaCat(self.redpanda) def replicas_per_node(): node_replicas = {} md = kafkacat.metadata() self.redpanda.logger.info(f"metadata: {md}") for topic in md['topics']: for p in topic['partitions']: for r in p['replicas']: id = r['id'] if id not in node_replicas: node_replicas[id] = 0 node_replicas[id] += 1 return node_replicas def restart_node(node_id, cleanup=True): self.logger.info(f"restarting node: {node_id}") self.redpanda.stop_node(self.redpanda.nodes[node_id - 1]) if cleanup: self.redpanda.clean_node(self.redpanda.nodes[node_id - 1]) self.redpanda.start_node(self.redpanda.nodes[node_id - 1]) admin = Admin(self.redpanda) admin.set_log_level("cluster", "trace") def has_new_replicas(): per_node = replicas_per_node() self.logger.info(f"replicas per node: 
{per_node}") return node_id in per_node wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2) admin = Admin(self.redpanda) admin.set_log_level("cluster", "trace") work = self.generate_random_workload(10, skip_nodes=set()) self.redpanda.logger.info(f"node operations to execute: {work}") for op in work: op_type = op[0] self.logger.info(f"executing - {op}") if op_type == ADD: id = op[1] restart_node(id) if op_type == DECOMMISSION: id = op[1] decommission(id) elif op_type == ADD_TOPIC: spec = TopicSpec(name=op[1], replication_factor=op[2], partition_count=op[3]) self.redpanda.create_topic(spec) elif op_type == DELETE_TOPIC: self.redpanda.delete_topic(op[1]) self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
class FetchAfterDeleteTest(Test):
    def __init__(self, test_context):
        super(FetchAfterDeleteTest, self).__init__(test_context)
        self.scale = Scale(test_context)

    @cluster(num_nodes=3)
    @parametrize(transactions_enabled=True)
    @parametrize(transactions_enabled=False)
    def test_fetch_after_committed_offset_was_removed(self,
                                                      transactions_enabled):
        """
        Test fetching when the consumer's committed offset was deleted by retention
        """
        segment_size = 1048576
        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_transactions": transactions_enabled,
                "enable_idempotence": transactions_enabled,
                "log_compaction_interval_ms": 5000,
                "log_segment_size": segment_size,
                "enable_leader_balancer": False,
            })
        self.redpanda.start()
        topic = TopicSpec(partition_count=1,
                          replication_factor=3,
                          cleanup_policy=TopicSpec.CLEANUP_DELETE)
        self.redpanda.create_topic(topic)
        self.topic = topic.name

        kafka_tools = KafkaCliTools(self.redpanda)

        # produce until enough segments exist on disk
        produce_until_segments(
            self.redpanda,
            topic=self.topic,
            partition_idx=0,
            count=10,
        )
        consumer_group = 'test'
        rpk = RpkTool(self.redpanda)

        def consume(n=1):
            out = rpk.consume(self.topic, group=consumer_group, n=n)
            split = out.split('}')
            split = filter(lambda s: "{" in s, split)
            return map(lambda s: json.loads(s + "}"), split)

        # consume from the beginning
        msgs = consume(10)
        last = list(msgs).pop()
        offset = last['offset']

        # change retention bytes so that older segments get removed
        kafka_tools.alter_topic_config(
            self.topic, {
                TopicSpec.PROPERTY_RETENTION_BYTES: 2 * segment_size,
            })

        wait_for_segments_removal(self.redpanda,
                                  self.topic,
                                  partition_idx=0,
                                  count=5)

        partitions = list(rpk.describe_topic(self.topic))
        p = partitions[0]
        assert p.start_offset > offset
        # consume from an offset that no longer exists;
        # the previously committed offset has already been removed
        out = list(consume(1))
        assert out[0]['offset'] == p.start_offset
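# The consume() helper above splits rpk's concatenated JSON records on '}' and
# re-parses each fragment. A standalone sketch of that parsing step over a
# made-up payload (the field names mirror the ones used in the test):
import json


def parse_concatenated_json(out):
    # split on '}', drop whitespace-only fragments, re-append the brace
    pieces = filter(lambda s: "{" in s, out.split('}'))
    return [json.loads(s + "}") for s in pieces]


sample = ('{"topic": "t", "partition": 0, "offset": 41}'
          '{"topic": "t", "partition": 0, "offset": 42}')
records = parse_concatenated_json(sample)
assert records[-1]["offset"] == 42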
class ScalingUpTest(EndToEndTest): """ Adding nodes to the cluster should result in partition reallocations to new nodes """ @cluster(num_nodes=5) def test_adding_nodes_to_cluster(self): self.redpanda = RedpandaService( self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1}) # start single node cluster self.redpanda.start(nodes=[self.redpanda.nodes[0]]) # create some topics topics = [] # include __consumer_offsets topic replica total_replicas = 1 for partition_count in range(1, 5): name = f"topic{len(topics)}" spec = TopicSpec(name=name, partition_count=partition_count, replication_factor=1) total_replicas += partition_count topics.append(spec) for spec in topics: DefaultClient(self.redpanda).create_topic(spec) self.topic = spec.name self.start_producer(1) self.start_consumer(1) self.await_startup() # add second node self.redpanda.start_node(self.redpanda.nodes[1]) kafkacat = KafkaCat(self.redpanda) def _replicas_per_node(): node_replicas = {} md = kafkacat.metadata() self.redpanda.logger.info(f"metadata: {md}") for topic in md['topics']: for p in topic['partitions']: for r in p['replicas']: id = r['id'] if id not in node_replicas: node_replicas[id] = 0 node_replicas[id] += 1 return node_replicas def partitions_rebalanced(): per_node = _replicas_per_node() self.redpanda.logger.info(f"replicas per node: {per_node}") if len(per_node) < len(self.redpanda.started_nodes()): return False replicas = sum(per_node.values()) if replicas != total_replicas: return False return all(p[1] > 1 for p in per_node.items()) wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1) # add third node self.redpanda.start_node(self.redpanda.nodes[2]) wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1) self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
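# The scaling and fuzz tests above both count replicas per node from kafkacat
# metadata to decide whether partitions moved to a new node. A standalone
# sketch of that aggregation over a hand-written metadata dict shaped like the
# one the tests log:
def replicas_per_node(metadata):
    node_replicas = {}
    for topic in metadata['topics']:
        for p in topic['partitions']:
            for r in p['replicas']:
                node_replicas[r['id']] = node_replicas.get(r['id'], 0) + 1
    return node_replicas


sample_md = {
    'topics': [{
        'topic': 'topic0',
        'partitions': [{
            'partition': 0,
            'replicas': [{'id': 1}, {'id': 2}, {'id': 3}],
        }, {
            'partition': 1,
            'replicas': [{'id': 1}, {'id': 2}, {'id': 3}],
        }],
    }],
}
assert replicas_per_node(sample_md) == {1: 2, 2: 2, 3: 2}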
class EndToEndTest(Test): """ Test for common pattern: - Produce and consume in the background - Perform some action (e.g. partition movement) - Run validation """ def __init__(self, test_context, extra_rp_conf=None): super(EndToEndTest, self).__init__(test_context=test_context) if extra_rp_conf is None: self._extra_rp_conf = {} else: self._extra_rp_conf = extra_rp_conf self.records_consumed = [] self.last_consumed_offsets = {} self.redpanda = None self.topic = None self._client = None def start_redpanda(self, num_nodes=1, extra_rp_conf=None): if extra_rp_conf is not None: # merge both configurations, the extra_rp_conf passed in # paramter takes the precedence self._extra_rp_conf = {**self._extra_rp_conf, **extra_rp_conf} assert self.redpanda is None self.redpanda = RedpandaService(self.test_context, num_nodes, extra_rp_conf=self._extra_rp_conf) self.redpanda.start() self._client = DefaultClient(self.redpanda) def client(self): assert self._client is not None return self._client def start_consumer(self, num_nodes=1, group_id="test_group"): assert self.redpanda assert self.topic self.consumer = VerifiableConsumer( self.test_context, num_nodes=num_nodes, redpanda=self.redpanda, topic=self.topic, group_id=group_id, on_record_consumed=self.on_record_consumed) self.consumer.start() def start_producer(self, num_nodes=1, throughput=1000): assert self.redpanda assert self.topic self.producer = VerifiableProducer( self.test_context, num_nodes=num_nodes, redpanda=self.redpanda, topic=self.topic, throughput=throughput, message_validator=is_int_with_prefix) self.producer.start() def on_record_consumed(self, record, node): partition = TopicPartition(record["topic"], record["partition"]) record_id = record["value"] offset = record["offset"] self.last_consumed_offsets[partition] = offset self.records_consumed.append(record_id) def await_consumed_offsets(self, last_acked_offsets, timeout_sec): def has_finished_consuming(): for partition, offset in last_acked_offsets.items(): if not partition in self.last_consumed_offsets: return False last_commit = self.consumer.last_commit(partition) if not last_commit or last_commit <= offset: self.logger.debug( f"waiting for partition {partition} offset {offset} to be committed, last committed offset: {last_commit}" ) return False return True wait_until(has_finished_consuming, timeout_sec=timeout_sec, err_msg="Consumer failed to consume up to offsets %s after waiting %ds." %\ (str(last_acked_offsets), timeout_sec)) def _collect_all_logs(self): for s in self.test_context.services: self.mark_for_collect(s) def await_startup(self, min_records=5, timeout_sec=30): try: wait_until(lambda: self.consumer.total_consumed() >= min_records, timeout_sec=timeout_sec, err_msg="Timed out after %ds while awaiting initial record delivery of %d records" %\ (timeout_sec, min_records)) except BaseException: self._collect_all_logs() raise def run_validation(self, min_records=5000, producer_timeout_sec=30, consumer_timeout_sec=30, enable_idempotence=False): try: wait_until(lambda: self.producer.num_acked > min_records, timeout_sec=producer_timeout_sec, err_msg="Producer failed to produce messages for %ds." 
%\ producer_timeout_sec) self.logger.info("Stopping producer after writing up to offsets %s" %\ str(self.producer.last_acked_offsets)) self.producer.stop() self.await_consumed_offsets(self.producer.last_acked_offsets, consumer_timeout_sec) self.consumer.stop() self.validate(enable_idempotence) except BaseException: self._collect_all_logs() raise def validate(self, enable_idempotence): self.logger.info("Number of acked records: %d" % len(self.producer.acked)) self.logger.info("Number of consumed records: %d" % len(self.records_consumed)) success = True msg = "" # Correctness of the set difference operation depends on using equivalent # message_validators in producer and consumer missing = set(self.producer.acked) - set(self.records_consumed) if len(missing) > 0: success = False msg = annotate_missing_msgs(missing, self.producer.acked, self.records_consumed, msg) # Are there duplicates? if len(set(self.records_consumed)) != len(self.records_consumed): num_duplicates = abs( len(set(self.records_consumed)) - len(self.records_consumed)) if enable_idempotence: success = False msg += "Detected %d duplicates even though idempotence was enabled.\n" % num_duplicates else: msg += "(There are also %d duplicate messages in the log - but that is an acceptable outcome)\n" % num_duplicates # Collect all logs if validation fails if not success: self._collect_all_logs() assert success, msg
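# A minimal sketch of the "produce and consume in the background, perform an
# action, then validate" pattern that the EndToEndTest base class above
# provides. The import paths and the @cluster decorator location are
# assumptions about this repository's layout:
from rptest.clients.types import TopicSpec         # assumed path
from rptest.services.cluster import cluster        # assumed path
from rptest.tests.end_to_end import EndToEndTest   # assumed path


class ExampleEndToEndTest(EndToEndTest):
    @cluster(num_nodes=5)
    def test_round_trip(self):
        self.start_redpanda(num_nodes=3)
        spec = TopicSpec(partition_count=3, replication_factor=3)
        self.client().create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=1000)
        self.start_consumer(1)
        self.await_startup()

        # ... perform some action here (restart a node, move a partition, ...)

        self.run_validation(min_records=5000, consumer_timeout_sec=120)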
class RedpandaTest(Test): """ Base class for tests that use the Redpanda service. """ # List of topics to be created automatically when the cluster starts. Each # topic is defined by an instance of a TopicSpec. topics: Sequence[TopicSpec] = [] def __init__(self, test_context, num_brokers=None, extra_rp_conf=dict(), enable_pp=False, enable_sr=False, si_settings=None, **kwargs): """ Any trailing keyword arguments are passed through to the RedpandaService constructor. """ super(RedpandaTest, self).__init__(test_context) self.scale = Scale(test_context) self.si_settings = si_settings if num_brokers is None: # Default to a 3 node cluster if sufficient nodes are available, else # a single node cluster. This is just a default: tests are welcome # to override constructor to pass an explicit size. This logic makes # it convenient to mix 3 node and 1 node cases in the same class, by # just modifying the @cluster node count per test. if test_context.cluster.available().size() >= 3: num_brokers = 3 else: num_brokers = 1 if self.si_settings: self.si_settings.load_context(self.logger, test_context) self.redpanda = RedpandaService(test_context, num_brokers, extra_rp_conf=extra_rp_conf, enable_pp=enable_pp, enable_sr=enable_sr, si_settings=self.si_settings, **kwargs) self._client = DefaultClient(self.redpanda) @property def topic(self): """ Return the name of the auto-created initial topic. Accessing this property requires exactly one initial topic be configured. """ assert len(self.topics) == 1 return self.topics[0].name @property def debug_mode(self): """ Useful for tests that want to change behaviour when running on the much slower debug builds of redpanda, which generally cannot keep up with significant quantities of data or partition counts. """ return os.environ.get('BUILD_TYPE', None) == 'debug' @property def ci_mode(self): """ Useful for tests that want to dynamically degrade/disable on low-resource developer environments (e.g. laptops) but apply stricter checks in CI. """ return os.environ.get('CI', None) != 'false' @property def s3_client(self): return self.redpanda.s3_client def setUp(self): self.redpanda.start() self._create_initial_topics() def client(self): return self._client def _create_initial_topics(self): config = self.redpanda.security_config() user = config.get("sasl_plain_username") passwd = config.get("sasl_plain_password") client = KafkaCliTools(self.redpanda, user=user, passwd=passwd) for spec in self.topics: self.logger.debug(f"Creating initial topic {spec}") client.create_topic(spec)
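# A short sketch of how a test might use the debug_mode/ci_mode properties
# above to degrade its workload on slow debug builds; the message counts and
# the import path are illustrative assumptions only:
from rptest.tests.redpanda_test import RedpandaTest  # assumed path


class ExampleScaledTest(RedpandaTest):
    def test_scaled_workload(self):
        # debug builds generally cannot keep up with the full message count
        msg_count = 1000 if self.debug_mode else 100000
        self.logger.info(f"producing {msg_count} messages "
                         f"(debug={self.debug_mode}, ci={self.ci_mode})")
        # ... produce msg_count messages and validate ...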
class TestMirrorMakerService(EndToEndTest): kafka_source = "kafka" redpanda_source = "redpanda" def __init__(self, test_context): super(TestMirrorMakerService, self).__init__(test_context) self.topic = TopicSpec(replication_factor=3) # create single zookeeper node for Kafka self.zk = ZookeeperService(self.test_context, num_nodes=1, version=V_3_0_0) self.source_broker = None def setUp(self): self.zk.start() def tearDown(self): # ducktape handle service teardown automatically, but it is hard # to tell what went wrong if one of the services hangs. Do it # explicitly here with some logging, to enable debugging issues # like https://github.com/redpanda-data/redpanda/issues/4270 if self.source_broker is not None: self.logger.info( f"Stopping source broker ({self.source_broker.__class__.__name__})..." ) self.source_broker.stop() self.logger.info( f"Awaiting source broker ({self.source_broker.__class__.__name__})..." ) self.logger.info("Stopping zookeeper...") self.zk.stop() self.logger.info("Awaiting zookeeper...") def start_brokers(self, source_type=kafka_source): if source_type == TestMirrorMakerService.redpanda_source: self.source_broker = RedpandaService(self.test_context, num_brokers=3) else: self.source_broker = KafkaServiceAdapter( self.test_context, KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=V_3_0_0)) self.redpanda = RedpandaService(self.test_context, num_brokers=3) self.source_broker.start() self.redpanda.start() self.source_client = DefaultClient(self.source_broker) self.topic.partition_count = 1000 if self.redpanda.dedicated_nodes else 1 self.source_client.create_topic(self.topic) def start_workload(self): self.consumer = VerifiableConsumer( self.test_context, num_nodes=1, redpanda=self.redpanda, topic=self.topic.name, group_id='consumer_test_group', on_record_consumed=self.on_record_consumed) self.consumer.start() self.producer = VerifiableProducer( self.test_context, num_nodes=1, redpanda=self.source_broker, topic=self.topic.name, throughput=1000, message_validator=is_int_with_prefix) self.producer.start() def wait_for_n_messages(self, n_messages=100): """Wait for a minimum number of messages to be successfully produced.""" wait_until( lambda: self.producer.num_acked > n_messages, timeout_sec=10, err_msg= "Producer failed to produce %d messages in a reasonable amount of time." 
% n_messages) @cluster(num_nodes=10) @parametrize(source_type=kafka_source) @parametrize(source_type=redpanda_source) def test_simple_end_to_end(self, source_type): # start brokers self.start_brokers(source_type=source_type) # start mirror maker self.mirror_maker = MirrorMaker2(self.test_context, num_nodes=1, source_cluster=self.source_broker, target_cluster=self.redpanda) topics = [] for i in range(0, 10): topics.append( TopicSpec(partition_count=random.randint(1, 10), retention_bytes=random.randint(100000000, 300000000), retention_ms=random.randint(1 * 3600000, 10 * 3600000))) self.source_client.create_topic(topics) self.mirror_maker.start() # start source producer & target consumer self.start_workload() self.run_validation(consumer_timeout_sec=120) self.mirror_maker.stop() target_client = DefaultClient(self.redpanda) for t in topics: desc = target_client.describe_topic(t.name) self.logger.debug(f'source topic: {t}, target topic: {desc}') assert len(desc.partitions) == t.partition_count @cluster(num_nodes=9) @parametrize(source_type=kafka_source) @parametrize(source_type=redpanda_source) def test_consumer_group_mirroring(self, source_type): # start redpanda self.start_brokers(source_type=source_type) consumer_group = "test-group-1" # start mirror maker self.mirror_maker = MirrorMaker2(self.test_context, num_nodes=1, source_cluster=self.source_broker, target_cluster=self.redpanda, consumer_group_pattern=consumer_group, log_level="TRACE") self.mirror_maker.start() msg_size = 512 msg_cnt = 1000000 if self.redpanda.dedicated_nodes else 100 # produce some messages to source redpanda producer = RpkProducer(self.test_context, self.source_broker, self.topic.name, msg_size, msg_cnt, acks=-1) producer.start() producer.wait() producer.free() # consume some messages from source redpanda consumer = RpkConsumer(self.test_context, self.source_broker, self.topic.name, ignore_errors=False, retries=3, group=consumer_group, save_msgs=False, num_msgs=int(msg_cnt / 5)) consumer.start() consumer.wait() consumer.stop() source_messages = consumer.messages self.logger.info(f"source message count: {len(source_messages)}") consumer.free() src_rpk = RpkTool(self.source_broker) source_group = src_rpk.group_describe(consumer_group) target_rpk = RpkTool(self.redpanda) def target_group_equal(): try: target_group = target_rpk.group_describe(consumer_group) except RpkException as e: # e.g. COORDINATOR_NOT_AVAILABLE self.logger.info(f"Error describing target cluster group: {e}") return False self.logger.info( f"source {source_group}, target_group: {target_group}") return target_group.partitions == source_group.partitions and target_group.name == source_group.name # wait for consumer group sync timeout = 600 if self.redpanda.dedicated_nodes else 60 wait_until(target_group_equal, timeout_sec=timeout, backoff_sec=5) self.mirror_maker.stop()
class LibrdkafkaTest(Test): """ Execute the librdkafka test suite against redpanda. """ TESTS_DIR = "/opt/librdkafka/tests" CONF_FILE = os.path.join(TESTS_DIR, "test.conf") def __init__(self, context): super(LibrdkafkaTest, self).__init__(context) self._context = context self._extra_rp_conf = dict( auto_create_topics_enabled=True, default_topic_partitions=4, ) self._redpanda = None def _start_redpanda(self, num_brokers): self._redpanda = RedpandaService(self._context, num_brokers, extra_rp_conf=self._extra_rp_conf) self._redpanda.start() with open(LibrdkafkaTest.CONF_FILE, "w") as f: brokers = self._redpanda.brokers() f.write("metadata.broker.list={}".format(brokers)) def teardown(self): self._redpanda.stop() os.remove(LibrdkafkaTest.CONF_FILE) # yapf: disable @cluster(num_nodes=3) @ignore(test_num=19, num_brokers=1) # segfault in group membership https://app.clubhouse.io/vectorized/story/963/dereferencing-null-pointer-in-kafka-group-membership @ignore(test_num=52, num_brokers=1) # https://app.clubhouse.io/vectorized/story/997/librdkafka-tests-failing-due-to-consumer-out-of-range-timestamps @ignore(test_num=54, num_brokers=1) # timequery issues: https://app.clubhouse.io/vectorized/story/995/librdkafka-offset-time-query @ignore(test_num=63, num_brokers=1) # cluster-id: https://app.clubhouse.io/vectorized/story/939/generate-cluster-id-uuid-on-bootstrap-and-expose-through-metadata-request @ignore(test_num=67, num_brokers=1) # empty topic offset edge case: https://app.clubhouse.io/vectorized/story/940/consuming-from-empty-topic-should-return-eof @ignore(test_num=77, num_brokers=1) # topic compaction settings: https://app.clubhouse.io/vectorized/story/999/support-create-topic-configurations-for-compaction-retention-policies @ignore(test_num=92, num_brokers=1) # no support for v2 -> v1 message version conversion in the broker @ignore(test_num=44, num_brokers=1) # we do not support runtime changes to topic partition count @ignore(test_num=69, num_brokers=1) # we do not support runtime changes to topic partition count @ignore(test_num=81, num_brokers=1) # we do not support replica assignment @ignore(test_num=61, num_brokers=1) # transactions @ignore(test_num=76, num_brokers=1) # idempotent producer @ignore(test_num=86, num_brokers=1) # idempotent producer @ignore(test_num=90, num_brokers=1) # idempotent producer @ignore(test_num=94, num_brokers=1) # idempotent producer @ignore(test_num=98, num_brokers=1) # transactions # @ignore appears to not be quite smart enough to handle the partial # parameterization so we repeat them here with num_brokers=3. this would be # a nice enhancement that we could upstream. 
@ignore(test_num=19, num_brokers=3) @ignore(test_num=52, num_brokers=3) @ignore(test_num=54, num_brokers=3) @ignore(test_num=63, num_brokers=3) @ignore(test_num=67, num_brokers=3) @ignore(test_num=77, num_brokers=3) @ignore(test_num=92, num_brokers=3) @ignore(test_num=44, num_brokers=3) @ignore(test_num=69, num_brokers=3) @ignore(test_num=81, num_brokers=3) @ignore(test_num=61, num_brokers=3) @ignore(test_num=76, num_brokers=3) @ignore(test_num=86, num_brokers=3) @ignore(test_num=90, num_brokers=3) @ignore(test_num=94, num_brokers=3) @ignore(test_num=98, num_brokers=3) @matrix(test_num=range(101), num_brokers=[1, 3]) # yapf: enable def test_librdkafka(self, test_num, num_brokers): self._start_redpanda(num_brokers) p = subprocess.Popen(["make"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, cwd=LibrdkafkaTest.TESTS_DIR, env=dict(TESTS="%04d" % test_num, **os.environ)) for line in iter(p.stdout.readline, b''): self.logger.debug(line.rstrip()) p.wait() if p.returncode != 0: raise RuntimeError("librdkafka test failed {}".format( p.returncode))
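# The librdkafka runner above selects a single numbered test through the TESTS
# environment variable (zero-padded to four digits) and streams the child
# process output line by line. A standalone sketch of that subprocess pattern,
# using the harmless `env` command in place of make:
import os
import subprocess

test_num = 19
env = dict(TESTS="%04d" % test_num, **os.environ)
p = subprocess.Popen(["env"],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.STDOUT,
                     env=env)
for line in iter(p.stdout.readline, b''):
    if line.startswith(b"TESTS="):
        print(line.rstrip().decode())  # -> TESTS=0019
p.wait()
assert p.returncode == 0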
class ClusterViewTest(EndToEndTest):
    @cluster(num_nodes=3)
    def test_view_changes_on_add(self):
        self.redpanda = RedpandaService(self.test_context, 3)

        # start a single node cluster
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])
        admin = Admin(self.redpanda)

        seed = None

        def rp1_started():
            nonlocal seed
            try:
                # {"version": 0, "brokers": [{"node_id": 1, "num_cores": 3, "membership_status": "active", "is_alive": true}]}
                seed = admin.get_cluster_view(self.redpanda.nodes[0])
                self.redpanda.logger.info(
                    f"view from {self.redpanda.nodes[0]}: {json.dumps(seed)}")
                return len(seed["brokers"]) == 1
            except requests.exceptions.RequestException as e:
                self.redpanda.logger.debug(f"admin API isn't available ({e})")
                return False

        wait_until(
            rp1_started,
            timeout_sec=30,
            backoff_sec=1,
            err_msg=f"Can't get cluster view from {self.redpanda.nodes[0]}")

        self.redpanda.start_node(self.redpanda.nodes[1])
        self.redpanda.start_node(self.redpanda.nodes[2])

        def rest_started():
            try:
                last = None
                ids = None
                for i in range(0, 3):
                    view = admin.get_cluster_view(self.redpanda.nodes[i])
                    self.redpanda.logger.info(
                        f"view from {self.redpanda.nodes[i]}: {json.dumps(view)}"
                    )
                    if view["version"] <= seed["version"]:
                        return False
                    if len(view["brokers"]) != 3:
                        return False
                    if last is None:
                        last = view
                        ids = set(
                            map(lambda broker: broker["node_id"],
                                view["brokers"]))
                    if last["version"] != view["version"]:
                        return False
                    if not ids.issubset(
                            map(lambda broker: broker["node_id"],
                                view["brokers"])):
                        return False
                return True
            except requests.exceptions.RequestException as e:
                self.redpanda.logger.debug(f"admin API isn't available ({e})")
                return False

        wait_until(
            rest_started,
            timeout_sec=30,
            backoff_sec=1,
            err_msg=f"Can't get cluster view from {self.redpanda.nodes}")
class EndToEndShadowIndexingTest(EndToEndTest): segment_size = 1048576 # 1 Mb s3_host_name = "minio-s3" s3_access_key = "panda-user" s3_secret_key = "panda-secret" s3_region = "panda-region" s3_topic_name = "panda-topic" topics = (TopicSpec( name=s3_topic_name, partition_count=1, replication_factor=3, ), ) def __init__(self, test_context): super(EndToEndShadowIndexingTest, self).__init__(test_context=test_context) self.s3_bucket_name = f"panda-bucket-{uuid.uuid1()}" self.topic = EndToEndShadowIndexingTest.s3_topic_name self._extra_rp_conf = dict( cloud_storage_enabled=True, cloud_storage_enable_remote_read=True, cloud_storage_enable_remote_write=True, cloud_storage_access_key=EndToEndShadowIndexingTest.s3_access_key, cloud_storage_secret_key=EndToEndShadowIndexingTest.s3_secret_key, cloud_storage_region=EndToEndShadowIndexingTest.s3_region, cloud_storage_bucket=self.s3_bucket_name, cloud_storage_disable_tls=True, cloud_storage_api_endpoint=EndToEndShadowIndexingTest.s3_host_name, cloud_storage_api_endpoint_port=9000, cloud_storage_reconciliation_interval_ms=500, cloud_storage_max_connections=5, log_segment_size=EndToEndShadowIndexingTest.segment_size, # 1MB ) self.scale = Scale(test_context) self.redpanda = RedpandaService( context=test_context, num_brokers=3, extra_rp_conf=self._extra_rp_conf, ) self.kafka_tools = KafkaCliTools(self.redpanda) self.s3_client = S3Client( region=EndToEndShadowIndexingTest.s3_region, access_key=EndToEndShadowIndexingTest.s3_access_key, secret_key=EndToEndShadowIndexingTest.s3_secret_key, endpoint=f"http://{EndToEndShadowIndexingTest.s3_host_name}:9000", logger=self.logger, ) def setUp(self): self.s3_client.empty_bucket(self.s3_bucket_name) self.s3_client.create_bucket(self.s3_bucket_name) self.redpanda.start() for topic in EndToEndShadowIndexingTest.topics: self.kafka_tools.create_topic(topic) def tearDown(self): self.s3_client.empty_bucket(self.s3_bucket_name) @cluster(num_nodes=5) def test_write(self): """Write at least 10 segments, set retention policy to leave only 5 segments, wait for segments removal, consume data and run validation, that everything that is acked is consumed.""" self.start_producer() produce_until_segments( redpanda=self.redpanda, topic=self.topic, partition_idx=0, count=10, ) self.kafka_tools.alter_topic_config( self.topic, { TopicSpec.PROPERTY_RETENTION_BYTES: 5 * EndToEndShadowIndexingTest.segment_size, }, ) wait_for_segments_removal(redpanda=self.redpanda, topic=self.topic, partition_idx=0, count=6) self.start_consumer() self.run_validation()
class ConsumerOffsetsMigrationTest(EndToEndTest): max_suspend_duration_sec = 3 min_inter_failure_time_sec = 30 max_inter_failure_time_sec = 60 @cluster(num_nodes=7, log_allow_list=CHAOS_LOG_ALLOW_LIST) @matrix(failures=[True, False], cpus=[1, 3]) def test_migrating_consume_offsets(self, failures, cpus): ''' Validates correctness while executing consumer offsets migration ''' # set redpanda logical version to value without __consumer_offsets support self.redpanda = RedpandaService( self.test_context, 5, resource_settings=ResourceSettings(num_cpus=cpus), extra_rp_conf={ "group_topic_partitions": 16, "default_topic_replications": 3, }, environment={"__REDPANDA_LOGICAL_VERSION": 1}) self.redpanda.start() self._client = DefaultClient(self.redpanda) # set of failure suppressed nodes - required to make restarts deterministic suppressed = set() def failure_injector_loop(): f_injector = FailureInjector(self.redpanda) while failures: f_type = random.choice(FailureSpec.FAILURE_TYPES) length = 0 node = random.choice(self.redpanda.nodes) while self.redpanda.idx(node) in suppressed: node = random.choice(self.redpanda.nodes) # allow suspending any node if f_type == FailureSpec.FAILURE_SUSPEND: length = random.randint( 1, ConsumerOffsetsMigrationTest.max_suspend_duration_sec) f_injector.inject_failure( FailureSpec(node=node, type=f_type, length=length)) delay = random.randint( ConsumerOffsetsMigrationTest.min_inter_failure_time_sec, ConsumerOffsetsMigrationTest.max_inter_failure_time_sec) self.redpanda.logger.info( f"waiting {delay} seconds before next failure") time.sleep(delay) if failures: finjector_thread = threading.Thread(target=failure_injector_loop, args=()) finjector_thread.daemon = True finjector_thread.start() spec = TopicSpec(partition_count=6, replication_factor=3) self.client().create_topic(spec) self.topic = spec.name self.start_producer(1, throughput=5000) self.start_consumer(1) self.await_startup() def cluster_is_stable(): admin = Admin(self.redpanda) brokers = admin.get_brokers() if len(brokers) < 3: return False for b in brokers: self.logger.debug(f"broker: {b}") if not (b['is_alive'] and 'disk_space' in b): return False return True kcl = KCL(self.redpanda) def _group_present(): return len(kcl.list_groups().splitlines()) > 1 # make sure that group is there wait_until(_group_present, 10, 1) # check that consumer offsets topic is not present topics = set(kcl.list_topics()) assert "__consumer_offsets" not in topics # enable consumer offsets support self.redpanda.set_environment({"__REDPANDA_LOGICAL_VERSION": 2}) for n in self.redpanda.nodes: id = self.redpanda.idx(n) suppressed.add(id) self.redpanda.restart_nodes(n, stop_timeout=60) suppressed.remove(id) # wait for leader balancer to start evening out leadership wait_until(cluster_is_stable, 90, backoff_sec=2) def _consumer_offsets_present(): try: partitions = list( self.client().describe_topic("__consumer_offsets")) return len(partitions) > 0 except: return False wait_until(_consumer_offsets_present, timeout_sec=90, backoff_sec=3) self.run_validation(min_records=100000, producer_timeout_sec=300, consumer_timeout_sec=180) @cluster(num_nodes=5, log_allow_list=RESTART_LOG_ALLOW_LIST) def test_cluster_is_available_during_upgrade_without_group_topic(self): ''' Validates that cluster is available and healthy during upgrade when `kafka_internal::group` topic is not present ''' # set redpanda logical version to value without __consumer_offsets support self.redpanda = RedpandaService( self.test_context, 5, extra_rp_conf={ "group_topic_partitions": 
16, "default_topic_replications": 3, }, environment={"__REDPANDA_LOGICAL_VERSION": 1}) self.redpanda.start() self._client = DefaultClient(self.redpanda) spec = TopicSpec(partition_count=6, replication_factor=3) self.client().create_topic(spec) self.topic = spec.name def cluster_is_stable(): admin = Admin(self.redpanda) brokers = admin.get_brokers() if len(brokers) < 3: return False for b in brokers: self.logger.debug(f"broker: {b}") if not (b['is_alive'] and 'disk_space' in b): return False return True def node_stopped(node_id): admin = Admin(self.redpanda) brokers = admin.get_brokers() for b in brokers: self.logger.debug(f"broker: {b}") if b['node_id'] == node_id: return b['is_alive'] == False return False kcl = KCL(self.redpanda) # check that consumer offsets topic is not present topics = set(kcl.list_topics()) assert "__consumer_offsets" not in topics # enable consumer offsets support self.redpanda.set_environment({"__REDPANDA_LOGICAL_VERSION": 2}) def get_raft0_follower(): ctrl = self.redpanda.controller node = random.choice(self.redpanda.nodes) while self.redpanda.idx(node) == self.redpanda.idx(ctrl): node = random.choice(self.redpanda.nodes) return node # restart node that is not controller n = get_raft0_follower() self.logger.info(f"restarting node {n.account.hostname}") self.redpanda.stop_node(n, timeout=60) # wait for leader balancer to start evening out leadership wait_until(lambda: node_stopped(self.redpanda.idx(n)), 90, backoff_sec=2) self.redpanda.start_node(n) wait_until(cluster_is_stable, 90, backoff_sec=2)
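# Several tests above run a failure-injection loop in a daemon thread that is
# stopped by flipping a flag before validation (the fuzz tests rebind
# enable_failures, the migration test uses the `failures` parameter). A
# standalone sketch of that thread pattern, with the injection replaced by a
# log line so it runs anywhere:
import random
import threading
import time

stop = False


def failure_injector_loop():
    while not stop:
        delay = random.uniform(0.1, 0.3)
        print(f"injecting failure, next one in {delay:.2f}s")
        time.sleep(delay)


t = threading.Thread(target=failure_injector_loop, daemon=True)
t.start()
time.sleep(1)
stop = True  # the tests flip their flag before calling run_validation()
t.join(timeout=1)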
class AvailabilityTests(EndToEndFinjectorTest): def validate_records(self): min_records = 40000 producer_timeout_sec = 60 consumer_timeout_sec = 60 if self.scale.ci or self.scale.release: min_records = 100000 producer_timeout_sec = 180 consumer_timeout_sec = 180 self.run_validation(min_records=min_records, enable_idempotence=False, producer_timeout_sec=producer_timeout_sec, consumer_timeout_sec=consumer_timeout_sec) @cluster(num_nodes=5) def test_availability_when_one_node_failed(self): self.redpanda = RedpandaService( self.test_context, 3, KafkaCliTools, extra_rp_conf={ "enable_auto_rebalance_on_node_add": True, "group_topic_partitions": 1, "default_topic_replications": 3, }) self.redpanda.start() spec = TopicSpec(name="test-topic", partition_count=6, replication_factor=3) self.redpanda.create_topic(spec) self.topic = spec.name self.start_producer(1, throughput=10000) self.start_consumer(1) self.await_startup() # start failure injector with default parameters self.start_finjector() self.validate_records() @cluster(num_nodes=5) def test_recovery_after_catastrophic_failure(self): self.redpanda = RedpandaService( self.test_context, 3, KafkaCliTools, extra_rp_conf={ "enable_auto_rebalance_on_node_add": True, "group_topic_partitions": 1, "default_topic_replications": 3, }) self.redpanda.start() spec = TopicSpec(name="test-topic", partition_count=6, replication_factor=3) self.redpanda.create_topic(spec) self.topic = spec.name self.start_producer(1, throughput=10000) self.start_consumer(1) self.await_startup() # inject permanent random failure f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES), random.choice(self.redpanda.nodes[0:1])) self.inject_failure(f_spec) # inject transient failure on other node f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES), self.redpanda.nodes[2], length=2.0 if self.scale.local else 15.0) self.inject_failure(f_spec) self.validate_records()
class NodeOperationFuzzyTest(EndToEndTest): max_suspend_duration_seconds = 10 min_inter_failure_time = 30 max_inter_failure_time = 60 def generate_random_workload(self, count, skip_nodes, available_nodes): op_types = [ADD, DECOMMISSION] tp_op_types = [ADD_TOPIC, DELETE_TOPIC] # current state active_nodes = list(available_nodes) decommissioned_nodes = [] operations = [] topics = [] def eligible_active_nodes(): return list( filter(lambda n: not (n == 1 or n in skip_nodes), active_nodes)) def decommission(id): active_nodes.remove(id) decommissioned_nodes.append(id) def add(id): active_nodes.append(id) decommissioned_nodes.remove(id) for _ in range(0, count): if len(decommissioned_nodes) == 2: id = random.choice(decommissioned_nodes) operations.append((ADD, id)) add(id) elif len(decommissioned_nodes) == 0: id = random.choice(eligible_active_nodes()) operations.append((DECOMMISSION, id)) decommission(id) else: op = random.choice(op_types) if op == DECOMMISSION: id = random.choice(eligible_active_nodes()) operations.append((DECOMMISSION, id)) decommission(id) elif op == ADD: id = random.choice(decommissioned_nodes) operations.append((ADD, id)) add(id) # topic operation if len(topics) == 0: op = ADD_TOPIC else: op = random.choice(tp_op_types) if op == ADD_TOPIC: operations.append(( ADD_TOPIC, f"test-topic-{random.randint(0,2000)}-{round(time.time()*1000000)}", random.choice(ALLOWED_REPLICATION), 3)) else: operations.append((DELETE_TOPIC, random.choice(topics))) return operations def _create_random_topics(self, count): max_partitions = 10 topics = [] for i in range(0, count): name = f"topic-{i}" spec = TopicSpec( name=name, partition_count=random.randint(1, max_partitions), replication_factor=random.choice(ALLOWED_REPLICATION)) topics.append(spec) for spec in topics: DefaultClient(self.redpanda).create_topic(spec) return topics """ Adding nodes to the cluster should result in partition reallocations to new nodes """ @cluster(num_nodes=7, log_allow_list=CHAOS_LOG_ALLOW_LIST) @parametrize(enable_failures=True) @parametrize(enable_failures=False) def test_node_operations(self, enable_failures): # allocate 5 nodes for the cluster self.redpanda = RedpandaService( self.test_context, 5, extra_rp_conf={ "enable_auto_rebalance_on_node_add": True, "group_topic_partitions": 3, "default_topic_replications": 3, }) self.redpanda.start() # create some topics topics = self._create_random_topics(10) self.redpanda.logger.info(f"using topics: {topics}") # select one of the topics to use in consumer/producer self.topic = random.choice(topics).name self.start_producer(1, throughput=100) self.start_consumer(1) self.await_startup() self.active_nodes = set( [self.redpanda.idx(n) for n in self.redpanda.nodes]) # collect current mapping self.ids_mapping = {} for n in self.redpanda.nodes: self.ids_mapping[self.redpanda.idx(n)] = self.redpanda.idx(n) self.next_id = sorted(list(self.ids_mapping.keys()))[-1] + 1 self.redpanda.logger.info(f"Initial ids mapping: {self.ids_mapping}") NODE_OP_TIMEOUT = 360 def get_next_id(): id = self.next_id self.next_id += 1 return id def failure_injector_loop(): f_injector = FailureInjector(self.redpanda) while enable_failures: f_type = random.choice(FailureSpec.FAILURE_TYPES) length = 0 # allow suspending any node if f_type == FailureSpec.FAILURE_SUSPEND: length = random.randint( 1, NodeOperationFuzzyTest.max_suspend_duration_seconds) node = random.choice(self.redpanda.nodes) else: #kill/termianate only active nodes (not to influence the test outcome) idx = 
random.choice(list(self.active_nodes)) node = self.redpanda.get_node(idx) f_injector.inject_failure( FailureSpec(node=node, type=f_type, length=length)) delay = random.randint( NodeOperationFuzzyTest.min_inter_failure_time, NodeOperationFuzzyTest.max_inter_failure_time) self.redpanda.logger.info( f"waiting {delay} seconds before next failure") time.sleep(delay) if enable_failures: finjector_thread = threading.Thread(target=failure_injector_loop, args=()) finjector_thread.daemon = True finjector_thread.start() def decommission(idx): node_id = self.ids_mapping[idx] self.logger.info(f"decommissioning node: {idx} with id: {node_id}") def decommissioned(): try: admin = Admin(self.redpanda) # if broker is already draining, it is suceess brokers = admin.get_brokers() for b in brokers: if b['node_id'] == node_id and b[ 'membership_status'] == 'draining': return True r = admin.decommission_broker(id=node_id) return r.status_code == 200 except requests.exceptions.RetryError: return False except requests.exceptions.ConnectionError: return False except requests.exceptions.HTTPError: return False wait_until(decommissioned, timeout_sec=NODE_OP_TIMEOUT, backoff_sec=2) admin = Admin(self.redpanda) def is_node_removed(idx_to_query, node_id): try: brokers = admin.get_brokers( self.redpanda.get_node(idx_to_query)) ids = map(lambda broker: broker['node_id'], brokers) return not node_id in ids except: return False def node_removed(): node_removed_cnt = 0 for idx in self.active_nodes: if is_node_removed(idx, node_id): node_removed_cnt += 1 node_count = len(self.redpanda.nodes) majority = int(node_count / 2) + 1 self.redpanda.logger.debug( f"node {node_id} removed on {node_removed_cnt} nodes, majority: {majority}" ) return node_removed_cnt >= majority wait_until(node_removed, timeout_sec=NODE_OP_TIMEOUT, backoff_sec=2) self.redpanda.stop_node(self.redpanda.get_node(idx)) kafkacat = KafkaCat(self.redpanda) def replicas_per_node(): node_replicas = {} md = kafkacat.metadata() self.redpanda.logger.info(f"metadata: {md}") for topic in md['topics']: for p in topic['partitions']: for r in p['replicas']: id = r['id'] if id not in node_replicas: node_replicas[id] = 0 node_replicas[id] += 1 return node_replicas def seed_servers_for(idx): seeds = map( lambda n: { "address": n.account.hostname, "port": 33145 }, self.redpanda.nodes) return list( filter( lambda n: n['address'] != self.redpanda.get_node(idx). 
account.hostname, seeds)) def add_node(idx, cleanup=True): id = get_next_id() self.logger.info(f"adding node: {idx} back with new id: {id}") self.ids_mapping[idx] = id self.redpanda.stop_node(self.redpanda.get_node(idx)) if cleanup: self.redpanda.clean_node(self.redpanda.get_node(idx), preserve_logs=True) # we do not reuse previous node ids and override seed server list self.redpanda.start_node( self.redpanda.get_node(idx), timeout=NodeOperationFuzzyTest.min_inter_failure_time + NodeOperationFuzzyTest.max_suspend_duration_seconds + 30, override_cfg_params={ "node_id": id, "seed_servers": seed_servers_for(idx) }) def has_new_replicas(): per_node = replicas_per_node() self.logger.info(f"replicas per node: {per_node}") return id in per_node wait_until(has_new_replicas, timeout_sec=NODE_OP_TIMEOUT, backoff_sec=2) def is_topic_present(name): kcl = KCL(self.redpanda) lines = kcl.list_topics().splitlines() self.redpanda.logger.debug( f"checking if topic {name} is present in {lines}") for l in lines: if l.startswith(name): return True return False def create_topic(spec): try: DefaultClient(self.redpanda).create_topic(spec) except Exception as e: self.redpanda.logger.warn( f"error creating topic {spec.name} - {e}") try: return is_topic_present(spec.name) except Exception as e: self.redpanda.logger.warn(f"error while listing topics - {e}") return False def delete_topic(name): try: DefaultClient(self.redpanda).delete_topic(name) except Exception as e: self.redpanda.logger.warn(f"error deleting topic {name} - {e}") try: return not is_topic_present(name) except Exception as e: self.redpanda.logger.warn(f"error while listing topics - {e}") return False work = self.generate_random_workload(10, skip_nodes=set(), available_nodes=self.active_nodes) self.redpanda.logger.info(f"node operations to execute: {work}") for op in work: op_type = op[0] self.logger.info( f"executing - {op} - current ids: {self.ids_mapping}") if op_type == ADD: idx = op[1] self.active_nodes.add(idx) add_node(idx) if op_type == DECOMMISSION: idx = op[1] self.active_nodes.remove(idx) decommission(idx) elif op_type == ADD_TOPIC: spec = TopicSpec(name=op[1], replication_factor=op[2], partition_count=op[3]) wait_until(lambda: create_topic(spec) == True, timeout_sec=180, backoff_sec=2) elif op_type == DELETE_TOPIC: wait_until(lambda: delete_topic(op[1]) == True, timeout_sec=180, backoff_sec=2) enable_failures = False self.run_validation(enable_idempotence=False, producer_timeout_sec=60, consumer_timeout_sec=180)
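# The fuzz test above never reuses node ids: when a node comes back it gets a
# fresh id via get_next_id() and the seed-server list excludes the node itself.
# A standalone sketch of that id bookkeeping (simplified, illustrative names):
ids_mapping = {idx: idx for idx in (1, 2, 3, 4, 5)}
next_id = max(ids_mapping) + 1


def get_next_id():
    global next_id
    allocated, next_id = next_id, next_id + 1
    return allocated


def add_node(idx):
    # a re-added node is always registered under a brand new id
    ids_mapping[idx] = get_next_id()
    return ids_mapping[idx]


assert add_node(3) == 6
assert add_node(3) == 7
assert ids_mapping == {1: 1, 2: 2, 3: 7, 4: 4, 5: 5}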
class MetricsReporterTest(Test):
    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        super(MetricsReporterTest, self).__init__(test_context=test_ctx)

    @cluster(num_nodes=4)
    def test_redpanda_metrics_reporting(self):
        """
        Validates that the metrics reporter periodically posts cluster
        metadata to the configured HTTP endpoint.
        """
        # setup http server
        http = HttpServer(self._ctx)
        http.start()

        # report every two seconds
        extra_conf = {
            "health_monitor_tick_interval": 1000,
            "metrics_reporter_tick_interval": 2000,
            "metrics_reporter_report_interval": 1000,
            "enable_metrics_reporter": True,
            "metrics_reporter_url": f"{http.url}/metrics",
        }
        self.redpanda = RedpandaService(self.test_context,
                                        3,
                                        KafkaCliTools,
                                        extra_rp_conf=extra_conf)
        self.redpanda.start()

        # create some topics
        total_topics = 5
        total_partitions = 0
        for _ in range(0, total_topics):
            partitions = random.randint(1, 8)
            total_partitions += partitions
            self.redpanda.create_topic(
                [TopicSpec(partition_count=partitions, replication_factor=3)])

        self.redpanda.logger.info(
            f"created {total_topics} topics with {total_partitions} partitions"
        )

        def _state_up_to_date():
            if http.requests:
                r = json.loads(http.requests[-1]['body'])
                return r['topic_count'] == total_topics
            return False

        wait_until(_state_up_to_date, 20, backoff_sec=1)
        http.stop()
        metadata = [json.loads(r['body']) for r in http.requests]
        for m in metadata:
            self.redpanda.logger.info(m)

        def assert_fields_are_the_same(metadata, field):
            assert all(m[field] == metadata[0][field] for m in metadata)

        # cluster uuid and creation timestamp should stay the same across requests
        assert_fields_are_the_same(metadata, 'cluster_uuid')
        assert_fields_are_the_same(metadata, 'cluster_created_ts')

        # get the last report
        last = metadata.pop()
        assert last['topic_count'] == total_topics
        assert last['partition_count'] == total_partitions
        nodes_meta = last['nodes']

        assert len(last['nodes']) == 3

        assert all('node_id' in n for n in nodes_meta)
        assert all('cpu_count' in n for n in nodes_meta)
        assert all('version' in n for n in nodes_meta)
        assert all('uptime_ms' in n for n in nodes_meta)
        assert all('is_alive' in n for n in nodes_meta)
        assert all('disks' in n for n in nodes_meta)
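# The metrics test above checks that identity fields stay constant across all
# reports received by the HTTP server, while counters reflect the latest
# report. A standalone sketch of that check over hand-written payloads:
def assert_fields_are_the_same(metadata, field):
    assert all(m[field] == metadata[0][field] for m in metadata)


reports = [
    {'cluster_uuid': 'abc', 'cluster_created_ts': 1, 'topic_count': 3},
    {'cluster_uuid': 'abc', 'cluster_created_ts': 1, 'topic_count': 5},
]
assert_fields_are_the_same(reports, 'cluster_uuid')
assert_fields_are_the_same(reports, 'cluster_created_ts')
assert reports[-1]['topic_count'] == 5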